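# NOTE: the next lines are web-page text captured together with this workflow
# file; they are kept as comments so the file remains parseable YAML.
# Skip to content
# Test
# Test #1
# Workflow file for this run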

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.

#
# --- GitHub Actions annotation captured together with this file ---
# Check failure on line 1 in .github/workflows/cicd-main.yml
# View workflow run for this annotation
# GitHub Actions / .github/workflows/cicd-main.yml
# Invalid workflow file
# (Line: 184, Col: 14): Unrecognized named-value: 'secrets'. Located at position 1 within expression: secrets.PAT
# --- end of annotation ---
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: CICD Megatron-LM
# Events that start this workflow.
on:
  # Nightly run at 00:00 UTC (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: '0 0 * * *'
  # Pushes to the long-lived branches, to mirrored pull-request branches
  # (`pull-request/<number>`) and to release-deployment branches.
  push:
    branches:
      - dev
      - main
      - 'pull-request/[0-9]+'
      - 'deploy-release/*'
  # Merge-queue validation requests.
  merge_group:
    types:
      - checks_requested
  # Manual runs from the Actions UI / API.
  workflow_dispatch:
# Only one run per concurrency group is kept alive; a newer run cancels the
# one in progress. The group key is built from the workflow name, the PR
# number (falling back to the git ref), the triggering label name (falling
# back to 'main') and the event name.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
  cancel-in-progress: true
# Workflow-level default permissions for the automatically provided token.
# Individual jobs may override these with their own `permissions` mapping.
permissions:
  # NOTE(review): presumably needed for OIDC authentication against the
  # container registry — confirm against the jobs that consume it.
  id-token: write
  contents: read
# Workflow-wide environment variables.
env:
  # Registry host, read elsewhere as `${{ env.container-registry }}`.
  container-registry: 766267172432.dkr.ecr.us-east-1.amazonaws.com
jobs:
  # Decides whether the run was started by a maintainer and, based on that,
  # which runner label later jobs should use. On pull-request branches it
  # also refreshes the "external contributor" notice on the PR.
  is-not-external-contributor:
    runs-on: ubuntu-latest
    if: github.repository == 'NVIDIA/Megatron-LM'
    outputs:
      is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }}
      is_maintainer: ${{ steps.check-membership.outputs.is_maintainer }}
      # Maintainers get the regular runner label; everyone else the ephemeral one.
      selected_runner: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-aws-gpu-x8' || 'nvidia-ci-aws-gpu-x8-ephemeral' }}
    permissions:
      issues: write
      pull-requests: write
    env:
      GITHUB_TOKEN: ${{ secrets.PAT }}
      REPO: ${{ github.repository }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          token: ${{ env.GITHUB_TOKEN }}

      # Only mirrored PR branches (`pull-request/<number>`) carry PR metadata.
      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/')
        uses: nv-gha-runners/get-pr-info@main

      # When "Get PR info" was skipped, `pr-info` is empty and the `'{}'`
      # fallback makes `username` evaluate to an empty value.
      - name: Check NVIDIA SSO membership
        id: check-sso
        uses: ./.github/actions/check-nvidia-sso-membership
        with:
          username: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}
          github_token: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}

      - name: Set maintainer status
        id: check-membership
        env:
          IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }}
          IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
          SCHEDULED_JOB: ${{ github.event_name == 'schedule' }}
        run: |
          # Skip SSO check for scheduled jobs, main branch, or merge groups
          if [ "${SCHEDULED_JOB}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then
            echo "is_maintainer=true" | tee -a "$GITHUB_OUTPUT"
            exit 0
          fi

          # Use SSO membership check result
          IS_MEMBER="${{ steps.check-sso.outputs.is_member }}"
          if [ "$IS_MEMBER" == "true" ]; then
            echo "is_maintainer=true" | tee -a "$GITHUB_OUTPUT"
          else
            echo "is_maintainer=false" | tee -a "$GITHUB_OUTPUT"
          fi

      # Look for a previously posted notice, identified by its hidden marker.
      - name: Find Comment
        uses: peter-evans/find-comment@v4
        if: startsWith(github.ref, 'refs/heads/pull-request/')
        id: fc
        with:
          issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          repository: ${{ github.repository }}
          body-includes: '<!--external-contributor-comment-->'

      # Remove the old notice (if any) so that at most one exists per PR.
      - name: Delete comment
        uses: actions/github-script@v7
        if: startsWith(github.ref, 'refs/heads/pull-request/') && steps.fc.outputs.comment-id != ''
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            await github.rest.issues.deleteComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              comment_id: ${{ steps.fc.outputs.comment-id }}
            })

      # Post the notice only when the author was not recognised as a maintainer.
      - name: Write pull request comment
        if: startsWith(github.ref, 'refs/heads/pull-request/') && steps.check-membership.outputs.is_maintainer == 'false'
        uses: peter-evans/create-or-update-comment@v5
        with:
          issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
          repository: ${{ github.repository }}
          body: |
            <!--external-contributor-comment-->
            Thank you for your contribution!
            NVIDIA Megatron-LM is currently transitioning to development on Github. We will aim to review your PR after we complete our transition and stabilize our Github development process.
            Thank you for your understanding.

  # Reusable pre-flight workflow; its outputs (is_deployment_workflow,
  # is_ci_workload, docs_only, ...) gate the jobs below.
  pre-flight:
    needs: [is-not-external-contributor]
    if: github.repository == 'NVIDIA/Megatron-LM'
    # NOTE(review): the workflow file name and version tag in this reference
    # look mangled by e-mail obfuscation ("[email protected]"); restore the
    # real `<workflow>.yml@<tag>` value before relying on this job.
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]

  # Runs the repository auto-formatter in check-only mode.
  linting:
    runs-on: ubuntu-latest
    needs: [pre-flight]
    # Run for CI workloads, or for ordinary non-docs-only changes; never for
    # deployment workflows.
    if: |
      (
        needs.pre-flight.outputs.is_deployment_workflow == 'false'
        && needs.pre-flight.outputs.is_ci_workload == 'true'
      ) || (
        needs.pre-flight.outputs.is_deployment_workflow == 'false'
        && needs.pre-flight.outputs.is_ci_workload == 'false'
        && needs.pre-flight.outputs.docs_only == 'false'
      )
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history, so the formatter can compare against the base ref.
          fetch-depth: 0

      - name: Install uv
        uses: astral-sh/setup-uv@v1
        with:
          version: 0.7.2

      - name: Install linting tools
        run: |
          uv sync --locked --only-group linting

      - name: Get PR info
        id: get-pr-info
        if: startsWith(github.ref, 'refs/heads/pull-request/')
        uses: nv-gha-runners/get-pr-info@main

      # Guarded by the same condition as "Get PR info", so `pr-info` is
      # always populated when this step runs.
      - name: Run linting
        if: startsWith(github.ref, 'refs/heads/pull-request/')
        run: |
          export PATH=".venv/bin:$PATH"
          export GITLAB_ENDPOINT=github.com
          export CI_PROJECT_NAMESPACE=NVIDIA
          export BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}"
          export CHECK_ONLY=true
          export SKIP_DOCS=false
          bash tools/autoformat.sh

  # Dispatches the "Approve Test Queue" workflow in NVIDIA-NeMo/Megatron-Bridge
  # for the current commit and waits for it to finish.
  #
  # `the-actions-org/workflow-dispatch` is an action, not a reusable workflow,
  # so it has to run as a step. Referencing it through job-level `uses:` makes
  # GitHub treat the job as a reusable-workflow call, where the `secrets`
  # context is not available inside `with:` — the cause of the
  # "Unrecognized named-value: 'secrets'" validation error.
  trigger:
    name: Invoke workflow in another repo with inputs
    runs-on: ubuntu-latest
    steps:
      - name: Dispatch Approve Test Queue in Megatron-Bridge
        uses: the-actions-org/workflow-dispatch@v4
        with:
          workflow: Approve Test Queue
          repo: NVIDIA-NeMo/Megatron-Bridge
          ref: main
          token: ${{ secrets.PAT }}
          inputs: '{ "mcore_commit": "${{ github.sha }}", "mcore_branch": "${{ github.ref }}" }'
          display-workflow-run-url: true
          wait-for-completion: true
# cicd-wait-in-queue:
# runs-on: ubuntu-latest
# needs: [pre-flight, linting]
# environment: ${{ needs.pre-flight.outputs.is_merge_group == 'true' && 'merge-gate' || 'test' }}
# if: |
# !(needs.pre-flight.outputs.is_ci_workload == 'true'
# || needs.pre-flight.outputs.is_deployment_workflow == 'true'
# || needs.pre-flight.outputs.docs_only == 'true')
# steps:
# - name: Running CI tests
# run: |
# echo "Running CI tests"
# echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}"
# cicd-container-build:
# needs: [is-not-external-contributor, pre-flight, cicd-wait-in-queue]
# runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
# if: |
# (
# success()
# || needs.pre-flight.outputs.is_ci_workload == 'true'
# || needs.pre-flight.outputs.force_run_all == 'true'
# )
# && needs.pre-flight.outputs.is_merge_group == 'false'
# && !cancelled()
# steps:
# - name: Taint node for job isolation
# if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
# shell: bash
# run: taint-node.sh
# - name: Checkout
# uses: actions/checkout@v4
# - name: Setup python
# uses: actions/setup-python@v5
# with:
# python-version: 3.12
# - name: Install GH CLI
# shell: bash -x -e -u -o pipefail {0}
# run: |
# apt-get update
# apt-get install -y gh
# - name: Get PR info
# id: get-pr-info
# if: startsWith(github.ref, 'refs/heads/pull-request/')
# uses: nv-gha-runners/get-pr-info@main
# - name: Has lts label
# id: has-lts-label
# env:
# GH_TOKEN: ${{ secrets.PAT }}
# run: |
# PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
# HAS_LTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "container::lts")') || echo "false"
# echo "main=$HAS_LTS_LABEL" | tee -a $GITHUB_OUTPUT
# - name: Download test data
# shell: bash
# run: |
# echo "::group::Download test data"
# pip install --no-cache-dir click requests
# python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
# echo "::endgroup::"
# - name: Install GH CLI
# shell: bash
# run: |
# apt-get update
# apt-get install -y gh
# - name: Get last merged PR
# id: cache_from
# env:
# GH_TOKEN: ${{ github.token }}
# run: |
# LAST_PRS=$(gh api graphql -f query='
# query {
# repository(owner: "NVIDIA", name: "Megatron-LM") {
# pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
# nodes {
# number
# }
# }
# }
# }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
# echo "type=registry,ref=${{ env.container-registry }}/megatron-lm:$number-buildcache,mode=max"
# done)
# echo "LAST_PRS<<EOF" | tee -a $GITHUB_OUTPUT
# echo "$LAST_PRS" | tee -a $GITHUB_OUTPUT
# echo "EOF" | tee -a $GITHUB_OUTPUT
# - name: Parse baseimage
# shell: bash
# id: base-image
# env:
# HAS_LTS_LABEL: ${{ steps.has-lts-label.outputs.main }}
# run: |
# if [ "$HAS_LTS_LABEL" == "true" ]; then
# NGC_VERSION=$(cat docker/.ngc_version.lts)
# echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
# echo "image_type=lts" | tee -a $GITHUB_OUTPUT
# else
# NGC_VERSION=$(cat docker/.ngc_version.dev)
# echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
# echo "image_type=dev" | tee -a $GITHUB_OUTPUT
# fi
# - name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# - name: Build and push
# uses: docker/build-push-action@v5
# with:
# file: ./docker/Dockerfile.ci.dev
# push: true
# context: .
# target: main
# build-args: |
# FROM_IMAGE_NAME=${{ steps.base-image.outputs.version }}
# IMAGE_TYPE=${{ steps.base-image.outputs.image_type }}
# cache-from: |
# type=registry,ref=${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
# type=registry,ref=${{ env.container-registry }}/megatron-lm:main-buildcache,mode=max
# ${{ steps.cache_from.outputs.LAST_PRS }}
# cache-to: |
# type=registry,ref=${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
# no-cache: false
# tags: |
# ${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}
# ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
# secrets: |
# GH_TOKEN=${{ secrets.PAT }}
# cicd-parse-unit-tests:
# runs-on: ubuntu-latest
# outputs:
# unit-tests: ${{ steps.parse-unit-tests.outputs.unit-tests }}
# needs:
# - pre-flight
# - cicd-wait-in-queue
# - cicd-container-build
# if: |
# (
# success()
# || needs.pre-flight.outputs.is_ci_workload == 'true'
# || needs.pre-flight.outputs.force_run_all == 'true'
# )
# && needs.pre-flight.outputs.is_merge_group == 'false'
# && !cancelled()
# steps:
# - name: Checkout
# uses: actions/checkout@v4
# - name: Parse unit tests
# id: parse-unit-tests
# run: |
# cat tests/test_utils/recipes/unit-tests.yaml | yq -o json '[.products[].test_case[] | { "bucket": .}] | sort_by(.model, .test_case)' | jq -c > unit-tests.json
# echo "unit-tests=$(cat unit-tests.json)" | tee -a $GITHUB_OUTPUT
# cicd-unit-tests-latest:
# strategy:
# fail-fast: false
# matrix:
# include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }}
# needs:
# - is-not-external-contributor
# - pre-flight
# - cicd-wait-in-queue
# - cicd-container-build
# - cicd-parse-unit-tests
# runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
# name: '${{ matrix.bucket }} - latest'
# if: |
# (
# success()
# || needs.pre-flight.outputs.is_ci_workload == 'true'
# || needs.pre-flight.outputs.force_run_all == 'true'
# )
# && needs.pre-flight.outputs.is_merge_group == 'false'
# && !cancelled()
# env:
# PIP_DISABLE_PIP_VERSION_CHECK: 1
# PIP_NO_PYTHON_VERSION_WARNING: 1
# PIP_ROOT_USER_ACTION: ignore
# steps:
# - name: Taint node for job isolation
# if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
# shell: bash
# run: taint-node.sh
# - name: Checkout
# uses: actions/checkout@v4
# - name: main
# uses: ./.github/actions
# with:
# test_case: ${{ matrix.bucket }}
# tag: latest
# timeout: ${{ matrix.timeout || 30 }}
# is_unit_test: 'true'
# PAT: ${{ secrets.PAT }}
# container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
# cicd-parse-integration-tests:
# runs-on: ubuntu-latest
# needs:
# - pre-flight
# - cicd-wait-in-queue
# - cicd-container-build
# - cicd-unit-tests-latest
# if: |
# (
# success()
# || needs.pre-flight.outputs.is_ci_workload == 'true'
# || needs.pre-flight.outputs.force_run_all == 'true'
# )
# && needs.pre-flight.outputs.is_merge_group == 'false'
# && !cancelled()
# outputs:
# integration-tests: ${{ steps.main.outputs.integration-tests }}
# steps:
# - name: Checkout
# uses: actions/checkout@v4
# - name: Get PR info
# id: get-pr-info
# if: startsWith(github.ref, 'refs/heads/pull-request/')
# uses: nv-gha-runners/get-pr-info@main
# - name: Has Run tests label
# id: has-run-tests-label
# env:
# GH_TOKEN: ${{ secrets.PAT }}
# run: |
# PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
# HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || echo "false"
# echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
# - name: Has Run functional tests label
# id: has-run-functional-tests-label
# env:
# GH_TOKEN: ${{ secrets.PAT }}
# run: |
# PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
# HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "false"
# echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
# - name: Parse functional tests
# id: main
# env:
# HAS_RUN_TESTS_LABEL: ${{ steps.has-run-tests-label.outputs.main }}
# HAS_RUN_FUNCTIONAL_TESTS_LABEL: ${{ steps.has-run-functional-tests-label.outputs.main }}
# run: |
# export PYTHONPATH=$(pwd)
# if [ "$HAS_RUN_TESTS_LABEL" == "true" ]; then
# ARGS=(
# --scope mr-github
# --enable-lightweight-mode
# )
# elif [ "$HAS_RUN_FUNCTIONAL_TESTS_LABEL" == "true" ]; then
# ARGS=(
# --scope mr-github
# )
# else
# ARGS=(
# --scope mr-github-slim
# )
# fi
# python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
# --n-repeat 5 \
# --time-limit 2700 \
# --test-cases all \
# --container-image mcore_ci_dev \
# --container-tag latest \
# --dependent-job functional:configure \
# --record-checkpoints false \
# --slurm-account gh \
# --no-enable-warmup \
# --environment dev \
# --platform dgx_h100 \
# --cluster ghci \
# ${ARGS[@]} \
# --output-path integration-tests.yaml
# cat integration-tests.yaml | \
# yq -o json 'del(.default, .stages, .workflow) | to_entries | map({"model": .value.stage, "test_case": .key}) | sort_by(.model, .test_case)' | jq -c > integration-tests.json
# echo "integration-tests=$(cat integration-tests.json)" | tee -a "$GITHUB_OUTPUT"
# cicd-integration-tests-latest:
# strategy:
# fail-fast: false
# matrix:
# include: ${{ fromJson(needs.cicd-parse-integration-tests.outputs.integration-tests) }}
# needs:
# - is-not-external-contributor
# - pre-flight
# - cicd-wait-in-queue
# - cicd-parse-integration-tests
# - cicd-unit-tests-latest
# runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
# name: '${{ matrix.model }}/${{ matrix.test_case }} - latest'
# env:
# PIP_DISABLE_PIP_VERSION_CHECK: 1
# PIP_NO_PYTHON_VERSION_WARNING: 1
# PIP_ROOT_USER_ACTION: ignore
# if: |
# (
# success()
# || needs.pre-flight.outputs.is_ci_workload == 'true'
# || needs.pre-flight.outputs.force_run_all == 'true'
# )
# && needs.pre-flight.outputs.is_merge_group == 'false'
# && !cancelled()
# steps:
# - name: Taint node for job isolation
# if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
# shell: bash
# run: taint-node.sh
# - name: Checkout
# uses: actions/checkout@v4
# - name: main
# uses: ./.github/actions
# with:
# test_case: ${{ matrix.test_case }}
# model: ${{ matrix.model }}
# tag: latest
# timeout: ${{ matrix.timeout || 30 }}
# is_unit_test: 'false'
# PAT: ${{ secrets.PAT }}
# container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
# Nemo_CICD_Test:
# needs:
# - pre-flight
# - cicd-unit-tests-latest
# - cicd-integration-tests-latest
# if: |
# (
# needs.pre-flight.outputs.docs_only == 'true'
# || needs.pre-flight.outputs.is_deployment_workflow == 'true'
# || needs.pre-flight.outputs.is_ci_workload == 'true'
# || needs.pre-flight.outputs.is_merge_group == 'true'
# || always()
# )
# && !cancelled()
# && github.repository == 'NVIDIA/Megatron-LM'
# runs-on: ubuntu-latest
# permissions: write-all
# steps:
# - name: Checkout
# uses: actions/checkout@v4
# - name: Get workflow result
# id: result
# shell: bash -x -e -u -o pipefail {0}
# env:
# GH_TOKEN: ${{ github.token }}
# GITHUB_RUN_ID: ${{ github.run_id }}
# SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
# run: |
# FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "failure")] | length') || echo 0
# SKIPPED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion == "skipped")] | length') || echo 0
# if [ "${FAILED_JOBS:-0}" -eq 0 ] && ([ "${SKIPPED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]); then
# echo "✅ All previous jobs completed successfully"
# exit 0
# else
# echo "❌ Found $FAILED_JOBS failed job(s)"
# # Show which jobs failed
# gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion == "failure") | .name'
# exit 1
# fi
# Coverage_Fake:
# runs-on: ubuntu-latest
# needs: [Nemo_CICD_Test, pre-flight]
# if: |
# (
# needs.pre-flight.outputs.docs_only == 'true'
# || needs.pre-flight.outputs.is_deployment_workflow == 'true'
# || github.event == 'merge_group'
# )
# && needs.pre-flight.outputs.is_ci_workload == 'false'
# && !cancelled()
# && github.repository == 'NVIDIA/Megatron-LM'
# steps:
# - name: Generate fake coverage report
# uses: actions/github-script@v6
# with:
# github-token: ${{ secrets.PAT }}
# script: |
# await github.rest.repos.createCommitStatus({
# owner: context.repo.owner,
# repo: context.repo.repo,
# sha: context.sha,
# state: 'success',
# description: 'No code changes - coverage check skipped',
# context: 'codecov/patch'
# });
# Coverage:
# runs-on: ubuntu-latest
# needs: [Nemo_CICD_Test]
# if: |
# (
# (needs.pre-flight.outputs.is_ci_workload == 'true' && !failure())
# || success()
# )
# && !cancelled()
# && github.repository == 'NVIDIA/Megatron-LM'
# strategy:
# matrix:
# flag: [unit-test]
# steps:
# - name: Checkout
# uses: actions/checkout@v4
# - name: Download coverage reports of current branch
# uses: actions/download-artifact@v4
# with:
# pattern: coverage-${{ matrix.flag }}-*
# - name: List coverage files
# run: find . -type f -name "*.xml" -o -name "*.lcov"
# - name: Get total coverage of current branch
# shell: bash -x -e -u -o pipefail {0}
# if: always()
# run: |
# pip install coverage
# ls -al .
# ls -al coverage-*/
# coverage combine --keep $(ls coverage-*/.coverage)
# coverage report -i
# rm -rf coverage-*
# ls -al
# - name: Upload coverage reports to Codecov
# uses: codecov/codecov-action@v5
# with:
# token: ${{ secrets.CODECOV_TOKEN }}
# verbose: true
# flags: ${{ matrix.flag }}
# - name: Upload artifacts
# uses: actions/upload-artifact@v4
# with:
# name: coverage-${{ matrix.flag }}-aggregated
# path: |
# .coverage
# include-hidden-files: true