Skip to content

Format benchmark files as json, add perf thresholds #38

Format benchmark files as json, add perf thresholds

Format benchmark files as json, add perf thresholds #38

Workflow file for this run

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: MIT
# Workflow-level settings: display name, triggers, token permissions and the
# image repository names shared by the jobs defined under `jobs`.
name: tilegym-ci

on:
  # Pushes to branches whose name matches pull-request/<digits>.
  push:
    branches:
      - 'pull-request/[0-9]+'
  schedule:
    # Nightly run at 12:00 UTC.
    - cron: '0 12 * * *'
  # Allow manual trigger.
  workflow_dispatch:

# Scopes granted to GITHUB_TOKEN for every job in this workflow.
permissions:
  contents: read
  packages: write
  pull-requests: read
  checks: write

env:
  # PR images go to a temp repo, main/nightly images go to the main repo.
  IMAGE_NAME_PR: tilegym-pr
  IMAGE_NAME_MAIN: tilegym

jobs:
config:
  # Decides what the rest of the pipeline does. It classifies the run as
  # PR vs main/nightly, looks up the PR (if any), and publishes the stage
  # switches (build / run_ops / run_benchmark) plus the image name and tag
  # consumed by the downstream jobs.
  name: parse-ci-config
  runs-on: ubuntu-latest
  outputs:
    build: ${{ steps.parse.outputs.build }}
    run_ops: ${{ steps.parse.outputs.run_ops }}
    run_benchmark: ${{ steps.parse.outputs.run_benchmark }}
    image_tag: ${{ steps.parse.outputs.image_tag }}
    image_name: ${{ steps.parse.outputs.image_name }}
    is_pr: ${{ steps.context.outputs.is_pr }}
  steps:
    - name: Determine context
      id: context
      run: |
        # A run on refs/heads/main or any scheduled run is "main/nightly";
        # everything else is treated as a PR run.
        if [[ "${{ github.ref }}" == "refs/heads/main" || "${{ github.event_name }}" == "schedule" ]]; then
          IS_PR="false"
          IMAGE_NAME="${{ env.IMAGE_NAME_MAIN }}"
          CONTEXT_LABEL="main/nightly"
        else
          IS_PR="true"
          IMAGE_NAME="${{ env.IMAGE_NAME_PR }}"
          CONTEXT_LABEL="PR"
        fi
        {
          echo "is_pr=${IS_PR}"
          echo "image_name=${IMAGE_NAME}"
        } >> "$GITHUB_OUTPUT"
        echo "Running in ${CONTEXT_LABEL} context"

    - name: Checkout code
      uses: actions/checkout@v4

    - name: Get PR info
      id: pr
      uses: actions/github-script@v7
      with:
        script: |
          // Resolves the PR number and body for the current branch.
          // The returned object is read by the next step through
          // fromJSON(steps.pr.outputs.result); both fields stay empty
          // strings when no PR can be found.
          const { owner, repo } = context.repo;
          const found = { prBody: '', prNumber: '' };
          const branchName = context.ref.replace('refs/heads/', '');
          core.info(`Looking for PR for branch: ${branchName}`);

          // Method 1: the branch is named pull-request/<number>.
          const numbered = /^pull-request\/(\d+)/.exec(branchName);
          if (numbered !== null) {
            found.prNumber = numbered[1];
            core.info(`Extracted PR #${found.prNumber} from branch name`);
            try {
              const response = await github.rest.pulls.get({
                owner,
                repo,
                pull_number: parseInt(found.prNumber),
              });
              found.prBody = response.data.body || '';
              core.info(`Fetched PR body (${found.prBody.length} characters)`);
            } catch (error) {
              // Best effort: keep the number, leave the body empty.
              core.warning(`Failed to fetch PR #${found.prNumber}: ${error.message}`);
            }
            return found;
          }

          // Method 2: look for an open PR whose head is this branch.
          try {
            const response = await github.rest.pulls.list({
              owner,
              repo,
              state: 'open',
              head: `${owner}:${branchName}`,
            });
            const openPrs = response.data;
            if (openPrs.length === 0) {
              core.info(`No open PR found for branch ${branchName}`);
            } else {
              found.prBody = openPrs[0].body || '';
              found.prNumber = openPrs[0].number.toString();
              core.info(`Found PR #${found.prNumber} via API search`);
              core.info(`PR body length: ${found.prBody.length} characters`);
            }
          } catch (error) {
            core.warning(`Error searching for PR: ${error.message}`);
          }
          return found;

    - name: Parse config and set image tag
      id: parse
      env:
        # The PR body reaches the scripts through the environment rather than
        # being interpolated into the shell text.
        PR_BODY: ${{ fromJSON(steps.pr.outputs.result).prBody }}
        PR_NUMBER: ${{ fromJSON(steps.pr.outputs.result).prNumber }}
        IS_PR: ${{ steps.context.outputs.is_pr }}
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        if [[ "$IS_PR" != "true" ]]; then
          # Main/nightly: the image is tagged with the commit SHA.
          echo "image_tag=${{ github.sha }}" >> "$GITHUB_OUTPUT"

          # Ask the helper whether this SHA was already built and verified.
          OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
          export REGISTRY_IMAGE="ghcr.io/${OWNER_LOWER}/${{ steps.context.outputs.image_name }}"
          export IMAGE_TAG="${{ github.sha }}"
          export IS_PR="false"
          python3 .github/scripts/check_image_exists.py

          # The helper presumably appends "skipped=true" to $GITHUB_OUTPUT when
          # nothing needs to be rebuilt (script not shown here - confirm).
          if [ -f "$GITHUB_OUTPUT" ] && grep -q "skipped=true" "$GITHUB_OUTPUT"; then
            echo "✅ Image already exists and tests passed, skipping build"
            RUN_STAGES="false"
          else
            echo "🔨 Building new image and running tests"
            RUN_STAGES="true"
          fi
          for STAGE in build run_ops run_benchmark; do
            echo "${STAGE}=${RUN_STAGES}" >> "$GITHUB_OUTPUT"
          done
        else
          # PR: the stage switches come from the CI config in the PR body.
          pip install pyyaml --quiet
          python3 .github/scripts/parse_pr_config.py

          # PR images are tagged pr-<number>; fall back to "latest" when the
          # PR number could not be determined.
          if [ -z "$PR_NUMBER" ]; then
            echo "image_tag=latest" >> "$GITHUB_OUTPUT"
            echo "Using image tag: latest (PR without number)"
          else
            echo "image_tag=pr-${PR_NUMBER}" >> "$GITHUB_OUTPUT"
            echo "Using image tag: pr-${PR_NUMBER}"
          fi
        fi

        # Pass through image name from context.
        echo "image_name=${{ steps.context.outputs.image_name }}" >> "$GITHUB_OUTPUT"
build:
  # Builds the TileGym Docker image and pushes it to GHCR. Whether this job
  # runs at all is decided by the config job's `build` output.
  name: build-tilegym-image
  needs: config
  if: needs.config.outputs.build == 'true'
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set image variables
      id: vars
      run: |
        # GHCR repository paths must be lowercase, so normalise the owner.
        OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
        REGISTRY_IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}"
        echo "owner_lower=${OWNER_LOWER}" >> "$GITHUB_OUTPUT"
        echo "registry_image=${REGISTRY_IMAGE}" >> "$GITHUB_OUTPUT"

    - name: Free up disk space
      run: |
        # Remove preinstalled toolchains this build does not use, then report
        # the remaining space.
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /usr/local/lib/android
        sudo rm -rf /opt/ghc
        sudo rm -rf /opt/hostedtoolcache/CodeQL
        docker system prune -af
        df -h

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Generate tags
      id: tags
      run: |
        # Always tag with the config-selected tag and the commit SHA. On
        # main/nightly the config job sets image_tag to the SHA, so these two
        # entries coincide.
        TAGS="${{ steps.vars.outputs.registry_image }}:${{ needs.config.outputs.image_tag }}"
        TAGS="${TAGS},${{ steps.vars.outputs.registry_image }}:${{ github.sha }}"
        # Add datetime tag for nightly builds
        if [[ "${{ needs.config.outputs.is_pr }}" == "false" ]]; then
          DATETIME=$(date -u +%Y%m%d-%H%M%S)
          TAGS="${TAGS},${{ steps.vars.outputs.registry_image }}:nightly-${DATETIME}"
        fi
        echo "tags=${TAGS}" >> "$GITHUB_OUTPUT"

    # No step-level `if` here. The previous guard read
    # steps.check-existing.outputs.skipped, but this job has no step with id
    # `check-existing`, so the guard was always true. Skipping an unnecessary
    # build is handled by the job-level `if` above.
    - name: Build and push Docker image to GHCR
      uses: docker/build-push-action@v5
      with:
        context: .
        file: ./modeling/transformers/Dockerfile
        tags: ${{ steps.tags.outputs.tags }}
        push: true
        provenance: false
        outputs: type=image,push=true,compression=zstd,compression-level=3
        # Cache sources: the GHA cache, this repository's `latest` and current
        # tag, and the main repository's `latest`.
        cache-from: |
          type=gha
          type=registry,ref=${{ steps.vars.outputs.registry_image }}:latest
          type=registry,ref=${{ steps.vars.outputs.registry_image }}:${{ needs.config.outputs.image_tag }}
          type=registry,ref=ghcr.io/${{ steps.vars.outputs.owner_lower }}/tilegym:latest
        cache-to: type=gha,mode=max
test-ops:
  # Runs the ops test suite inside the built image on a GPU runner and
  # publishes the JUnit/HTML results.
  name: test-ops
  needs: [config, build]
  timeout-minutes: 12
  # always() is needed so the job is still evaluated when `build` was skipped;
  # it then runs only if ops tests were requested and build did not fail.
  if: |
    always() &&
    needs.config.outputs.run_ops == 'true' &&
    (needs.build.result == 'success' || needs.build.result == 'skipped')
  runs-on: linux-amd64-gpu-rtxpro6000-latest-1
  steps:
    - name: Create test results directory
      run: mkdir -p ${{ github.workspace }}/test-results

    - name: Login to GHCR
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Pull and run ops tests
      timeout-minutes: 10
      run: |
        # GHCR repository paths must be lowercase.
        OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
        IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}:${{ needs.config.outputs.image_tag }}"
        docker pull "${IMAGE}"
        # The host test-results directory is mounted so the reports written
        # inside the container are available to the upload steps below.
        docker run --rm \
          --gpus all \
          -v "${{ github.workspace }}/test-results:/test-results" \
          -w /workspace/tilegym \
          "${IMAGE}" \
          bash -c "pip install --no-cache-dir pytest-xdist pytest-html && \
            pytest -s tests/ops -v -k test_op \
            -n auto \
            --junitxml=/test-results/ops-results.xml \
            --html=/test-results/ops-report.html \
            --self-contained-html"

    # Reports are uploaded and published even when the tests failed.
    - name: Upload test results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: ops-test-results
        path: test-results/ops-*
        retention-days: 30

    - name: Publish test results
      uses: EnricoMi/publish-unit-test-result-action@v2
      if: always()
      with:
        files: test-results/ops-results.xml
        check_name: Ops Test Results
        # Quoted: a bare `off` is a boolean under YAML 1.1 loaders, while this
        # input expects the literal string "off".
        comment_mode: 'off'
test-benchmark:
  # Runs the benchmark suite inside the built image on a GPU runner. On
  # main/nightly runs it also compares the results with a stored baseline and
  # maintains that baseline as a high water mark.
  name: test-benchmark
  needs: [config, build]
  timeout-minutes: 30
  # always() is needed so the job is still evaluated when `build` was skipped.
  if: |
    always() &&
    needs.config.outputs.run_benchmark == 'true' &&
    (needs.build.result == 'success' || needs.build.result == 'skipped')
  runs-on: linux-amd64-gpu-rtxpro6000-latest-1
  steps:
    - name: Checkout code (sparse - only need scripts)
      uses: actions/checkout@v4
      with:
        sparse-checkout: |
          .github/scripts/format_benchmark_summary.py
          .github/scripts/check_benchmark_regression.py
        sparse-checkout-cone-mode: false

    - name: Create test results directory
      run: mkdir -p ${{ github.workspace }}/test-results

    # Download previous baseline for regression detection.
    # Uses dawidd6 action to download artifacts from previous workflow runs
    # (GitHub's built-in action only works within the same run).
    - name: Download baseline benchmark results (nightly only)
      if: needs.config.outputs.is_pr == 'false'
      continue-on-error: true
      uses: dawidd6/action-download-artifact@v3
      with:
        name: benchmark-baseline
        path: ${{ github.workspace }}/baseline-results
        workflow: tilegym-ci.yml
        branch: main
        # The baseline is only uploaded by runs that improved on it, so the
        # most recent run usually has no such artifact. Search back through
        # earlier runs; otherwise a run following a neutral-zone run would
        # find nothing and replace the high water mark with its own results.
        search_artifacts: true
        if_no_artifact_found: warn

    - name: Login to GHCR
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Pull and run benchmarks
      timeout-minutes: 25
      run: |
        # GHCR repository paths must be lowercase.
        OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
        IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}:${{ needs.config.outputs.image_tag }}"
        docker pull "${IMAGE}"
        docker run --rm \
          --gpus all \
          -v "${{ github.workspace }}/test-results:/test-results" \
          -w /workspace/tilegym/tests/benchmark \
          "${IMAGE}" \
          ./run_all.sh /test-results --json

    # Compare current results against baseline with three zones:
    # - Regression zone (< -2%): Build fails
    # - Neutral zone (-2% to +2%): Build passes, baseline NOT updated
    # - Improvement zone (> +2%): Build passes, baseline updated
    # Outputs: has_baseline, passed, should_update_baseline
    # All three outputs are lowercase "true"/"false" so shell comparisons in
    # later steps can test them directly.
    - name: Check for performance regressions (nightly only)
      id: regression_check
      if: needs.config.outputs.is_pr == 'false'
      continue-on-error: false
      run: |
        if [ -d "${{ github.workspace }}/baseline-results" ] && [ "$(ls -A ${{ github.workspace }}/baseline-results/*.json 2>/dev/null)" ]; then
          echo "Baseline results found, checking for regressions..."
          echo "has_baseline=true" >> "$GITHUB_OUTPUT"
          if python3 .github/scripts/check_benchmark_regression.py \
            --current test-results \
            --baseline baseline-results \
            --threshold 2.0 \
            --improvement-threshold 2.0 \
            --output test-results/regression_report.json \
            --fail-on-regression; then
            echo "✅ No regressions detected"
            echo "passed=true" >> "$GITHUB_OUTPUT"
            # Check if we should update baseline (only if significant improvements).
            # Python prints the flag as True/False; fold it to lowercase so it
            # matches the "true"/"false" spelling used by the other branches.
            SHOULD_UPDATE=$(python3 -c "import json; print(json.load(open('test-results/regression_report.json'))['summary']['should_update_baseline'])" 2>/dev/null || echo "false")
            SHOULD_UPDATE=$(echo "${SHOULD_UPDATE}" | tr '[:upper:]' '[:lower:]')
            echo "should_update_baseline=${SHOULD_UPDATE}" >> "$GITHUB_OUTPUT"
            if [ "$SHOULD_UPDATE" == "true" ]; then
              echo "🎉 Significant improvements detected - will update baseline"
            else
              echo "🟡 Performance within neutral zone - baseline will not be updated"
            fi
          else
            echo "❌ Performance regressions detected!"
            echo "passed=false" >> "$GITHUB_OUTPUT"
            echo "should_update_baseline=false" >> "$GITHUB_OUTPUT"
            exit 1
          fi
        else
          echo "No baseline results found - this will become the first baseline"
          echo "has_baseline=false" >> "$GITHUB_OUTPUT"
          echo "passed=true" >> "$GITHUB_OUTPUT"
          echo "should_update_baseline=true" >> "$GITHUB_OUTPUT"
        fi

    - name: Debug - List test results directory
      if: always()
      run: |
        echo "Contents of test-results directory:"
        ls -lah ${{ github.workspace }}/test-results/ || echo "Directory does not exist"
        echo ""
        echo "JSON files:"
        ls -lh ${{ github.workspace }}/test-results/*.json 2>/dev/null || echo "No JSON files found"

    - name: Format benchmark summary
      if: always()
      run: python3 .github/scripts/format_benchmark_summary.py test-results

    - name: Upload benchmark results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: benchmark-results
        path: test-results/*.json
        retention-days: 30

    # HIGH WATER MARK STRATEGY WITH NEUTRAL ZONE:
    # Three zones for baseline updates:
    # 1. Regression zone (< -2%): Build fails, baseline NOT updated
    # 2. Neutral zone (-2% to +2%): Build passes, baseline NOT updated (prevents noise)
    # 3. Improvement zone (> +2%): Build passes, baseline updated (high water mark)
    # This prevents both performance drift AND noisy baseline updates from small variations.
    # NOTE(review): the glob below also matches test-results/regression_report.json
    # when it exists, so the report is stored with the baseline - confirm that
    # check_benchmark_regression.py tolerates it in the baseline directory.
    - name: Update baseline (only if significant improvements or first run)
      if: |
        needs.config.outputs.is_pr == 'false' &&
        steps.regression_check.outputs.should_update_baseline == 'true'
      uses: actions/upload-artifact@v4
      with:
        name: benchmark-baseline
        path: test-results/*.json
        retention-days: 90

    # always() keeps this step alive after a failed regression check, which is
    # the only situation in which the regression message below can be printed.
    - name: Log baseline decision
      if: always() && needs.config.outputs.is_pr == 'false'
      env:
        SHOULD_UPDATE: ${{ steps.regression_check.outputs.should_update_baseline }}
        HAS_BASELINE: ${{ steps.regression_check.outputs.has_baseline }}
        PASSED: ${{ steps.regression_check.outputs.passed }}
      run: |
        if [ "$SHOULD_UPDATE" == "true" ]; then
          if [ "$HAS_BASELINE" == "false" ]; then
            echo "✅ Baseline created: This is the first baseline"
          else
            echo "✅ Baseline updated: Performance improved by more than 2%"
          fi
        elif [ "$PASSED" == "true" ]; then
          echo "🟡 Baseline NOT updated: Performance within ±2% (neutral zone)"
          echo " Baseline preserved to avoid noise from small variations"
        elif [ "$PASSED" == "false" ]; then
          echo "❌ Baseline NOT updated: Performance regressions detected"
        else
          # The outputs are empty when an earlier step failed and the
          # regression check was skipped.
          echo "⚠️ Baseline NOT updated: regression check did not run"
        fi
promote-to-latest:
  # Re-tags the image built for this commit as `latest` and `<sha>-verified`.
  # Runs only for main/nightly runs in which the build and both test jobs
  # succeeded.
  name: promote-to-latest
  needs: [config, build, test-ops, test-benchmark]
  if: |
    always() &&
    needs.config.outputs.is_pr == 'false' &&
    needs.build.result == 'success' &&
    needs.test-ops.result == 'success' &&
    needs.test-benchmark.result == 'success'
  runs-on: ubuntu-latest
  steps:
    - name: Login to GHCR
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Promote SHA to latest and mark as verified
      run: |
        # GHCR repository paths must be lowercase.
        OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
        REPOSITORY="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}"
        COMMIT="${{ github.sha }}"
        SOURCE_REF="${REPOSITORY}:${COMMIT}"
        echo "Promoting ${SOURCE_REF} to latest and adding verified tags (tests passed)"
        # imagetools attaches the new tags to the existing image in the
        # registry; no image is rebuilt in this step.
        docker buildx imagetools create \
          --tag "${REPOSITORY}:latest" \
          --tag "${REPOSITORY}:${COMMIT}-verified" \
          "${SOURCE_REF}"