Format benchmark files as json, add perf thresholds #38

Workflow file for this run

.github/workflows/tilegym-ci.yml at 276cab1

	# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	#
	# SPDX-License-Identifier: MIT

	# Top-level workflow settings: triggers, token permissions and shared
	# environment variables used by the jobs below.
	name: tilegym-ci

	on:
	  push:
	    branches:
	      - "pull-request/[0-9]+"
	  schedule:
	    # Run nightly at 12:00 UTC (noon)
	    - cron: '0 12 * * *'
	  workflow_dispatch:  # Allow manual trigger

	# Scopes granted to GITHUB_TOKEN for every job in this workflow.
	permissions:
	  contents: read
	  # The jobs log in to ghcr.io with GITHUB_TOKEN and push image tags.
	  packages: write
	  # The config job reads the PR body through the REST API.
	  pull-requests: read
	  # test-ops publishes an "Ops Test Results" check.
	  checks: write

	env:
	  # PR images go to a temp repo, main/nightly go to main repo
	  IMAGE_NAME_PR: tilegym-pr
	  IMAGE_NAME_MAIN: tilegym

	jobs:
	config:
	  # Decides what the rest of the pipeline does: whether this run is a PR or
	  # a main/nightly run, which image name and tag to use, and whether the
	  # image must be built and the ops tests / benchmarks must run.
	  name: parse-ci-config
	  runs-on: ubuntu-latest
	  outputs:
	    build: ${{ steps.parse.outputs.build }}
	    run_ops: ${{ steps.parse.outputs.run_ops }}
	    run_benchmark: ${{ steps.parse.outputs.run_benchmark }}
	    image_tag: ${{ steps.parse.outputs.image_tag }}
	    image_name: ${{ steps.parse.outputs.image_name }}
	    is_pr: ${{ steps.context.outputs.is_pr }}
	  steps:
	    # A run on refs/heads/main or from the schedule trigger is treated as
	    # main/nightly; everything else (including manual dispatch on another
	    # branch) is treated as a PR run.
	    - name: Determine context
	      id: context
	      # The ref and event name are read from the runner's default
	      # environment variables instead of being interpolated into the
	      # script text, so a crafted ref name cannot inject shell code.
	      run: |
	        if [[ "$GITHUB_REF" == "refs/heads/main" ]] || [[ "$GITHUB_EVENT_NAME" == "schedule" ]]; then
	          echo "is_pr=false" >> "$GITHUB_OUTPUT"
	          echo "image_name=${{ env.IMAGE_NAME_MAIN }}" >> "$GITHUB_OUTPUT"
	          echo "Running in main/nightly context"
	        else
	          echo "is_pr=true" >> "$GITHUB_OUTPUT"
	          echo "image_name=${{ env.IMAGE_NAME_PR }}" >> "$GITHUB_OUTPUT"
	          echo "Running in PR context"
	        fi

	    - name: Checkout code
	      uses: actions/checkout@v4

	    # Returns { prBody, prNumber } as the step result (JSON). Both stay
	    # empty strings when no PR can be resolved or the API call fails.
	    - name: Get PR info
	      id: pr
	      uses: actions/github-script@v7
	      with:
	        script: |
	          let prBody = '';
	          let prNumber = '';

	          const branchName = context.ref.replace('refs/heads/', '');
	          core.info(`Looking for PR for branch: ${branchName}`);

	          // Try method 1: Extract PR number from branch name
	          const branchMatch = branchName.match(/^pull-request\/(\d+)/);
	          if (branchMatch) {
	            prNumber = branchMatch[1];
	            core.info(`Extracted PR #${prNumber} from branch name`);

	            // Fetch PR body by number
	            try {
	              const { data: pr } = await github.rest.pulls.get({
	                owner: context.repo.owner,
	                repo: context.repo.repo,
	                // Explicit radix: the captured digits are always decimal.
	                pull_number: parseInt(prNumber, 10),
	              });
	              prBody = pr.body || '';
	              core.info(`Fetched PR body (${prBody.length} characters)`);
	            } catch (error) {
	              // Best effort: keep the PR number, continue with an empty body.
	              core.warning(`Failed to fetch PR #${prNumber}: ${error.message}`);
	            }
	          } else {
	            // Try method 2: Search by branch name
	            try {
	              const { data: prs } = await github.rest.pulls.list({
	                owner: context.repo.owner,
	                repo: context.repo.repo,
	                state: 'open',
	                head: `${context.repo.owner}:${branchName}`,
	              });

	              if (prs.length > 0) {
	                prBody = prs[0].body || '';
	                prNumber = prs[0].number.toString();
	                core.info(`Found PR #${prNumber} via API search`);
	                core.info(`PR body length: ${prBody.length} characters`);
	              } else {
	                core.info(`No open PR found for branch ${branchName}`);
	              }
	            } catch (error) {
	              core.warning(`Error searching for PR: ${error.message}`);
	            }
	          }

	          return { prBody, prNumber };

	    - name: Parse config and set image tag
	      id: parse
	      # The PR body is untrusted text, so it is handed to the script through
	      # the environment rather than interpolated into the shell source.
	      env:
	        PR_BODY: ${{ fromJSON(steps.pr.outputs.result).prBody }}
	        PR_NUMBER: ${{ fromJSON(steps.pr.outputs.result).prNumber }}
	        IS_PR: ${{ steps.context.outputs.is_pr }}
	        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	      run: |
	        # Parse CI config from PR body (only for PRs)
	        if [[ "$IS_PR" == "true" ]]; then
	          pip install pyyaml --quiet
	          # NOTE(review): this branch never writes build/run_ops/run_benchmark
	          # itself; presumably parse_pr_config.py appends them to
	          # $GITHUB_OUTPUT - confirm against the script.
	          python3 .github/scripts/parse_pr_config.py

	          # Set PR-specific image tag
	          if [ -n "$PR_NUMBER" ]; then
	            echo "image_tag=pr-${PR_NUMBER}" >> "$GITHUB_OUTPUT"
	            echo "Using image tag: pr-${PR_NUMBER}"
	          else
	            echo "image_tag=latest" >> "$GITHUB_OUTPUT"
	            echo "Using image tag: latest (PR without number)"
	          fi
	        else
	          # Main/nightly: check if image already exists before building
	          echo "image_tag=${{ github.sha }}" >> "$GITHUB_OUTPUT"

	          # Check if 'latest' already points to current SHA (tests passed previously)
	          OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
	          export REGISTRY_IMAGE="ghcr.io/${OWNER_LOWER}/${{ steps.context.outputs.image_name }}"
	          export IMAGE_TAG="${{ github.sha }}"
	          export IS_PR="false"

	          python3 .github/scripts/check_image_exists.py

	          # Read the skipped output from check_image_exists.py
	          if [ -f "$GITHUB_OUTPUT" ] && grep -q "skipped=true" "$GITHUB_OUTPUT"; then
	            echo "✅ Image already exists and tests passed, skipping build"
	            echo "build=false" >> "$GITHUB_OUTPUT"
	            echo "run_ops=false" >> "$GITHUB_OUTPUT"
	            echo "run_benchmark=false" >> "$GITHUB_OUTPUT"
	          else
	            echo "🔨 Building new image and running tests"
	            echo "build=true" >> "$GITHUB_OUTPUT"
	            echo "run_ops=true" >> "$GITHUB_OUTPUT"
	            echo "run_benchmark=true" >> "$GITHUB_OUTPUT"
	          fi
	        fi

	        # Pass through image name from context
	        echo "image_name=${{ steps.context.outputs.image_name }}" >> "$GITHUB_OUTPUT"


	build:
	  # Builds the TileGym image and pushes it to GHCR. Runs only when the
	  # config job asked for a build.
	  name: build-tilegym-image
	  needs: config
	  if: needs.config.outputs.build == 'true'
	  runs-on: ubuntu-latest
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    # The repository owner is lower-cased before it is used in the image
	    # reference.
	    - name: Set image variables
	      id: vars
	      run: |
	        OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
	        REGISTRY_IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}"

	        echo "owner_lower=${OWNER_LOWER}" >> "$GITHUB_OUTPUT"
	        echo "registry_image=${REGISTRY_IMAGE}" >> "$GITHUB_OUTPUT"

	    # Remove preinstalled toolchains this build does not use, then print
	    # the remaining disk space.
	    - name: Free up disk space
	      run: |
	        sudo rm -rf /usr/share/dotnet
	        sudo rm -rf /usr/local/lib/android
	        sudo rm -rf /opt/ghc
	        sudo rm -rf /opt/hostedtoolcache/CodeQL
	        docker system prune -af
	        df -h

	    - name: Set up Docker Buildx
	      uses: docker/setup-buildx-action@v3

	    - name: Login to GitHub Container Registry
	      uses: docker/login-action@v3
	      with:
	        registry: ghcr.io
	        username: ${{ github.actor }}
	        password: ${{ secrets.GITHUB_TOKEN }}

	    # Comma-separated tag list: the configured tag, the commit SHA and,
	    # for non-PR runs, a UTC timestamped nightly tag.
	    - name: Generate tags
	      id: tags
	      run: |
	        TAGS="${{ steps.vars.outputs.registry_image }}:${{ needs.config.outputs.image_tag }}"
	        TAGS="${TAGS},${{ steps.vars.outputs.registry_image }}:${{ github.sha }}"

	        # Add datetime tag for nightly builds
	        if [[ "${{ needs.config.outputs.is_pr }}" == "false" ]]; then
	          DATETIME=$(date -u +%Y%m%d-%H%M%S)
	          TAGS="${TAGS},${{ steps.vars.outputs.registry_image }}:nightly-${DATETIME}"
	        fi

	        echo "tags=${TAGS}" >> "$GITHUB_OUTPUT"

	    # No step-level condition here: whether to build is decided once, by
	    # the job-level `if` on needs.config.outputs.build.
	    - name: Build and push Docker image to GHCR
	      uses: docker/build-push-action@v5
	      with:
	        context: .
	        file: ./modeling/transformers/Dockerfile
	        tags: ${{ steps.tags.outputs.tags }}
	        push: true
	        provenance: false
	        outputs: type=image,push=true,compression=zstd,compression-level=3
	        # Cache sources: the GHA cache, this image's `latest` and current
	        # tag, and the main repo's `latest` image.
	        cache-from: |
	          type=gha
	          type=registry,ref=${{ steps.vars.outputs.registry_image }}:latest
	          type=registry,ref=${{ steps.vars.outputs.registry_image }}:${{ needs.config.outputs.image_tag }}
	          type=registry,ref=ghcr.io/${{ steps.vars.outputs.owner_lower }}/tilegym:latest
	        cache-to: type=gha,mode=max


	test-ops:
	  # Runs the ops test suite inside the image on a GPU runner and publishes
	  # the JUnit/HTML results.
	  name: test-ops
	  needs: [config, build]
	  timeout-minutes: 12
	  # always() lets this job run even when `build` was skipped, which happens
	  # when the config job set build to something other than 'true'.
	  if: |
	    always() &&
	    needs.config.outputs.run_ops == 'true' &&
	    (needs.build.result == 'success' || needs.build.result == 'skipped')
	  runs-on: linux-amd64-gpu-rtxpro6000-latest-1
	  steps:
	    - name: Create test results directory
	      run: mkdir -p ${{ github.workspace }}/test-results

	    - name: Login to GHCR
	      uses: docker/login-action@v3
	      with:
	        registry: ghcr.io
	        username: ${{ github.actor }}
	        password: ${{ secrets.GITHUB_TOKEN }}

	    # The host test-results directory is mounted into the container so the
	    # reports written by pytest are available to the upload steps below.
	    - name: Pull and run ops tests
	      timeout-minutes: 10
	      run: |
	        OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
	        IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}:${{ needs.config.outputs.image_tag }}"

	        docker pull "${IMAGE}"
	        docker run --rm \
	          --gpus all \
	          -v ${{ github.workspace }}/test-results:/test-results \
	          -w /workspace/tilegym \
	          "${IMAGE}" \
	          bash -c "pip install --no-cache-dir pytest-xdist pytest-html && \
	            pytest -s tests/ops -v -k test_op \
	              -n auto \
	              --junitxml=/test-results/ops-results.xml \
	              --html=/test-results/ops-report.html \
	              --self-contained-html"

	    # always(): upload whatever reports exist even when the tests failed.
	    - name: Upload test results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: ops-test-results
	        path: test-results/ops-*
	        retention-days: 30

	    - name: Publish test results
	      uses: EnricoMi/publish-unit-test-result-action@v2
	      if: always()
	      with:
	        files: test-results/ops-results.xml
	        check_name: Ops Test Results
	        # Quoted so YAML 1.1 parsers do not read bare `off` as boolean false.
	        comment_mode: 'off'


	test-benchmark:
	  # Runs the benchmark suite inside the image on a GPU runner. On
	  # main/nightly runs it also compares the results with the stored
	  # baseline and, when warranted, uploads a new baseline.
	  name: test-benchmark
	  needs: [config, build]
	  timeout-minutes: 30
	  # always() lets this job run even when `build` was skipped.
	  if: |
	    always() &&
	    needs.config.outputs.run_benchmark == 'true' &&
	    (needs.build.result == 'success' || needs.build.result == 'skipped')
	  runs-on: linux-amd64-gpu-rtxpro6000-latest-1
	  steps:
	    - name: Checkout code (sparse - only need scripts)
	      uses: actions/checkout@v4
	      with:
	        sparse-checkout: |
	          .github/scripts/format_benchmark_summary.py
	          .github/scripts/check_benchmark_regression.py
	        sparse-checkout-cone-mode: false

	    - name: Create test results directory
	      run: mkdir -p ${{ github.workspace }}/test-results

	    # Download previous baseline for regression detection
	    # Uses dawidd6 action to download artifacts from previous workflow runs
	    # (GitHub's built-in action only works within the same run)
	    - name: Download baseline benchmark results (nightly only)
	      if: needs.config.outputs.is_pr == 'false'
	      continue-on-error: true
	      uses: dawidd6/action-download-artifact@v3
	      with:
	        name: benchmark-baseline
	        path: ${{ github.workspace }}/baseline-results
	        workflow: tilegym-ci.yml
	        branch: main
	        # Search back for the most recent run that actually stored a
	        # `benchmark-baseline` artifact. Without this only the latest run
	        # is inspected, and a run that did not upload a baseline (neutral
	        # zone, or benchmarks skipped) would make the next run treat its
	        # own results as the "first" baseline, resetting the high water
	        # mark.
	        search_artifacts: true
	        if_no_artifact_found: warn

	    - name: Login to GHCR
	      uses: docker/login-action@v3
	      with:
	        registry: ghcr.io
	        username: ${{ github.actor }}
	        password: ${{ secrets.GITHUB_TOKEN }}

	    - name: Pull and run benchmarks
	      timeout-minutes: 25
	      run: |
	        OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
	        IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}:${{ needs.config.outputs.image_tag }}"

	        docker pull "${IMAGE}"
	        docker run --rm \
	          --gpus all \
	          -v ${{ github.workspace }}/test-results:/test-results \
	          -w /workspace/tilegym/tests/benchmark \
	          "${IMAGE}" \
	          ./run_all.sh /test-results --json

	    # Compare current results against baseline with three zones:
	    # - Regression zone (< -2%): Build fails
	    # - Neutral zone (-2% to +2%): Build passes, baseline NOT updated
	    # - Improvement zone (> +2%): Build passes, baseline updated
	    # Outputs: has_baseline, passed, should_update_baseline
	    # All three outputs are lowercase "true"/"false" so that the shell
	    # comparisons in later steps (which are case-sensitive) agree with them.
	    - name: Check for performance regressions (nightly only)
	      id: regression_check
	      if: needs.config.outputs.is_pr == 'false'
	      continue-on-error: false
	      run: |
	        if [ -d "${{ github.workspace }}/baseline-results" ] && [ "$(ls -A ${{ github.workspace }}/baseline-results/*.json 2>/dev/null)" ]; then
	          echo "Baseline results found, checking for regressions..."
	          echo "has_baseline=true" >> "$GITHUB_OUTPUT"

	          if python3 .github/scripts/check_benchmark_regression.py \
	            --current test-results \
	            --baseline baseline-results \
	            --threshold 2.0 \
	            --improvement-threshold 2.0 \
	            --output test-results/regression_report.json \
	            --fail-on-regression; then
	            echo "✅ No regressions detected"
	            echo "passed=true" >> "$GITHUB_OUTPUT"

	            # Check if we should update baseline (only if significant improvements).
	            # Python prints booleans as True/False; lower-case the value so it
	            # matches the "true"/"false" written by the other branches.
	            SHOULD_UPDATE=$(python3 -c "import json; print(str(json.load(open('test-results/regression_report.json'))['summary']['should_update_baseline']).lower())" 2>/dev/null || echo "false")
	            echo "should_update_baseline=${SHOULD_UPDATE}" >> "$GITHUB_OUTPUT"

	            if [ "$SHOULD_UPDATE" == "true" ]; then
	              echo "🎉 Significant improvements detected - will update baseline"
	            else
	              echo "🟡 Performance within neutral zone - baseline will not be updated"
	            fi
	          else
	            echo "❌ Performance regressions detected!"
	            echo "passed=false" >> "$GITHUB_OUTPUT"
	            echo "should_update_baseline=false" >> "$GITHUB_OUTPUT"
	            exit 1
	          fi
	        else
	          echo "No baseline results found - this will become the first baseline"
	          echo "has_baseline=false" >> "$GITHUB_OUTPUT"
	          echo "passed=true" >> "$GITHUB_OUTPUT"
	          echo "should_update_baseline=true" >> "$GITHUB_OUTPUT"
	        fi

	    - name: Debug - List test results directory
	      if: always()
	      run: |
	        echo "Contents of test-results directory:"
	        ls -lah ${{ github.workspace }}/test-results/ || echo "Directory does not exist"
	        echo ""
	        echo "JSON files:"
	        ls -lh ${{ github.workspace }}/test-results/*.json 2>/dev/null || echo "No JSON files found"

	    - name: Format benchmark summary
	      if: always()
	      run: python3 .github/scripts/format_benchmark_summary.py test-results

	    - name: Upload benchmark results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: benchmark-results
	        path: test-results/*.json
	        retention-days: 30

	    # HIGH WATER MARK STRATEGY WITH NEUTRAL ZONE:
	    # Three zones for baseline updates:
	    # 1. Regression zone (< -2%): Build fails, baseline NOT updated
	    # 2. Neutral zone (-2% to +2%): Build passes, baseline NOT updated (prevents noise)
	    # 3. Improvement zone (> +2%): Build passes, baseline updated (high water mark)
	    # This prevents both performance drift AND noisy baseline updates from small variations
	    # NOTE(review): the glob below also picks up regression_report.json when it
	    # exists, so the report becomes part of the stored baseline - confirm that
	    # check_benchmark_regression.py ignores it.
	    - name: Update baseline (only if significant improvements or first run)
	      if: |
	        needs.config.outputs.is_pr == 'false' &&
	        steps.regression_check.outputs.should_update_baseline == 'true'
	      uses: actions/upload-artifact@v4
	      with:
	        name: benchmark-baseline
	        path: test-results/*.json
	        retention-days: 90

	    # NOTE(review): this step has no always(), so it is skipped when the
	    # regression check fails; the final "regressions detected" branch looks
	    # unreachable in practice - confirm whether it should run on failure.
	    - name: Log baseline decision
	      if: needs.config.outputs.is_pr == 'false'
	      run: |
	        if [ "${{ steps.regression_check.outputs.should_update_baseline }}" == "true" ]; then
	          if [ "${{ steps.regression_check.outputs.has_baseline }}" == "false" ]; then
	            echo "✅ Baseline created: This is the first baseline"
	          else
	            echo "✅ Baseline updated: Performance improved by more than 2%"
	          fi
	        elif [ "${{ steps.regression_check.outputs.passed }}" == "true" ]; then
	          echo "🟡 Baseline NOT updated: Performance within ±2% (neutral zone)"
	          echo " Baseline preserved to avoid noise from small variations"
	        else
	          echo "❌ Baseline NOT updated: Performance regressions detected"
	        fi


	promote-to-latest:
	  # Re-tags the image built for this commit as `latest` and
	  # `<sha>-verified`. Runs only on main/nightly runs where the build, the
	  # ops tests and the benchmarks all succeeded.
	  name: promote-to-latest
	  needs: [config, build, test-ops, test-benchmark]
	  if: |
	    always() &&
	    needs.config.outputs.is_pr == 'false' &&
	    needs.build.result == 'success' &&
	    needs.test-ops.result == 'success' &&
	    needs.test-benchmark.result == 'success'
	  runs-on: ubuntu-latest
	  steps:
	    - name: Login to GHCR
	      uses: docker/login-action@v3
	      with:
	        registry: ghcr.io
	        username: ${{ github.actor }}
	        password: ${{ secrets.GITHUB_TOKEN }}

	    - name: Set up Docker Buildx
	      uses: docker/setup-buildx-action@v3

	    # `imagetools create` adds the new tags to the existing SHA-tagged
	    # image in the registry.
	    - name: Promote SHA to latest and mark as verified
	      run: |
	        OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
	        IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}"
	        SHA="${{ github.sha }}"

	        echo "Promoting ${IMAGE}:${SHA} to latest and adding verified tags (tests passed)"
	        docker buildx imagetools create \
	          -t "${IMAGE}:latest" \
	          -t "${IMAGE}:${SHA}-verified" \
	          "${IMAGE}:${SHA}"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Format benchmark files as json, add perf thresholds #38

Workflow file

Format benchmark files as json, add perf thresholds #38

Uh oh!

Workflow file for this run