USD-Exporting improvements #1384

Workflow file for this run

.github/workflows/pr_aws_gpu_tests.yml at da7f5aa

	# Display name of this workflow.
	name: GPU Unit Tests on AWS EC2

	# Workflow-wide settings shared by every job below.
	env:
	  # Region used for the SSM/EC2 calls and for the assumed credentials.
	  AWS_REGION: us-east-2
	  # Instance type and root volume passed to `aws ec2 run-instances`.
	  AWS_INSTANCE_TYPE: g6e.xlarge
	  AWS_VOLUME_SIZE: "64"
	  AWS_VOLUME_TYPE: gp3
	  AWS_SECURITY_GROUP_IDS: sg-07807c44e7f2a368a
	  # Role assumed by the configure-aws-credentials steps.
	  AWS_ROLE_ARN: arn:aws:iam::968945269301:role/newton-physics-newton-github-actions-role-Role-GbdM0RBoT4xW
	  # Used both as the role session duration (seconds) and as the SSM
	  # executionTimeout of the remote test script.
	  AWS_ROLE_DURATION: "7200"
	  # Bucket that receives the test reports and the SSM command output.
	  AWS_S3_BUCKET: newton-github-workflow-artifacts

	# The only trigger is pull_request_target; there is no push trigger.
	on:
	  pull_request_target:

	# Keep at most one active run per pull request: a new run for the same PR
	# cancels the one still in progress. The group is keyed on the PR number
	# rather than the head branch name, because unrelated pull requests (for
	# example from different forks) can share a branch name and would otherwise
	# cancel each other's runs.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
	  cancel-in-progress: true

	jobs:
	# Classifies the pull request author from the event payload and exposes the
	# result as the `membership_status` job output (CONFIRMED_MEMBER or
	# NOT_MEMBER) for downstream jobs.
	check-author-membership:
	  name: Check Author Membership
	  runs-on: ubuntu-latest
	  # The job only reads the event payload, so the token gets no permissions.
	  permissions: {}
	  outputs:
	    membership_status: ${{ steps.check_org.outputs.membership_status }}
	  steps:
	    - name: Check user's organization membership
	      id: check_org
	      env:
	        # Passed through the environment instead of being interpolated into
	        # the script text, so the value is never parsed as shell source.
	        ASSOCIATION: ${{ github.event.pull_request.author_association }}
	      run: |
	        echo "Author's association with the repository: ${ASSOCIATION}"

	        if [[ "${ASSOCIATION}" == "MEMBER" || "${ASSOCIATION}" == "OWNER" || "${ASSOCIATION}" == "COLLABORATOR" ]]; then
	          echo "Author is a recognized member, owner, or collaborator."
	          echo "membership_status=CONFIRMED_MEMBER" >> "$GITHUB_OUTPUT"
	        else
	          # Set the output for other jobs to use
	          echo "membership_status=NOT_MEMBER" >> "$GITHUB_OUTPUT"

	          # Print a message explaining the status and its impact on workflows.
	          echo "--------------------------------------------------------------------------------" >&2
	          echo "Thank you for your contribution!" >&2
	          echo "This is the expected status for community contributors. Certain automated" >&2
	          echo "workflows are reserved for verified organization members." >&2
	          echo "" >&2
	          echo "--------------------------------------------------------------------------------" >&2
	          echo "❓ Are you a member of the 'newton-physics' organization and believe this is an error?" >&2
	          echo "" >&2
	          echo "This can happen if your organization membership is set to 'Private'. To fix this," >&2
	          echo "please make your membership 'Public' to enable all workflow triggers:" >&2
	          echo "" >&2
	          echo "1. Go to the organization's People page: https://github.com/orgs/newton-physics/people" >&2
	          echo "2. Find your username in the list." >&2
	          echo "3. Click the dropdown next to your name and change your visibility from 'Private' to 'Public'." >&2
	          echo "" >&2
	          echo "After updating your visibility, push a new commit to this PR to re-run the check." >&2
	          echo "--------------------------------------------------------------------------------" >&2
	        fi

	# Launches a GPU EC2 instance, runs the Newton test suite on it through SSM,
	# pulls the reports and logs back from S3, and publishes them. The instance
	# ID is uploaded as an artifact so the separate cleanup job can terminate it.
	aws-unit-tests-pr:
	  name: Run GPU Unit Tests on AWS EC2 (Pull Request)
	  if: github.repository == 'newton-physics/newton'
	  needs: check-author-membership
	  # Authors that were not confirmed as members are routed through the
	  # 'external-pr-approval' environment; confirmed members get an empty
	  # environment name. NOTE(review): the approval rule itself lives in the
	  # repository settings, not in this file - confirm it is configured.
	  environment:
	    name: ${{ needs.check-author-membership.outputs.membership_status != 'CONFIRMED_MEMBER' && 'external-pr-approval' || '' }}
	    url: ${{ github.event.pull_request.html_url }}
	  runs-on: ubuntu-latest
	  timeout-minutes: 120
	  permissions:
	    id-token: write
	    contents: read
	  steps:
	    # NOTE(review): the other third-party actions in this job are pinned to
	    # a commit SHA; consider pinning this one (and upload-artifact) as well.
	    - name: Checkout repository
	      uses: actions/checkout@v4

	    - name: Configure AWS credentials
	      uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df
	      with:
	        aws-region: ${{ env.AWS_REGION }}
	        role-to-assume: ${{ env.AWS_ROLE_ARN }}
	        role-duration-seconds: ${{ env.AWS_ROLE_DURATION }}

	    - name: Launch EC2 instance
	      run: |
	        echo "Finding the latest AWS Deep Learning Base GPU AMI..."
	        LATEST_AMI_ID=$(aws ssm get-parameter --region ${{ env.AWS_REGION }} \
	          --name /aws/service/deeplearning/ami/x86_64/base-oss-nvidia-driver-gpu-ubuntu-24.04/latest/ami-id \
	          --query "Parameter.Value" \
	          --output text)
	        if [[ -z "$LATEST_AMI_ID" ]]; then
	          echo "❌ No AMI ID found. Exiting."
	          exit 1
	        fi

	        echo "Latest AMI ID found: $LATEST_AMI_ID"
	        echo "Launching EC2 instance..."

	        # --- Define all tags ---
	        TAGS="{Key=Name,Value=newton-github-workflow-runner},{Key=created-by,Value=github-actions-newton-role}"
	        TAGS="$TAGS,{Key=GitHub-Repository,Value=${{ github.repository }}}"
	        TAGS="$TAGS,{Key=Source-Event,Value=Pull-Request}"

	        INSTANCE_ID=$(aws ec2 run-instances \
	          --image-id "$LATEST_AMI_ID" \
	          --region ${{ env.AWS_REGION }} \
	          --instance-type ${{ env.AWS_INSTANCE_TYPE }} \
	          --security-group-ids ${{ env.AWS_SECURITY_GROUP_IDS }} \
	          --iam-instance-profile Name="NewtonEC2InstanceRole" \
	          --block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=${{ env.AWS_VOLUME_SIZE }},VolumeType=${{ env.AWS_VOLUME_TYPE }}}" \
	          --tag-specifications "ResourceType=instance,Tags=[$TAGS]" \
	          --query 'Instances[0].InstanceId' \
	          --output text)

	        echo "Instance launched with ID: $INSTANCE_ID"
	        echo "INSTANCE_ID=$INSTANCE_ID" >> "$GITHUB_ENV"
	        # Written before the wait so the ID is recorded even if the wait fails.
	        echo "$INSTANCE_ID" > instance_id.txt

	        echo "Waiting for instance to be running..."
	        aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID"

	    # Runs even after a failure so the cleanup job can find the instance.
	    - name: Upload instance ID artifact
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: unittest-instance-id-artifact
	        path: instance_id.txt

	    - name: Generate script to run on instance
	      env:
	        BRANCH_NAME_ENV: ${{ github.head_ref }}
	        S3_BUCKET: ${{ env.AWS_S3_BUCKET }}
	        PR_NUMBER: ${{ github.event.number }}
	        IS_FORK: ${{ github.event.pull_request.head.repo.fork }}
	        # Commit that triggered this run; fork PRs are pinned to it below.
	        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
	      run: |
	        # A unique key for this run to avoid file collisions
	        S3_KEY="reports/${{ github.run_id }}/${{ github.run_attempt }}"
	        SSM_LOG_PREFIX="ssm-logs/${{ github.run_id }}/${{ github.run_attempt }}"
	        # %q produces a form that is safe to paste UNQUOTED into shell source.
	        SAFE_BRANCH=$(printf '%q' "${BRANCH_NAME_ENV}")

	        echo "S3_KEY=${S3_KEY}" >> "$GITHUB_ENV"
	        echo "SSM_LOG_PREFIX=${SSM_LOG_PREFIX}" >> "$GITHUB_ENV"
	        echo "SAFE_BRANCH=${SAFE_BRANCH}" >> "$GITHUB_ENV"

	        # The heredoc is unquoted: unescaped variables expand here on the
	        # runner, backslash-escaped ones are left for the remote instance.
	        cat << EOF > remote_script.sh
	        #!/bin/bash
	        set -euo pipefail

	        # These are expanded immediately on the GitHub runner
	        S3_BUCKET="${S3_BUCKET}"
	        S3_KEY="${S3_KEY}"

	        function finish {
	          # This is escaped to run on the remote instance
	          EXIT_CODE=\$?
	          echo "--- Script finished with exit code \$EXIT_CODE ---"
	          aws s3 cp /tmp/rspec.xml "s3://\${S3_BUCKET}/\${S3_KEY}/rspec.xml" 2>&1 || true
	          aws s3 cp /tmp/coverage.xml "s3://\${S3_BUCKET}/\${S3_KEY}/coverage.xml" 2>&1 || true
	        }
	        trap finish EXIT

	        echo "--- Starting sequence on EC2 instance ---"
	        date

	        # These are expanded immediately on the GitHub runner
	        REPO_URL="${{ github.server_url }}/${{ github.repository }}.git"
	        REPO_DIR="${{ github.event.repository.name }}"

	        curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/tmp/uv-installation" sh
	        source /tmp/uv-installation/env

	        # --- Cloning Repository ---
	        git clone "\$REPO_URL" "\$REPO_DIR"
	        cd "\$REPO_DIR"

	        # --- Checkout correct commit ---
	        if [ "${IS_FORK}" == "true" ]; then
	          echo "Pull request from a forked repository. Fetching and checking out PR head."
	          git fetch origin pull/${PR_NUMBER}/head
	          # Pin to the commit that triggered this run instead of whatever the
	          # PR head points to by the time the instance fetches it.
	          git checkout "${HEAD_SHA}"
	        else
	          echo "Pull request from the same repository. Checking out the branch directly."
	          # The branch name was escaped with printf %q on the runner, so it is
	          # inserted without surrounding quotes; quoting it again would keep
	          # the escape backslashes as literal characters.
	          git checkout ${SAFE_BRANCH}
	        fi

	        echo "Running Newton test suite..."
	        uv run --extra dev --extra torch-cu12 -m newton.tests --junit-report-xml /tmp/rspec.xml --coverage --coverage-xml /tmp/coverage.xml --serial-fallback --failfast 2>&1
	        EOF

	        # Prepare script to be passed as SSM parameters
	        jq -n --arg script_body "$(cat remote_script.sh)" \
	          --arg timeout "$AWS_ROLE_DURATION" \
	          '{ "commands": [$script_body], "executionTimeout": [$timeout] }' > ssm_params.json

	        # Echo script for debugging
	        cat remote_script.sh

	    - name: Run script on instance
	      env:
	        S3_BUCKET: ${{ env.AWS_S3_BUCKET }}
	      run: |
	        echo "--- Sending command to instance $INSTANCE_ID via SSM ---"
	        COMMAND_ID=$(aws ssm send-command \
	          --instance-ids "$INSTANCE_ID" \
	          --document-name "AWS-RunShellScript" \
	          --comment "Running Newton GPU unit tests" \
	          --parameters file://ssm_params.json \
	          --output-s3-bucket-name "$S3_BUCKET" \
	          --output-s3-key-prefix "$SSM_LOG_PREFIX" \
	          --query "Command.CommandId" \
	          --output text)

	        echo "SSM Command ID: $COMMAND_ID"

	        echo "--- Waiting for command to complete... ---"
	        final_status="Success"
	        TIMEOUT=6600
	        INTERVAL=30
	        MAX_POLL_FAILURES=5
	        elapsed_time=0
	        poll_failures=0

	        while [ $elapsed_time -lt $TIMEOUT ]; do
	          # A failed poll (for example when the invocation is not visible yet
	          # right after send-command) is retried a few times instead of
	          # aborting the step before any logs can be collected.
	          if STATUS=$(aws ssm get-command-invocation \
	            --command-id "$COMMAND_ID" \
	            --instance-id "$INSTANCE_ID" \
	            --query "Status" \
	            --output text); then
	            poll_failures=0
	          else
	            poll_failures=$((poll_failures + 1))
	            echo "Status poll failed (${poll_failures}/${MAX_POLL_FAILURES})."
	            if [ $poll_failures -ge $MAX_POLL_FAILURES ]; then
	              final_status="StatusPollFailed"
	              break
	            fi
	            STATUS="Pending"
	          fi

	          if [[ "$STATUS" == "Success" ]]; then
	            break
	          elif [[ "$STATUS" == "InProgress" || "$STATUS" == "Pending" || "$STATUS" == "Delayed" ]]; then
	            # Non-terminal states: keep waiting.
	            :
	          else
	            final_status="$STATUS"
	            echo "SSM command failed or returned unexpected status: $STATUS"
	            break
	          fi

	          sleep $INTERVAL
	          elapsed_time=$((elapsed_time + INTERVAL))
	          echo "Current status: $STATUS. Waited $elapsed_time seconds..."
	        done

	        if [ $elapsed_time -ge $TIMEOUT ]; then
	          final_status="Timeout"
	        fi

	        echo "--- Downloading artifacts from S3 ---"
	        # Do not abort here: missing reports are detected by the final status
	        # check below, and the command log should still be fetched first.
	        aws s3 cp "s3://${S3_BUCKET}/${S3_KEY}/" . --recursive \
	          || echo "Warning: could not download test artifacts from S3."

	        # --- Get the exact S3 output URL from the command invocation ---
	        S3_STDOUT_URL=$(aws ssm get-command-invocation --command-id "$COMMAND_ID" --instance-id "$INSTANCE_ID" --query "StandardOutputUrl" --output text)

	        if [ -z "$S3_STDOUT_URL" ]; then
	          echo "❌ Could not retrieve S3 output URL. Cannot fetch logs."
	          exit 1
	        fi

	        # Convert the https:// URL to an s3:// URI
	        BUCKET_AND_KEY=$(echo "$S3_STDOUT_URL" | sed 's|https://s3\.[^.]*\.amazonaws\.com/||')
	        S3_URI="s3://${BUCKET_AND_KEY}"

	        LOCAL_LOG_FILE="ssm_output.log"
	        echo "Downloading combined log from ${S3_URI}"

	        if ! aws s3 cp "${S3_URI}" "${LOCAL_LOG_FILE}"; then
	          echo "❌ Failed to download logs from S3."
	          exit 1
	        fi

	        echo "--- Downloaded Log Output ---"
	        cat "${LOCAL_LOG_FILE}"

	        # --- Final status check ---
	        # A successful command must also have produced both report files.
	        if [[ "$final_status" == "Success" ]]; then
	          if [ ! -f "rspec.xml" ]; then
	            final_status="MissingRspecFile"
	          elif [ ! -f "coverage.xml" ]; then
	            final_status="MissingCoverageFile"
	          fi
	        fi

	        if [[ "$final_status" != "Success" ]]; then
	          echo "❌ Workflow failed with status: $final_status"
	          exit 1
	        else
	          echo "✅ Workflow completed successfully."
	        fi

	    # The reporting steps below run after a test failure too, but not after
	    # a cancellation.
	    - name: Test Summary
	      if: ${{ !cancelled() }}
	      uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86
	      with:
	        paths: "rspec.xml"
	        show: "fail"

	    - name: Upload coverage reports to Codecov
	      if: ${{ !cancelled() }}
	      uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
	      with:
	        env_vars: AWS_INSTANCE_TYPE
	        files: ./coverage.xml
	        flags: unittests
	        token: ${{ secrets.CODECOV_TOKEN }}

	    - name: Save test artifacts
	      if: ${{ !cancelled() }}
	      uses: actions/upload-artifact@v4
	      with:
	        if-no-files-found: ignore
	        name: test-artifacts
	        path: |
	          rspec.xml
	          coverage.xml
	          ssm_output.log

	cleanup:
	  # This runs as a separate job to ensure cleanup always occurs, even if the
	  # main job's runner fails unexpectedly. This prevents orphaned EC2 instances.
	  name: Cleanup EC2 Instance
	  runs-on: ubuntu-latest
	  needs: aws-unit-tests-pr
	  if: always() && github.repository == 'newton-physics/newton'
	  permissions:
	    id-token: write
	    contents: read
	  steps:
	    # The artifact does not exist when the test job was skipped or failed
	    # before launching an instance. Without continue-on-error the download
	    # failure would fail this job before the "nothing to terminate" checks
	    # in the last step could run.
	    - name: Download instance ID artifact
	      continue-on-error: true
	      uses: actions/download-artifact@v4
	      with:
	        name: unittest-instance-id-artifact
	        path: .

	    - name: Configure AWS Credentials
	      uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df
	      with:
	        aws-region: ${{ env.AWS_REGION }}
	        role-to-assume: ${{ env.AWS_ROLE_ARN }}

	    - name: Read instance ID and terminate EC2 Instance
	      run: |
	        # A missing or empty ID file means no instance was recorded for this
	        # run, so the step exits successfully without calling EC2.
	        if [ ! -f instance_id.txt ]; then
	          echo "Instance ID file not found. Nothing to terminate."
	          exit 0
	        fi
	        INSTANCE_ID=$(cat instance_id.txt)
	        if [ -z "$INSTANCE_ID" ]; then
	          echo "Instance ID is empty. Nothing to terminate."
	          exit 0
	        fi
	        echo "Terminating instance: $INSTANCE_ID"
	        aws ec2 terminate-instances --instance-ids "$INSTANCE_ID"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

USD-Exporting improvements #1384

Workflow file

USD-Exporting improvements #1384

Uh oh!

Jobs

Run details

Workflow file for this run