USD-Exporting improvements #1384
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: GPU Unit Tests on AWS EC2 | |
# Workflow configuration variables | |
env: | |
AWS_REGION: us-east-2 | |
AWS_INSTANCE_TYPE: g6e.xlarge | |
AWS_VOLUME_SIZE: 64 | |
AWS_VOLUME_TYPE: gp3 | |
AWS_SECURITY_GROUP_IDS: sg-07807c44e7f2a368a | |
AWS_ROLE_ARN: arn:aws:iam::968945269301:role/newton-physics-newton-github-actions-role-Role-GbdM0RBoT4xW | |
AWS_ROLE_DURATION: 7200 | |
AWS_S3_BUCKET: newton-github-workflow-artifacts | |
on: | |
pull_request_target: | |
# Cancels in-progress runs of this workflow for the same pull request, | |
# but allows parallel runs for pushes to the main branch. | |
concurrency: | |
group: ${{ github.workflow }}-${{ github.head_ref }} | |
cancel-in-progress: true | |
jobs: | |
check-author-membership: | |
name: Check Author Membership | |
runs-on: ubuntu-latest | |
permissions: {} | |
outputs: | |
membership_status: ${{ steps.check_org.outputs.membership_status }} | |
steps: | |
- name: Check user's organization membership | |
id: check_org | |
run: | | |
ASSOCIATION="${{ github.event.pull_request.author_association }}" | |
echo "Author's association with the repository: ${ASSOCIATION}" | |
if [[ "${ASSOCIATION}" == "MEMBER" || "${ASSOCIATION}" == "OWNER" || "${ASSOCIATION}" == "COLLABORATOR" ]]; then | |
echo "Author is a recognized member, owner, or collaborator." | |
echo "membership_status=CONFIRMED_MEMBER" >> "$GITHUB_OUTPUT" | |
else | |
# Set the output for other jobs to use | |
echo "membership_status=NOT_MEMBER" >> "$GITHUB_OUTPUT" | |
# Print a message explaining the status and its impact on workflows. | |
echo "--------------------------------------------------------------------------------" >&2 | |
echo "Thank you for your contribution!" >&2 | |
echo "This is the expected status for community contributors. Certain automated" >&2 | |
echo "workflows are reserved for verified organization members." >&2 | |
echo "" >&2 | |
echo "--------------------------------------------------------------------------------" >&2 | |
echo "❓ Are you a member of the 'newton-physics' organization and believe this is an error?" >&2 | |
echo "" >&2 | |
echo "This can happen if your organization membership is set to 'Private'. To fix this," >&2 | |
echo "please make your membership 'Public' to enable all workflow triggers:" >&2 | |
echo "" >&2 | |
echo "1. Go to the organization's People page: https://github.com/orgs/newton-physics/people" >&2 | |
echo "2. Find your username in the list." >&2 | |
echo "3. Click the dropdown next to your name and change your visibility from 'Private' to 'Public'." >&2 | |
echo "" >&2 | |
echo "After updating your visibility, push a new commit to this PR to re-run the check." >&2 | |
echo "--------------------------------------------------------------------------------" >&2 | |
fi | |
aws-unit-tests-pr: | |
name: Run GPU Unit Tests on AWS EC2 (Pull Request) | |
if: github.repository == 'newton-physics/newton' | |
needs: check-author-membership | |
environment: | |
name: ${{ needs.check-author-membership.outputs.membership_status != 'CONFIRMED_MEMBER' && 'external-pr-approval' || '' }} | |
url: ${{ github.event.pull_request.html_url }} | |
runs-on: ubuntu-latest | |
timeout-minutes: 120 | |
permissions: | |
id-token: write | |
contents: read | |
steps: | |
- name: Checkout repository | |
uses: actions/checkout@v4 | |
- name: Configure AWS credentials | |
uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df | |
with: | |
aws-region: ${{ env.AWS_REGION }} | |
role-to-assume: ${{ env.AWS_ROLE_ARN }} | |
role-duration-seconds: ${{ env.AWS_ROLE_DURATION }} | |
- name: Launch EC2 instance | |
run: | | |
echo "Finding the latest AWS Deep Learning Base GPU AMI..." | |
LATEST_AMI_ID=$(aws ssm get-parameter --region ${{ env.AWS_REGION }} \ | |
--name /aws/service/deeplearning/ami/x86_64/base-oss-nvidia-driver-gpu-ubuntu-24.04/latest/ami-id \ | |
--query "Parameter.Value" \ | |
--output text) | |
if [[ -z "$LATEST_AMI_ID" ]]; then | |
echo "❌ No AMI ID found. Exiting." | |
exit 1 | |
fi | |
echo "Latest AMI ID found: $LATEST_AMI_ID" | |
echo "Launching EC2 instance..." | |
# --- Define all tags --- | |
TAGS="{Key=Name,Value=newton-github-workflow-runner},{Key=created-by,Value=github-actions-newton-role}" | |
TAGS="$TAGS,{Key=GitHub-Repository,Value=${{ github.repository }}}" | |
TAGS="$TAGS,{Key=Source-Event,Value=Pull-Request}" | |
INSTANCE_ID=$(aws ec2 run-instances \ | |
--image-id $LATEST_AMI_ID \ | |
--region ${{ env.AWS_REGION }} \ | |
--instance-type ${{ env.AWS_INSTANCE_TYPE }} \ | |
--security-group-ids ${{ env.AWS_SECURITY_GROUP_IDS }} \ | |
--iam-instance-profile Name="NewtonEC2InstanceRole" \ | |
--block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=${{ env.AWS_VOLUME_SIZE }},VolumeType=${{ env.AWS_VOLUME_TYPE }}}" \ | |
--tag-specifications "ResourceType=instance,Tags=[$TAGS]" \ | |
--query 'Instances[0].InstanceId' \ | |
--output text) | |
echo "Instance launched with ID: $INSTANCE_ID" | |
echo "INSTANCE_ID=$INSTANCE_ID" >> "$GITHUB_ENV" | |
echo "$INSTANCE_ID" > instance_id.txt | |
echo "Waiting for instance to be running..." | |
aws ec2 wait instance-status-ok --instance-ids $INSTANCE_ID | |
- name: Upload instance ID artifact | |
if: always() | |
uses: actions/upload-artifact@v4 | |
with: | |
name: unittest-instance-id-artifact | |
path: instance_id.txt | |
- name: Generate script to run on instance | |
env: | |
BRANCH_NAME_ENV: ${{ github.head_ref }} | |
S3_BUCKET: ${{ env.AWS_S3_BUCKET }} | |
PR_NUMBER: ${{ github.event.number }} | |
IS_FORK: ${{ github.event.pull_request.head.repo.fork }} | |
run: | | |
# A unique key for this run to avoid file collisions | |
S3_KEY="reports/${{ github.run_id }}/${{ github.run_attempt }}" | |
SSM_LOG_PREFIX="ssm-logs/${{ github.run_id }}/${{ github.run_attempt }}" | |
SAFE_BRANCH=$(printf '%q' "${BRANCH_NAME_ENV}") | |
echo "S3_KEY=${S3_KEY}" >> "$GITHUB_ENV" | |
echo "SSM_LOG_PREFIX=${SSM_LOG_PREFIX}" >> "$GITHUB_ENV" | |
echo "SAFE_BRANCH=${SAFE_BRANCH}" >> "$GITHUB_ENV" | |
cat << EOF > remote_script.sh | |
#!/bin/bash | |
set -euo pipefail | |
# These are expanded immediately on the GitHub runner | |
S3_BUCKET="${S3_BUCKET}" | |
S3_KEY="${S3_KEY}" | |
function finish { | |
# This is escaped to run on the remote instance | |
EXIT_CODE=\$? | |
echo "--- Script finished with exit code \$EXIT_CODE ---" | |
aws s3 cp /tmp/rspec.xml "s3://\${S3_BUCKET}/\${S3_KEY}/rspec.xml" 2>&1 || true | |
aws s3 cp /tmp/coverage.xml "s3://\${S3_BUCKET}/\${S3_KEY}/coverage.xml" 2>&1 || true | |
} | |
trap finish EXIT | |
echo "--- Starting sequence on EC2 instance ---" | |
date | |
# These are expanded immediately on the GitHub runner | |
REPO_URL="${{ github.server_url }}/${{ github.repository }}.git" | |
REPO_DIR="${{ github.event.repository.name }}" | |
curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/tmp/uv-installation" sh | |
source /tmp/uv-installation/env | |
# --- Cloning Repository --- | |
git clone "\$REPO_URL" "\$REPO_DIR" | |
cd "\$REPO_DIR" | |
# --- Checkout correct commit --- | |
if [ "${IS_FORK}" == "true" ]; then | |
echo "Pull request from a forked repository. Fetching and checking out PR head." | |
git fetch origin pull/${PR_NUMBER}/head | |
git checkout FETCH_HEAD | |
else | |
echo "Pull request from the same repository. Checking out the branch directly." | |
git checkout "${SAFE_BRANCH}" | |
fi | |
echo "Running Newton test suite..." | |
uv run --extra dev --extra torch-cu12 -m newton.tests --junit-report-xml /tmp/rspec.xml --coverage --coverage-xml /tmp/coverage.xml --serial-fallback --failfast 2>&1 | |
EOF | |
# Prepare script to be passed as SSM parameters | |
jq -n --arg script_body "$(cat remote_script.sh)" \ | |
--arg timeout "$AWS_ROLE_DURATION" \ | |
'{ "commands": [$script_body], "executionTimeout": [$timeout] }' > ssm_params.json | |
# Echo script for debugging | |
cat remote_script.sh | |
- name: Run script on instance | |
env: | |
S3_BUCKET: ${{ env.AWS_S3_BUCKET }} | |
run: | | |
echo "--- Sending command to instance $INSTANCE_ID via SSM ---" | |
COMMAND_ID=$(aws ssm send-command \ | |
--instance-ids "$INSTANCE_ID" \ | |
--document-name "AWS-RunShellScript" \ | |
--comment "Running Newton GPU unit tests" \ | |
--parameters file://ssm_params.json \ | |
--output-s3-bucket-name "$S3_BUCKET" \ | |
--output-s3-key-prefix "$SSM_LOG_PREFIX" \ | |
--query "Command.CommandId" \ | |
--output text) | |
echo "SSM Command ID: $COMMAND_ID" | |
echo "--- Waiting for command to complete... ---" | |
final_status="Success" | |
TIMEOUT=6600 | |
INTERVAL=30 | |
elapsed_time=0 | |
while [ $elapsed_time -lt $TIMEOUT ]; do | |
STATUS=$(aws ssm get-command-invocation \ | |
--command-id "$COMMAND_ID" \ | |
--instance-id "$INSTANCE_ID" \ | |
--query "Status" \ | |
--output text) | |
if [[ "$STATUS" == "Success" ]]; then | |
break | |
elif [[ "$STATUS" == "InProgress" || "$STATUS" == "Pending" ]]; then | |
: | |
else | |
final_status="$STATUS" | |
echo "SSM command failed or returned unexpected status: $STATUS" | |
break | |
fi | |
sleep $INTERVAL | |
elapsed_time=$((elapsed_time + INTERVAL)) | |
echo "Current status: $STATUS. Waited $elapsed_time seconds..." | |
done | |
if [ $elapsed_time -ge $TIMEOUT ]; then | |
final_status="Timeout" | |
fi | |
echo "--- Downloading artifacts from S3 ---" | |
aws s3 cp "s3://${S3_BUCKET}/${S3_KEY}/" . --recursive | |
# --- Get the exact S3 output URL from the command invocation --- | |
S3_STDOUT_URL=$(aws ssm get-command-invocation --command-id "$COMMAND_ID" --instance-id "$INSTANCE_ID" --query "StandardOutputUrl" --output text) | |
if [ -z "$S3_STDOUT_URL" ]; then | |
echo "❌ Could not retrieve S3 output URL. Cannot fetch logs." | |
exit 1 | |
fi | |
# Convert the https:// URL to an s3:// URI | |
BUCKET_AND_KEY=$(echo "$S3_STDOUT_URL" | sed 's|https://s3\.[^.]*\.amazonaws\.com/||') | |
S3_URI="s3://${BUCKET_AND_KEY}" | |
LOCAL_LOG_FILE="ssm_output.log" | |
echo "Downloading combined log from ${S3_URI}" | |
if ! aws s3 cp "${S3_URI}" "${LOCAL_LOG_FILE}"; then | |
echo "❌ Failed to download logs from S3." | |
exit 1 | |
fi | |
echo "--- Downloaded Log Output ---" | |
cat "${LOCAL_LOG_FILE}" | |
# --- Final status check --- | |
if [[ "$final_status" == "Success" ]]; then | |
if [ ! -f "rspec.xml" ]; then | |
final_status="MissingRspecFile" | |
elif [ ! -f "coverage.xml" ]; then | |
final_status="MissingCoverageFile" | |
fi | |
fi | |
if [[ "$final_status" != "Success" ]]; then | |
echo "❌ Workflow failed with status: $final_status" | |
exit 1 | |
else | |
echo "✅ Workflow completed successfully." | |
fi | |
- name: Test Summary | |
if: ${{ !cancelled() }} | |
uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 | |
with: | |
paths: "rspec.xml" | |
show: "fail" | |
- name: Upload coverage reports to Codecov | |
if: ${{ !cancelled() }} | |
uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d | |
with: | |
env_vars: AWS_INSTANCE_TYPE | |
files: ./coverage.xml | |
flags: unittests | |
token: ${{ secrets.CODECOV_TOKEN }} | |
- name: Save test artifacts | |
if: ${{ !cancelled() }} | |
uses: actions/upload-artifact@v4 | |
with: | |
if-no-files-found: ignore | |
name: test-artifacts | |
path: | | |
rspec.xml | |
coverage.xml | |
ssm_output.log | |
cleanup: | |
# This runs as a separate job to ensure cleanup always occurs, even if the | |
# main job's runner fails unexpectedly. This prevents orphaned EC2 instances. | |
name: Cleanup EC2 Instance | |
runs-on: ubuntu-latest | |
needs: aws-unit-tests-pr | |
if: always() && github.repository == 'newton-physics/newton' | |
permissions: | |
id-token: write | |
contents: read | |
steps: | |
- name: Download instance ID artifact | |
uses: actions/download-artifact@v4 | |
with: | |
name: unittest-instance-id-artifact | |
path: . | |
- name: Configure AWS Credentials | |
uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df | |
with: | |
aws-region: ${{ env.AWS_REGION }} | |
role-to-assume: ${{ env.AWS_ROLE_ARN }} | |
- name: Read instance ID and terminate EC2 Instance | |
run: | | |
if [ ! -f instance_id.txt ]; then | |
echo "Instance ID file not found. Nothing to terminate." | |
exit 0 | |
fi | |
INSTANCE_ID=$(cat instance_id.txt) | |
if [ -z "$INSTANCE_ID" ]; then | |
echo "Instance ID is empty. Nothing to terminate." | |
exit 0 | |
fi | |
echo "Terminating instance: $INSTANCE_ID" | |
aws ec2 terminate-instances --instance-ids $INSTANCE_ID |