Skip to content

USD-Exporting improvements #1384

USD-Exporting improvements

USD-Exporting improvements #1384

name: GPU Unit Tests on AWS EC2

# Workflow configuration variables
env:
  # Region used for the AWS credentials and every AWS CLI call below.
  AWS_REGION: 'us-east-2'
  # Instance type passed to `aws ec2 run-instances`.
  AWS_INSTANCE_TYPE: 'g6e.xlarge'
  # Size and type of the /dev/sda1 EBS volume of the launched instance.
  AWS_VOLUME_SIZE: '64'
  AWS_VOLUME_TYPE: 'gp3'
  # Security group attached to the launched instance.
  AWS_SECURITY_GROUP_IDS: 'sg-07807c44e7f2a368a'
  # IAM role assumed by the jobs that talk to AWS.
  AWS_ROLE_ARN: 'arn:aws:iam::968945269301:role/newton-physics-newton-github-actions-role-Role-GbdM0RBoT4xW'
  # Used both as the role session duration and as the SSM execution timeout.
  AWS_ROLE_DURATION: '7200'
  # Bucket that receives the test reports and the SSM command output.
  AWS_S3_BUCKET: 'newton-github-workflow-artifacts'
# This workflow is triggered by pull requests only (pull_request_target).
on:
  pull_request_target:

# Cancels in-progress runs of this workflow for the same pull request.
# The group is keyed on the pull request number rather than on the head
# branch name: the branch name alone is not unique, so pull requests opened
# from different forks that share a branch name (e.g. "main") would otherwise
# land in the same group and cancel each other's runs.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true
jobs:
# Classifies the pull request author from the event payload and publishes the
# result as the `membership_status` job output
# (CONFIRMED_MEMBER or NOT_MEMBER).
check-author-membership:
  name: Check Author Membership
  runs-on: ubuntu-latest
  # The job only reads the event payload, so it requests no token permissions.
  permissions: {}
  outputs:
    membership_status: ${{ steps.check_org.outputs.membership_status }}
  steps:
    - name: Check user's organization membership
      id: check_org
      run: |
        ASSOCIATION="${{ github.event.pull_request.author_association }}"
        echo "Author's association with the repository: ${ASSOCIATION}"

        case "${ASSOCIATION}" in
          MEMBER | OWNER | COLLABORATOR)
            echo "Author is a recognized member, owner, or collaborator."
            echo "membership_status=CONFIRMED_MEMBER" >> "$GITHUB_OUTPUT"
            ;;
          *)
            # Set the output for other jobs to use
            echo "membership_status=NOT_MEMBER" >> "$GITHUB_OUTPUT"
            # Explain on stderr what the status means and how a private
            # organization membership can be made visible.
            printf '%s\n' \
              "--------------------------------------------------------------------------------" \
              "Thank you for your contribution!" \
              "This is the expected status for community contributors. Certain automated" \
              "workflows are reserved for verified organization members." \
              "" \
              "--------------------------------------------------------------------------------" \
              "❓ Are you a member of the 'newton-physics' organization and believe this is an error?" \
              "" \
              "This can happen if your organization membership is set to 'Private'. To fix this," \
              "please make your membership 'Public' to enable all workflow triggers:" \
              "" \
              "1. Go to the organization's People page: https://github.com/orgs/newton-physics/people" \
              "2. Find your username in the list." \
              "3. Click the dropdown next to your name and change your visibility from 'Private' to 'Public'." \
              "" \
              "After updating your visibility, push a new commit to this PR to re-run the check." \
              "--------------------------------------------------------------------------------" >&2
            ;;
        esac
# Launches the EC2 instance, runs the test suite on it and collects the
# reports. Waits for the membership check so it can pick the environment.
aws-unit-tests-pr:
  name: Run GPU Unit Tests on AWS EC2 (Pull Request)
  needs: check-author-membership
  # Only runs in the newton-physics/newton repository itself.
  if: github.repository == 'newton-physics/newton'
  runs-on: ubuntu-latest
  timeout-minutes: 120
  # Authors that were not confirmed as members are routed through the
  # 'external-pr-approval' environment; confirmed members get an empty
  # environment name.
  environment:
    name: ${{ needs.check-author-membership.outputs.membership_status != 'CONFIRMED_MEMBER' && 'external-pr-approval' || '' }}
    url: ${{ github.event.pull_request.html_url }}
  # NOTE(review): id-token: write is presumably needed for the role
  # assumption in the AWS credentials step; confirm.
  permissions:
    contents: read
    id-token: write
  steps:
# Check out the repository into the runner workspace.
- name: Checkout repository
  uses: actions/checkout@v4
# Assume the workflow's IAM role for the AWS CLI calls in the following
# steps. The action is pinned to a full commit SHA.
- name: Configure AWS credentials
  uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df
  with:
    role-to-assume: ${{ env.AWS_ROLE_ARN }}
    role-duration-seconds: ${{ env.AWS_ROLE_DURATION }}
    aws-region: ${{ env.AWS_REGION }}
# Looks up the AMI ID, launches one tagged instance, records its ID (in the
# job environment and in instance_id.txt) and waits for its status checks.
- name: Launch EC2 instance
  run: |
    echo "Finding the latest AWS Deep Learning Base GPU AMI..."
    ami_id=$(aws ssm get-parameter \
      --region ${{ env.AWS_REGION }} \
      --name /aws/service/deeplearning/ami/x86_64/base-oss-nvidia-driver-gpu-ubuntu-24.04/latest/ami-id \
      --query "Parameter.Value" \
      --output text)
    if [[ -z "$ami_id" ]]; then
      echo "❌ No AMI ID found. Exiting."
      exit 1
    fi
    echo "Latest AMI ID found: $ami_id"

    echo "Launching EC2 instance..."
    # --- Define all tags ---
    tag_entries=(
      "{Key=Name,Value=newton-github-workflow-runner}"
      "{Key=created-by,Value=github-actions-newton-role}"
      "{Key=GitHub-Repository,Value=${{ github.repository }}}"
      "{Key=Source-Event,Value=Pull-Request}"
    )
    # Join the entries with commas, as expected inside Tags=[...].
    tag_list=$(IFS=,; echo "${tag_entries[*]}")

    INSTANCE_ID=$(aws ec2 run-instances \
      --image-id "$ami_id" \
      --region ${{ env.AWS_REGION }} \
      --instance-type ${{ env.AWS_INSTANCE_TYPE }} \
      --security-group-ids ${{ env.AWS_SECURITY_GROUP_IDS }} \
      --iam-instance-profile Name="NewtonEC2InstanceRole" \
      --block-device-mappings "DeviceName=/dev/sda1,Ebs={VolumeSize=${{ env.AWS_VOLUME_SIZE }},VolumeType=${{ env.AWS_VOLUME_TYPE }}}" \
      --tag-specifications "ResourceType=instance,Tags=[$tag_list]" \
      --query 'Instances[0].InstanceId' \
      --output text)
    echo "Instance launched with ID: $INSTANCE_ID"

    # Make the ID available to later steps and, through the file, to the
    # artifact uploaded by the next step.
    echo "INSTANCE_ID=$INSTANCE_ID" >> "$GITHUB_ENV"
    echo "$INSTANCE_ID" > instance_id.txt

    echo "Waiting for instance to be running..."
    aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID"
# Publish instance_id.txt as an artifact. Runs even when an earlier step
# failed so that the ID is still published for the cleanup job.
- name: Upload instance ID artifact
  if: always()
  uses: actions/upload-artifact@v4
  with:
    path: instance_id.txt
    name: unittest-instance-id-artifact
# Writes remote_script.sh (the script executed on the EC2 instance) and wraps
# it into ssm_params.json for `aws ssm send-command`. Also exports S3_KEY,
# SSM_LOG_PREFIX and SAFE_BRANCH to the job environment for later steps.
- name: Generate script to run on instance
  env:
    BRANCH_NAME_ENV: ${{ github.head_ref }}
    S3_BUCKET: ${{ env.AWS_S3_BUCKET }}
    PR_NUMBER: ${{ github.event.number }}
    IS_FORK: ${{ github.event.pull_request.head.repo.fork }}
  run: |
    # A unique key for this run to avoid file collisions
    S3_KEY="reports/${{ github.run_id }}/${{ github.run_attempt }}"
    SSM_LOG_PREFIX="ssm-logs/${{ github.run_id }}/${{ github.run_attempt }}"
    # Shell-escape the branch name. The result is already a complete shell
    # word, so it must be inserted into the generated script WITHOUT
    # surrounding quotes; inside double quotes the backslashes added for
    # characters such as ( ) & ; ' would be kept literally and the checkout
    # would look for a ref that does not exist.
    SAFE_BRANCH=$(printf '%q' "${BRANCH_NAME_ENV}")
    echo "S3_KEY=${S3_KEY}" >> "$GITHUB_ENV"
    echo "SSM_LOG_PREFIX=${SSM_LOG_PREFIX}" >> "$GITHUB_ENV"
    echo "SAFE_BRANCH=${SAFE_BRANCH}" >> "$GITHUB_ENV"

    # The here-document is unquoted: plain variables are expanded now, on the
    # GitHub runner, while backslash-escaped ones are left for the instance.
    cat << EOF > remote_script.sh
    #!/bin/bash
    set -euo pipefail
    # These are expanded immediately on the GitHub runner
    S3_BUCKET="${S3_BUCKET}"
    S3_KEY="${S3_KEY}"
    function finish {
      # This is escaped to run on the remote instance
      EXIT_CODE=\$?
      echo "--- Script finished with exit code \$EXIT_CODE ---"
      # Best-effort upload of the reports, whatever the outcome was
      aws s3 cp /tmp/rspec.xml "s3://\${S3_BUCKET}/\${S3_KEY}/rspec.xml" 2>&1 || true
      aws s3 cp /tmp/coverage.xml "s3://\${S3_BUCKET}/\${S3_KEY}/coverage.xml" 2>&1 || true
    }
    trap finish EXIT
    echo "--- Starting sequence on EC2 instance ---"
    date
    # These are expanded immediately on the GitHub runner
    REPO_URL="${{ github.server_url }}/${{ github.repository }}.git"
    REPO_DIR="${{ github.event.repository.name }}"
    curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/tmp/uv-installation" sh
    source /tmp/uv-installation/env
    # --- Cloning Repository ---
    git clone "\$REPO_URL" "\$REPO_DIR"
    cd "\$REPO_DIR"
    # --- Checkout correct commit ---
    if [ "${IS_FORK}" == "true" ]; then
      echo "Pull request from a forked repository. Fetching and checking out PR head."
      git fetch origin pull/${PR_NUMBER}/head
      git checkout FETCH_HEAD
    else
      echo "Pull request from the same repository. Checking out the branch directly."
      # The branch name was shell-escaped on the runner, so it is inserted unquoted
      git checkout ${SAFE_BRANCH}
    fi
    echo "Running Newton test suite..."
    uv run --extra dev --extra torch-cu12 -m newton.tests --junit-report-xml /tmp/rspec.xml --coverage --coverage-xml /tmp/coverage.xml --serial-fallback --failfast 2>&1
    EOF

    # Prepare script to be passed as SSM parameters
    jq -n --arg script_body "$(cat remote_script.sh)" \
      --arg timeout "$AWS_ROLE_DURATION" \
      '{ "commands": [$script_body], "executionTimeout": [$timeout] }' > ssm_params.json

    # Echo script for debugging
    cat remote_script.sh
# Sends the generated script to the instance through SSM, polls until the
# command finishes, then fetches the reports and the combined command log.
# INSTANCE_ID, S3_KEY and SSM_LOG_PREFIX come from the job environment
# (written to GITHUB_ENV by earlier steps).
- name: Run script on instance
  env:
    S3_BUCKET: ${{ env.AWS_S3_BUCKET }}
  run: |
    echo "--- Sending command to instance $INSTANCE_ID via SSM ---"
    COMMAND_ID=$(aws ssm send-command \
      --instance-ids "$INSTANCE_ID" \
      --document-name "AWS-RunShellScript" \
      --comment "Running Newton GPU unit tests" \
      --parameters file://ssm_params.json \
      --output-s3-bucket-name "$S3_BUCKET" \
      --output-s3-key-prefix "$SSM_LOG_PREFIX" \
      --query "Command.CommandId" \
      --output text)
    echo "SSM Command ID: $COMMAND_ID"

    echo "--- Waiting for command to complete... ---"
    final_status="Success"
    TIMEOUT=6600
    INTERVAL=30
    # Number of consecutive failed status queries tolerated before giving up.
    MAX_POLL_ERRORS=5
    elapsed_time=0
    poll_errors=0
    while [ "$elapsed_time" -lt "$TIMEOUT" ]; do
      # A status query can fail transiently, e.g. when the invocation is not
      # visible yet right after send-command. Running the query as an `if`
      # condition keeps one such failure from aborting the whole step; it is
      # treated as "Pending" until MAX_POLL_ERRORS failures occur in a row.
      if STATUS=$(aws ssm get-command-invocation \
        --command-id "$COMMAND_ID" \
        --instance-id "$INSTANCE_ID" \
        --query "Status" \
        --output text); then
        poll_errors=0
      else
        poll_errors=$((poll_errors + 1))
        if [ "$poll_errors" -ge "$MAX_POLL_ERRORS" ]; then
          final_status="PollingFailed"
          echo "Could not query the SSM command status ($poll_errors consecutive failures)."
          break
        fi
        STATUS="Pending"
      fi

      if [[ "$STATUS" == "Success" ]]; then
        break
      elif [[ "$STATUS" == "InProgress" || "$STATUS" == "Pending" ]]; then
        :
      else
        final_status="$STATUS"
        echo "SSM command failed or returned unexpected status: $STATUS"
        break
      fi
      sleep $INTERVAL
      elapsed_time=$((elapsed_time + INTERVAL))
      echo "Current status: $STATUS. Waited $elapsed_time seconds..."
    done
    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
      final_status="Timeout"
    fi

    echo "--- Downloading artifacts from S3 ---"
    # Best effort: a failed download must not stop the step before the command
    # log has been fetched and printed. Missing report files are detected by
    # the final status check below.
    if ! aws s3 cp "s3://${S3_BUCKET}/${S3_KEY}/" . --recursive; then
      echo "⚠️ Could not download test reports from S3; continuing to fetch the command log."
    fi

    # --- Get the exact S3 output URL from the command invocation ---
    S3_STDOUT_URL=$(aws ssm get-command-invocation --command-id "$COMMAND_ID" --instance-id "$INSTANCE_ID" --query "StandardOutputUrl" --output text) || S3_STDOUT_URL=""
    if [ -z "$S3_STDOUT_URL" ]; then
      echo "❌ Could not retrieve S3 output URL. Cannot fetch logs."
      exit 1
    fi

    # Convert the https:// URL to an s3:// URI
    BUCKET_AND_KEY=$(echo "$S3_STDOUT_URL" | sed 's|https://s3\.[^.]*\.amazonaws\.com/||')
    S3_URI="s3://${BUCKET_AND_KEY}"
    LOCAL_LOG_FILE="ssm_output.log"
    echo "Downloading combined log from ${S3_URI}"
    if ! aws s3 cp "${S3_URI}" "${LOCAL_LOG_FILE}"; then
      echo "❌ Failed to download logs from S3."
      exit 1
    fi
    echo "--- Downloaded Log Output ---"
    cat "${LOCAL_LOG_FILE}"

    # --- Final status check ---
    # A successful command must also have produced both report files.
    if [[ "$final_status" == "Success" ]]; then
      if [ ! -f "rspec.xml" ]; then
        final_status="MissingRspecFile"
      elif [ ! -f "coverage.xml" ]; then
        final_status="MissingCoverageFile"
      fi
    fi
    if [[ "$final_status" != "Success" ]]; then
      echo "❌ Workflow failed with status: $final_status"
      exit 1
    else
      echo "✅ Workflow completed successfully."
    fi
# Summarize rspec.xml, showing failed tests only. Runs unless the workflow
# was cancelled, so a failed test run is still reported.
- name: Test Summary
  if: ${{ !cancelled() }}
  uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86
  with:
    show: 'fail'
    paths: 'rspec.xml'
# Send ./coverage.xml to Codecov under the "unittests" flag, passing
# AWS_INSTANCE_TYPE via env_vars. Skipped only when the workflow was
# cancelled.
- name: Upload coverage reports to Codecov
  if: ${{ !cancelled() }}
  uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
  with:
    token: ${{ secrets.CODECOV_TOKEN }}
    files: ./coverage.xml
    flags: unittests
    env_vars: AWS_INSTANCE_TYPE
# Keep the reports and the SSM log as a workflow artifact. Files that were
# never produced are ignored instead of failing the step.
- name: Save test artifacts
  if: ${{ !cancelled() }}
  uses: actions/upload-artifact@v4
  with:
    name: test-artifacts
    if-no-files-found: ignore
    path: |
      rspec.xml
      coverage.xml
      ssm_output.log
# Cleanup is a job of its own so that it still happens when the runner of the
# main job fails unexpectedly; otherwise the EC2 instance would be orphaned.
cleanup:
  name: Cleanup EC2 Instance
  needs: aws-unit-tests-pr
  # always(): run whatever the outcome of the test job was, but only in the
  # newton-physics/newton repository.
  if: always() && github.repository == 'newton-physics/newton'
  runs-on: ubuntu-latest
  permissions:
    contents: read
    id-token: write
  steps:
# Fetch instance_id.txt published by the test job. The artifact does not
# exist when the test job was skipped or failed before an instance was
# launched. That must not fail the cleanup job: the terminate step checks for
# the file itself and exits cleanly when there is nothing to terminate.
- name: Download instance ID artifact
  continue-on-error: true
  uses: actions/download-artifact@v4
  with:
    name: unittest-instance-id-artifact
    path: .
# Assume the workflow's IAM role for the terminate call below. No explicit
# session duration is set here.
- name: Configure AWS Credentials
  uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df
  with:
    role-to-assume: ${{ env.AWS_ROLE_ARN }}
    aws-region: ${{ env.AWS_REGION }}
# Terminates the instance whose ID is stored in instance_id.txt. Exits
# successfully without doing anything when the file is missing or empty.
- name: Read instance ID and terminate EC2 Instance
  run: |
    # Guard: no ID file in the working directory.
    [ -f instance_id.txt ] || {
      echo "Instance ID file not found. Nothing to terminate."
      exit 0
    }

    # Guard: the file exists but holds no ID.
    target_id=$(< instance_id.txt)
    [ -n "$target_id" ] || {
      echo "Instance ID is empty. Nothing to terminate."
      exit 0
    }

    echo "Terminating instance: $target_id"
    aws ec2 terminate-instances --instance-ids $target_id