This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow: create an 8x H100 SXM pod on RunPod, run the repository's job on it
# over SSH, collect the logs as an artifact, and always delete the pod afterwards.
name: runpod nanogpt (8x H100 SXM)

# Triggered by a push to any branch.
on:
  push:
    branches:
      - "**"

jobs:
  run:
    runs-on: ubuntu-latest
    timeout-minutes: 360
    env:
      RUNPOD_API_BASE: https://rest.runpod.io/v1

      # Secure Cloud, 8x H100 SXM. RunPod's GPU ID for H100 SXM is
      # "NVIDIA H100 80GB HBM3".
      RUNPOD_CLOUD_TYPE: SECURE
      RUNPOD_GPU_TYPE_ID: "NVIDIA H100 80GB HBM3"
      RUNPOD_GPU_COUNT: 8

      # A RunPod-published PyTorch image that already works well on RunPod.
      RUNPOD_IMAGE: "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"

      # Storage sizing and the volume mount point (adjust as needed).
      RUNPOD_CONTAINER_DISK_GB: 200
      RUNPOD_VOLUME_GB: 500
      RUNPOD_VOLUME_MOUNT_PATH: /workspace
    steps:
# Later steps use jq for the RunPod API JSON and ssh/scp/ssh-keygen from
# openssh-client to reach the pod.
- name: Install local deps
  run: |
    set -euo pipefail
    sudo apt-get update
    sudo apt-get install --yes jq openssh-client
# Materialize the private key from the RUNPOD_SSH_KEY secret and derive the
# matching public key, which the pod-creation step sends to RunPod.
- name: Prepare SSH key (private -> file, derive public key)
  env:
    # Pass the secret through the environment rather than expanding it into
    # the script text: a quote, `$` or backtick inside the secret can then
    # never be interpreted by the shell.
    RUNPOD_SSH_KEY: ${{ secrets.RUNPOD_SSH_KEY }}
  run: |
    set -euo pipefail
    : "${RUNPOD_SSH_KEY:?missing RUNPOD_SSH_KEY secret}"

    # Create ~/.ssh and the key file without group/other permissions.
    umask 077
    mkdir -p ~/.ssh

    # Strip CRs (CRLF-pasted keys) and always end the file with a newline;
    # OpenSSH can reject a private key whose last line is unterminated.
    printf '%s\n' "$RUNPOD_SSH_KEY" | tr -d '\r' > ~/.ssh/runpod_key
    chmod 600 ~/.ssh/runpod_key

    # Derive the public half so it always matches the private key in use.
    ssh-keygen -y -f ~/.ssh/runpod_key > ~/.ssh/runpod_key.pub
    chmod 644 ~/.ssh/runpod_key.pub
# Create the pod through the RunPod REST API and export its id as POD_ID for
# all later steps (via GITHUB_ENV).
- name: Create RunPod pod (Secure, 8x H100 SXM)
  id: create_pod
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    set -euo pipefail
    : "${RUNPOD_API_KEY:?missing RUNPOD_API_KEY}"

    pod_name="gh-${GITHUB_REPOSITORY##*/}-${GITHUB_REF_NAME}-${GITHUB_RUN_ID}"
    pubkey="$(cat ~/.ssh/runpod_key.pub)"

    # Build the request body with jq so every value is JSON-escaped.
    # cloudType now honours RUNPOD_CLOUD_TYPE (defaulting to SECURE) instead
    # of being hard-coded. The SSH_PUBLIC_KEY env entry hands the pod the
    # public key matching RUNPOD_SSH_KEY, to avoid relying on account-level
    # SSH keys.
    # NOTE(review): confirm the image's startup script reads SSH_PUBLIC_KEY
    # (and not only a differently named variable such as PUBLIC_KEY).
    payload="$(jq -n \
      --arg name "$pod_name" \
      --arg image "$RUNPOD_IMAGE" \
      --arg gpuType "$RUNPOD_GPU_TYPE_ID" \
      --arg mountPath "$RUNPOD_VOLUME_MOUNT_PATH" \
      --arg pubkey "$pubkey" \
      --arg cloudType "${RUNPOD_CLOUD_TYPE:-SECURE}" \
      --argjson gpuCount "$RUNPOD_GPU_COUNT" \
      --argjson containerDisk "$RUNPOD_CONTAINER_DISK_GB" \
      --argjson volumeGb "$RUNPOD_VOLUME_GB" \
      '{
        cloudType: $cloudType,
        computeType: "GPU",
        gpuCount: $gpuCount,
        gpuTypeIds: [$gpuType],
        imageName: $image,
        name: $name,
        interruptible: false,
        supportPublicIp: true,
        ports: ["22/tcp"],
        containerDiskInGb: $containerDisk,
        volumeInGb: $volumeGb,
        volumeMountPath: $mountPath,
        env: {
          SSH_PUBLIC_KEY: $pubkey
        }
      }')"

    # --fail-with-body keeps the API's error payload on HTTP errors, and the
    # `if !` wrapper stops `set -e` from exiting before it is printed.
    if ! resp="$(curl -sS --fail-with-body -X POST "$RUNPOD_API_BASE/pods" \
      -H "Authorization: Bearer $RUNPOD_API_KEY" \
      -H "Content-Type: application/json" \
      -d "$payload")"; then
      echo "Create pod failed, response:" >&2
      echo "$resp" >&2
      exit 1
    fi

    # A successful status without an id is still treated as a failure.
    pod_id="$(jq -r '.id // empty' <<<"$resp" 2>/dev/null || true)"
    if [[ -z "$pod_id" ]]; then
      echo "Create pod failed, response:" >&2
      echo "$resp" >&2
      exit 1
    fi
    echo "POD_ID=$pod_id" >> "$GITHUB_ENV"
# Poll the pod until it reports RUNNING with a public IP and a mapping for
# port 22, then export POD_IP / POD_SSH_PORT for later steps.
- name: Wait for RUNNING + public IP + SSH port mapping
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    set -euo pipefail
    : "${POD_ID:?missing POD_ID}"

    max_attempts=180
    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
      # A single failed API call must not abort the wait: treat it like
      # "not ready yet" and try again on the next iteration.
      if pod="$(curl -sSf -H "Authorization: Bearer $RUNPOD_API_KEY" \
        "$RUNPOD_API_BASE/pods/$POD_ID")"; then
        status="$(jq -r '.desiredStatus // empty' <<<"$pod" || true)"
        ip="$(jq -r '.publicIp // empty' <<<"$pod" || true)"
        port="$(jq -r '.portMappings["22"] // empty' <<<"$pod" || true)"
        if [[ "$status" == "RUNNING" && -n "$ip" && -n "$port" ]]; then
          echo "POD_IP=$ip" >> "$GITHUB_ENV"
          echo "POD_SSH_PORT=$port" >> "$GITHUB_ENV"
          exit 0
        fi
        echo "waiting: status=$status ip=$ip port=$port"
      else
        echo "waiting: pod query failed (attempt $attempt/$max_attempts), retrying"
      fi
      sleep 10
    done

    echo "Timed out waiting for pod to become reachable over exposed TCP (22/tcp)." >&2
    exit 1
# The port mapping can exist before sshd accepts logins, so probe with a real
# SSH command until one succeeds (up to 60 tries, 10 s apart).
- name: Wait for SSH handshake
  run: |
    set -euo pipefail
    : "${POD_IP:?missing POD_IP}"
    : "${POD_SSH_PORT:?missing POD_SSH_PORT}"

    # Non-interactive probe; host keys are neither checked nor recorded.
    probe_opts=(
      -i ~/.ssh/runpod_key
      -o BatchMode=yes
      -o StrictHostKeyChecking=no
      -o UserKnownHostsFile=/dev/null
      -o ConnectTimeout=10
      -o ServerAliveInterval=30
      -o ServerAliveCountMax=6
      -p "$POD_SSH_PORT"
    )

    tries=0
    while (( tries < 60 )); do
      tries=$((tries + 1))
      if ssh "${probe_opts[@]}" root@"$POD_IP" 'echo ssh-ready' >/dev/null 2>&1; then
        exit 0
      fi
      echo "waiting for ssh..."
      sleep 10
    done

    echo "SSH never became ready." >&2
    exit 1
# Main step: upload a job script to the pod, run it detached, poll for its
# exit code, fetch the logs, and fail the job if the remote job failed.
- name: Run workload on pod (fails job on non-zero)
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    set -euo pipefail

    # Everything below needs the connection details exported by the earlier
    # steps (through GITHUB_ENV) plus the API key from this step's env.
    : "${POD_IP:?missing POD_IP}"
    : "${POD_SSH_PORT:?missing POD_SSH_PORT}"
    : "${POD_ID:?missing POD_ID}"
    : "${RUNPOD_API_KEY:?missing RUNPOD_API_KEY}"
#######################################
# Re-query the RunPod API for the pod's public IP and SSH port mapping and,
# when both are present, update POD_IP / POD_SSH_PORT in this shell.
# Globals:   RUNPOD_API_KEY, RUNPOD_API_BASE, POD_ID (read)
#            POD_IP, POD_SSH_PORT (written and exported on success)
# Arguments: none
# Returns:   0 when the endpoint was refreshed; 1 when the API call or JSON
#            parsing failed, or the response lacked an IP or port (the
#            previous POD_IP / POD_SSH_PORT are then left untouched).
#######################################
refresh_pod_endpoint() {
  # Keep scratch values local so they cannot clobber same-named variables
  # used by the calling script.
  local pod ip port

  pod="$(curl -sSf -H "Authorization: Bearer $RUNPOD_API_KEY" \
    "$RUNPOD_API_BASE/pods/$POD_ID")" || return 1
  ip="$(jq -r '.publicIp // empty' <<<"$pod")" || return 1
  port="$(jq -r '.portMappings["22"] // empty' <<<"$pod")" || return 1

  if [[ -z "$ip" || -z "$port" ]]; then
    return 1
  fi

  POD_IP="$ip"
  POD_SSH_PORT="$port"
  export POD_IP POD_SSH_PORT
}
#######################################
# Run a command on the pod as root over SSH.
# Globals:   POD_IP, POD_SSH_PORT (read at call time, so a refreshed
#            endpoint is picked up automatically)
# Arguments: the remote command, passed through to ssh unchanged
# Returns:   ssh's exit status
#######################################
ssh_pod() {
  # Non-interactive, quiet, with keep-alives; host keys are neither checked
  # nor stored.
  local -a ssh_opts=(
    -i ~/.ssh/runpod_key
    -o BatchMode=yes
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o ConnectTimeout=15
    -o LogLevel=ERROR
    -o ServerAliveInterval=30
    -o ServerAliveCountMax=6
    -p "$POD_SSH_PORT"
  )
  ssh "${ssh_opts[@]}" root@"$POD_IP" "$@"
}
#######################################
# scp wrapper carrying the pod's key, port and non-interactive options.
# Globals:   POD_SSH_PORT (read)
# Arguments: scp source(s) and destination, passed through unchanged; the
#            caller spells out the root@host: prefix on the remote side
# Returns:   scp's exit status
#######################################
scp_to_pod() {
  local -a scp_opts=(
    -i ~/.ssh/runpod_key
    -o BatchMode=yes
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o ConnectTimeout=15
    -o LogLevel=ERROR
    -P "$POD_SSH_PORT"
  )
  scp "${scp_opts[@]}" "$@"
}
# The workload lives in a script that is copied to the pod and started
# detached, so it keeps running through transient SSH disconnects. The quoted
# heredoc delimiter means nothing in the script is expanded on the runner.
cat > pod_job.sh <<'POD_EOF'
#!/usr/bin/env bash
set -euo pipefail

# Paths the runner side polls for and downloads:
#   RC_FILE - the job's exit code (its existence marks completion)
#   LOG_ZIP - bundle of the rc file, the job log and the training logs
#   JOB_LOG - the job's combined stdout/stderr
RC_FILE=/workspace/modded-nanogpt.rc
LOG_ZIP=/workspace/modded-nanogpt-logs.zip
JOB_LOG=/workspace/modded-nanogpt-job.log
#######################################
# EXIT handler: record the job's exit code and bundle the logs.
#
# The runner treats the existence of RC_FILE as "job finished" and then
# downloads LOG_ZIP. So the zip is built under a temporary name and moved
# into place first, and RC_FILE is published last. A poll can therefore never
# observe the marker while the zip is missing or half-written.
#
# Globals:   RC_FILE, LOG_ZIP, JOB_LOG (read). The working directory is the
#            directory containing RC_FILE; JOB_LOG and modded-nanogpt/logs
#            are expected beneath it.
# Arguments: none ($? on entry is the status being recorded)
# Outputs:   RC_FILE; LOG_ZIP containing the rc file, the job log and
#            modded-nanogpt/logs (whichever of these exist)
# Returns:   does not return; exits with the recorded status
#######################################
_finalize() {
  local rc=$?
  # Cleanup is best effort: nothing below may change the exit status.
  set +e

  local workdir stage_dir stage_rc tmp_zip
  local -a zip_items=()
  workdir="$(dirname "$RC_FILE")"
  stage_dir="$workdir/.finalize-stage"
  stage_rc="$stage_dir/$(basename "$RC_FILE")"
  tmp_zip="$stage_dir/$(basename "$LOG_ZIP")"

  # Stage the rc file under its final basename so it can be added to the zip
  # before the real marker exists.
  rm -rf "${stage_dir:?}"
  mkdir -p "$stage_dir" 2>/dev/null
  printf '%s' "$rc" > "$stage_rc" 2>/dev/null

  cd "$workdir" || true
  if [ -f "$JOB_LOG" ]; then
    zip_items+=("$(basename "$JOB_LOG")")
  fi
  if [ -d "$workdir/modded-nanogpt/logs" ]; then
    zip_items+=("modded-nanogpt/logs")
  fi

  rm -f "$LOG_ZIP"
  if [ -f "$stage_rc" ]; then
    # -j stores the file under its basename, not the staging path.
    zip -j "$tmp_zip" "$stage_rc" >/dev/null 2>&1 || true
  fi
  if [ "${#zip_items[@]}" -gt 0 ]; then
    zip -r "$tmp_zip" "${zip_items[@]}" >/dev/null 2>&1 || true
  fi
  if [ -f "$tmp_zip" ]; then
    mv -f "$tmp_zip" "$LOG_ZIP" 2>/dev/null || true
  fi

  # Publish the completion marker last; fall back to a direct write if the
  # staged copy could not be created or moved.
  if ! { [ -f "$stage_rc" ] && mv -f "$stage_rc" "$RC_FILE" 2>/dev/null; }; then
    printf '%s' "$rc" > "$RC_FILE" 2>/dev/null || true
  fi
  rm -rf "${stage_dir:?}"

  exit "$rc"
}
# From here on every exit path (normal end, a failing command under
# `set -e`, an explicit exit) goes through _finalize.
trap _finalize EXIT

cd /workspace
export DEBIAN_FRONTEND=noninteractive PIP_DISABLE_PIP_VERSION_CHECK=1 GIT_LFS_SKIP_SMUDGE=1

echo "[job] installing apt deps..."
apt-get update
apt-get install --yes git zip

echo "[job] python:"
python3 --version

echo "[job] cloning repo..."
# These come from the runner, which sets them on the command line that
# launches this script.
: "${GITHUB_SERVER_URL:?missing GITHUB_SERVER_URL}"
: "${GITHUB_REPOSITORY:?missing GITHUB_REPOSITORY}"
: "${GITHUB_REF_NAME:?missing GITHUB_REF_NAME}"
: "${GITHUB_SHA:?missing GITHUB_SHA}"
REPO_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git"

# Always start from a fresh shallow clone of the pushed branch.
rm -rf /workspace/modded-nanogpt
git clone --depth 1 --branch "${GITHUB_REF_NAME}" "$REPO_URL" /workspace/modded-nanogpt
cd /workspace/modded-nanogpt

# If the branch tip is no longer the commit that triggered the run, fetch and
# check out exactly that commit.
if [[ "$(git rev-parse HEAD)" != "$GITHUB_SHA" ]]; then
  git fetch --depth 1 origin "$GITHUB_SHA"
  git checkout "$GITHUB_SHA"
fi

echo "[job] installing python deps..."
pip install -r requirements.txt
# Torch 2.10.x nightlies: per PEP 440, 2.10 dev builds sort below 2.10 final,
# so both ends of the range are bounded with .dev0.
pip install --pre 'torch>=2.10.0.dev0,<2.11.0.dev0' --index-url https://download.pytorch.org/whl/nightly/cu126 --upgrade

echo "[job] torch:"
python3 - <<'PY'
import torch
print(torch.__version__)
print("cuda:", torch.version.cuda)
print("is_available:", torch.cuda.is_available())
PY

echo "[job] preparing data..."
python3 data/cached_fineweb10B.py 9

echo "[job] starting training..."
./run.sh
POD_EOF
chmod +x pod_job.sh
# Upload the job script to the pod.
scp_to_pod ./pod_job.sh root@"$POD_IP":/workspace/pod_job.sh

# Build the environment prefix for the remote command. Each value is
# shell-quoted with %q, so characters such as a single quote in a branch name
# cannot break out of the remote command line.
remote_env=""
for var_name in GITHUB_SERVER_URL GITHUB_REPOSITORY GITHUB_REF_NAME GITHUB_SHA; do
  printf -v quoted_value '%q' "${!var_name}"
  remote_env+="${var_name}=${quoted_value} "
done

# Start the job detached (stdin, stdout and stderr all redirected so the SSH
# session has nothing left to wait for), record its PID, and return.
ssh_pod "${remote_env}nohup /workspace/pod_job.sh > /workspace/modded-nanogpt-job.log 2>&1 < /dev/null & echo \$! > /workspace/modded-nanogpt.pid"

# Poll once a minute for the RC file that marks completion. A failed SSH
# probe counts as "not finished yet"; every second minute the endpoint is
# re-read from the API in case the IP or port mapping changed.
# NOTE(review): 360 one-minute polls equals the job's timeout-minutes, so the
# job-level timeout will normally fire before this loop runs out.
for ((minute = 1; minute <= 360; minute++)); do
  if ssh_pod 'test -f /workspace/modded-nanogpt.rc' >/dev/null 2>&1; then
    break
  fi
  if (( minute % 5 == 0 )); then
    echo "still running... (minute=$minute)"
    ssh_pod 'tail -n 30 /workspace/modded-nanogpt-job.log' || true
  fi
  sleep 60
  if (( minute % 2 == 0 )); then
    refresh_pod_endpoint || true
  fi
done

# Distinguish "finished" from "gave up waiting".
if ! ssh_pod 'test -f /workspace/modded-nanogpt.rc' >/dev/null 2>&1; then
  echo "Timed out waiting for remote job to finish." >&2
  ssh_pod 'tail -n 200 /workspace/modded-nanogpt-job.log' || true
  exit 1
fi

# The RC file holds the bare exit code; strip any line endings.
job_rc="$(ssh_pod 'cat /workspace/modded-nanogpt.rc' | tr -d '\r\n')"
echo "Remote job rc=$job_rc"
#######################################
# Copy files from the pod to the runner.
# scp decides the direction from its arguments, so this uses exactly the
# options of scp_to_pod (defined earlier in this script); delegating keeps
# the two from drifting apart.
# Arguments: scp source(s) and destination, passed through unchanged
# Returns:   scp's exit status
#######################################
scp_from_pod() {
  scp_to_pod "$@"
}
#######################################
# Download one file from the pod, retrying up to 8 times and refreshing the
# pod endpoint between attempts.
# Globals:   POD_IP (read; may be updated by refresh_pod_endpoint)
# Arguments: $1 - path on the pod
#            $2 - destination path on the runner
#            $3 - label for log messages (defaults to $1)
# Outputs:   "downloaded: <label>" on stdout, or
#            "failed to download: <label>" on stderr
# Returns:   0 on success, 1 when every attempt failed
#######################################
download_from_pod() {
  local remote_path="$1"
  local local_path="$2"
  local label="${3:-$remote_path}"
  local attempt

  for ((attempt = 1; attempt <= 8; attempt++)); do
    # Testing scp inside `if` tolerates failure without toggling `set -e`,
    # so the caller's shell options are left exactly as they were.
    if scp_from_pod root@"$POD_IP":"$remote_path" "$local_path" >/dev/null 2>&1; then
      echo "downloaded: $label"
      return 0
    fi
    refresh_pod_endpoint || true
    sleep 5
  done

  echo "failed to download: $label" >&2
  return 1
}
# Fetch the debugging files for the artifact upload. Each download is best
# effort: a failure here must not mask the job's own result.
download_from_pod /workspace/modded-nanogpt-job.log ./modded-nanogpt-job.log "job log" || true
download_from_pod /workspace/modded-nanogpt.rc ./modded-nanogpt.rc "job rc" || true
download_from_pod /workspace/modded-nanogpt-logs.zip ./modded-nanogpt-logs.zip "logs zip" || true

# A clean remote exit ends the step successfully.
if [[ "$job_rc" == "0" ]]; then
  exit 0
fi

# Otherwise show the end of the job log (the local copy when it was
# downloaded, else straight from the pod) and fail the step.
if [[ -f ./modded-nanogpt-job.log ]]; then
  tail -n 200 ./modded-nanogpt-job.log || true
else
  ssh_pod 'tail -n 200 /workspace/modded-nanogpt-job.log' || true
fi
exit 1
# Runs even when earlier steps failed, so that logs are still collected.
- name: Download logs zip (best effort)
  if: always()
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    set -euo pipefail

    # Best effort means skipping, not failing, when there is nothing to talk
    # to: with `always()` this step also runs when pod creation never
    # succeeded and POD_ID was never exported.
    if [[ -z "${POD_ID:-}" ]]; then
      echo "No POD_ID, skipping download."
      exit 0
    fi
    if [[ -z "${RUNPOD_API_KEY:-}" ]]; then
      echo "No RUNPOD_API_KEY, skipping download."
      exit 0
    fi
#######################################
# Re-query the RunPod API for the pod's endpoint. When both a public IP and
# an SSH port mapping are present, update POD_IP / POD_SSH_PORT in this shell
# and append them to GITHUB_ENV for later steps. The pod status is logged
# whenever the API reports one.
# Globals:   RUNPOD_API_KEY, RUNPOD_API_BASE, POD_ID, GITHUB_ENV (read)
#            POD_IP, POD_SSH_PORT (written when the endpoint is complete)
# Arguments: none
# Outputs:   "pod status=... ip=... port=..." on stdout
# Returns:   1 when the API call or JSON parsing failed, otherwise 0
#######################################
refresh_pod_endpoint() {
  # Scratch values are local so they cannot clobber the caller's variables.
  local pod ip port status

  pod="$(curl -sSf -H "Authorization: Bearer $RUNPOD_API_KEY" \
    "$RUNPOD_API_BASE/pods/$POD_ID")" || return 1
  ip="$(jq -r '.publicIp // empty' <<<"$pod")" || return 1
  port="$(jq -r '.portMappings["22"] // empty' <<<"$pod")" || return 1
  status="$(jq -r '.desiredStatus // empty' <<<"$pod")" || return 1

  if [[ -n "$ip" && -n "$port" ]]; then
    POD_IP="$ip"
    POD_SSH_PORT="$port"
    echo "POD_IP=$POD_IP" >> "$GITHUB_ENV"
    echo "POD_SSH_PORT=$POD_SSH_PORT" >> "$GITHUB_ENV"
  fi

  if [[ -n "$status" ]]; then
    echo "pod status=$status ip=$ip port=$port"
  fi
}
# Use the freshest endpoint the API knows about; fall back to whatever the
# earlier steps exported.
refresh_pod_endpoint || true
if [[ -z "${POD_IP:-}" || -z "${POD_SSH_PORT:-}" ]]; then
  echo "No POD_IP/POD_SSH_PORT, skipping download."
  exit 0
fi
if [[ ! -f ~/.ssh/runpod_key ]]; then
  echo "No SSH key at ~/.ssh/runpod_key, skipping download."
  exit 0
fi

# One shared option list for every ssh/scp call in this step. The port flag
# differs between the two tools (-p vs -P), so each helper adds its own.
conn_opts=(
  -i ~/.ssh/runpod_key
  -o BatchMode=yes
  -o StrictHostKeyChecking=no
  -o UserKnownHostsFile=/dev/null
  -o ConnectTimeout=15
  -o LogLevel=ERROR
)
remote_zip=/workspace/modded-nanogpt-logs.zip

# Succeeds when the logs zip exists on the pod.
zip_on_pod() {
  ssh "${conn_opts[@]}" -p "$POD_SSH_PORT" root@"$POD_IP" "test -f $remote_zip"
}

# Copies pod path $1 to runner path $2.
fetch_from_pod() {
  scp "${conn_opts[@]}" -P "$POD_SSH_PORT" root@"$POD_IP":"$1" "$2"
}

# Give the pod up to 10 probes to produce the zip, refreshing the endpoint
# between probes in case the IP or port mapping changed.
zip_found=0
for ((attempt = 1; attempt <= 10; attempt++)); do
  if zip_on_pod; then
    zip_found=1
    break
  fi
  refresh_pod_endpoint || true
  if (( attempt < 10 )); then
    sleep 10
  fi
done

if (( zip_found )); then
  # Best effort: a failed copy is reported but does not fail the step.
  fetch_from_pod "$remote_zip" ./modded-nanogpt-logs.zip \
    || echo "Failed to download $remote_zip (continuing)." >&2
else
  echo "No /workspace/modded-nanogpt-logs.zip found on pod."
fi

# Also try to download the job log + rc for debugging.
fetch_from_pod /workspace/modded-nanogpt-job.log ./modded-nanogpt-job.log >/dev/null 2>&1 || true
fetch_from_pod /workspace/modded-nanogpt.rc ./modded-nanogpt.rc >/dev/null 2>&1 || true
# upload-artifact rejects artifact names containing characters such as '/'
# or ':'. The workflow runs for every branch ("**"), including names like
# "feature/foo", so build the name from a sanitized copy of the ref name.
# Branch names made only of letters, digits, '.', '_' and '-' produce the
# same artifact name as before.
- name: Compute artifact name
  if: always()
  run: |
    set -euo pipefail
    safe_ref="${GITHUB_REF_NAME//[^A-Za-z0-9._-]/-}"
    echo "ARTIFACT_NAME=modded-nanogpt-logs-${safe_ref}-${GITHUB_RUN_ID}" >> "$GITHUB_ENV"
# Publish whatever log files the earlier steps managed to download; missing
# files only produce a warning.
- name: Upload logs artifact
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: ${{ env.ARTIFACT_NAME }}
    path: |
      modded-nanogpt-logs.zip
      modded-nanogpt-job.log
      modded-nanogpt.rc
    if-no-files-found: warn
# Always try to delete the pod, whatever happened earlier. The step never
# fails the job, but a pod that could not be deleted is raised as an error
# annotation so it is not buried in the step log.
- name: Terminate pod (always)
  if: always()
  continue-on-error: true
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    set -euo pipefail

    # POD_ID is only exported once pod creation succeeded.
    if [[ -z "${POD_ID:-}" ]]; then
      echo "No POD_ID, nothing to terminate."
      exit 0
    fi

    max_attempts=10
    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
      if curl -sSf -X DELETE "$RUNPOD_API_BASE/pods/$POD_ID" \
        -H "Authorization: Bearer $RUNPOD_API_KEY" >/dev/null; then
        echo "Pod $POD_ID deleted."
        exit 0
      fi
      echo "Delete failed, retrying..."
      sleep 5
    done

    # The exit status stays 0 as before; the ::error:: workflow command adds
    # an annotation to the run summary.
    echo "::error title=RunPod pod not terminated::Failed to delete pod $POD_ID after retries."
    exit 0