# Skip to content
#
# w
#
# w #11

name: runpod nanogpt (8x H100 SXM)

# Trigger: every push to any branch.
on:
  push:
    branches:
      - "**"

# Least privilege: no step in this workflow passes GITHUB_TOKEN anywhere,
# so the default token is limited to read-only repository contents.
permissions:
  contents: read

jobs:
  run:
    runs-on: ubuntu-latest
    # NOTE(review): the workload step polls for up to 360 one-minute
    # iterations, i.e. as long as this whole-job limit. Confirm that the
    # cleanup steps (if: always()) still run when the job-level timeout
    # fires first, otherwise the pod could be left running.
    timeout-minutes: 360
    env:
      RUNPOD_API_BASE: https://rest.runpod.io/v1
      # Secure Cloud, 8x H100 SXM (RunPod GPU ID for H100 SXM is "NVIDIA H100 80GB HBM3")
      RUNPOD_CLOUD_TYPE: SECURE
      RUNPOD_GPU_TYPE_ID: "NVIDIA H100 80GB HBM3"
      # Numeric settings are quoted: environment values reach the steps as strings.
      RUNPOD_GPU_COUNT: "8"
      # Pick a RunPod PyTorch image that already works well on RunPod
      RUNPOD_IMAGE: "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"
      # Storage (adjust as needed)
      RUNPOD_CONTAINER_DISK_GB: "200"
      RUNPOD_VOLUME_GB: "500"
      RUNPOD_VOLUME_MOUNT_PATH: /workspace
    steps:
- name: Install local deps
  run: |
    set -euo pipefail
    # Runner-side tools used by later steps: jq for the RunPod API JSON,
    # openssh-client for ssh / scp / ssh-keygen.
    runner_packages=(jq openssh-client)
    sudo apt-get update
    sudo apt-get install -y "${runner_packages[@]}"
- name: Prepare SSH key (private -> file, derive public key)
  env:
    # Hand the secret over through the environment instead of splicing the
    # expression into the script text, so characters such as quotes, "$" or
    # backticks in the secret are never interpreted by the shell.
    RUNPOD_SSH_KEY: ${{ secrets.RUNPOD_SSH_KEY }}
  run: |
    set -euo pipefail
    : "${RUNPOD_SSH_KEY:?missing RUNPOD_SSH_KEY secret}"
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh
    # Write the private key with CRLF line endings normalised.
    # - umask 077 in a subshell: the file is created owner-only from the start.
    # - '%s\n': guarantees a final newline; a key pasted into the secret
    #   without one can otherwise be rejected by OpenSSH as malformed.
    (
      umask 077
      printf '%s\n' "$RUNPOD_SSH_KEY" | tr -d '\r' > ~/.ssh/runpod_key
    )
    chmod 600 ~/.ssh/runpod_key
    # Derive the matching public key; it is injected into the pod later.
    ssh-keygen -y -f ~/.ssh/runpod_key > ~/.ssh/runpod_key.pub
    chmod 644 ~/.ssh/runpod_key.pub
- name: Create RunPod pod (Secure, 8x H100 SXM)
  id: create_pod
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    set -euo pipefail
    : "${RUNPOD_API_KEY:?missing RUNPOD_API_KEY}"
    POD_NAME="gh-${GITHUB_REPOSITORY##*/}-${GITHUB_REF_NAME}-${GITHUB_RUN_ID}"
    PUBKEY="$(cat ~/.ssh/runpod_key.pub)"

    # Build the request body with jq so every value is JSON-escaped.
    # cloudType comes from RUNPOD_CLOUD_TYPE (job env) rather than a literal,
    # so the job-level setting is the single source of truth.
    # env.SSH_PUBLIC_KEY makes the pod use the public key that matches
    # RUNPOD_SSH_KEY, which avoids relying on account-level SSH keys.
    payload="$(jq -n \
      --arg name "$POD_NAME" \
      --arg image "$RUNPOD_IMAGE" \
      --arg cloudType "$RUNPOD_CLOUD_TYPE" \
      --arg gpuType "$RUNPOD_GPU_TYPE_ID" \
      --arg mountPath "$RUNPOD_VOLUME_MOUNT_PATH" \
      --arg pubkey "$PUBKEY" \
      --argjson gpuCount "$RUNPOD_GPU_COUNT" \
      --argjson containerDisk "$RUNPOD_CONTAINER_DISK_GB" \
      --argjson volumeGb "$RUNPOD_VOLUME_GB" \
      '{
        cloudType: $cloudType,
        computeType: "GPU",
        gpuCount: $gpuCount,
        gpuTypeIds: [$gpuType],
        imageName: $image,
        name: $name,
        interruptible: false,
        supportPublicIp: true,
        ports: ["22/tcp"],
        containerDiskInGb: $containerDisk,
        volumeInGb: $volumeGb,
        volumeMountPath: $mountPath,
        env: {
          SSH_PUBLIC_KEY: $pubkey
        }
      }')"

    # --fail-with-body: an HTTP error still makes curl exit non-zero, but the
    # API's error payload is kept so it can be printed. Testing the command in
    # an `if` keeps `set -e` from aborting before the diagnostic is shown.
    if ! resp="$(curl -sS --fail-with-body -X POST "$RUNPOD_API_BASE/pods" \
      -H "Authorization: Bearer $RUNPOD_API_KEY" \
      -H "Content-Type: application/json" \
      -d "$payload")"; then
      echo "Create pod failed, response:" >&2
      echo "$resp" >&2
      exit 1
    fi

    # A non-JSON or id-less response yields an empty pod_id instead of
    # killing the script inside the command substitution.
    pod_id="$(jq -r '.id // empty' <<<"$resp" 2>/dev/null || true)"
    if [[ -z "$pod_id" || "$pod_id" == "null" ]]; then
      echo "Create pod failed, response:" >&2
      echo "$resp" >&2
      exit 1
    fi

    # Later steps (including the always-run termination step) read POD_ID.
    echo "POD_ID=$pod_id" >> "$GITHUB_ENV"
- name: Wait for RUNNING + public IP + SSH port mapping
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    set -euo pipefail
    : "${POD_ID:?missing POD_ID}"
    # Poll the pod for up to 180 attempts, 10 seconds apart (about 30 minutes),
    # until it reports RUNNING and exposes a public IP plus a mapping for port 22.
    for i in $(seq 1 180); do
      # A transient API/network error must not abort the whole wait: report it
      # and spend one attempt. --max-time keeps a stalled request from hanging.
      if ! pod="$(curl -sSf --max-time 30 \
        -H "Authorization: Bearer $RUNPOD_API_KEY" \
        "$RUNPOD_API_BASE/pods/$POD_ID")"; then
        echo "waiting: pod query failed (attempt $i), retrying..." >&2
        sleep 10
        continue
      fi
      status="$(jq -r '.desiredStatus // empty' <<<"$pod")"
      ip="$(jq -r '.publicIp // empty' <<<"$pod")"
      port="$(jq -r '.portMappings["22"] // empty' <<<"$pod")"
      if [[ "$status" == "RUNNING" && -n "$ip" && -n "$port" ]]; then
        # Publish the endpoint for the following steps.
        echo "POD_IP=$ip" >> "$GITHUB_ENV"
        echo "POD_SSH_PORT=$port" >> "$GITHUB_ENV"
        exit 0
      fi
      echo "waiting: status=$status ip=$ip port=$port"
      sleep 10
    done
    echo "Timed out waiting for pod to become reachable over exposed TCP (22/tcp)." >&2
    exit 1
- name: Wait for SSH handshake
  run: |
    set -euo pipefail
    : "${POD_IP:?missing POD_IP}"
    : "${POD_SSH_PORT:?missing POD_SSH_PORT}"

    # Connection options for the probe. Host-key checking is disabled and
    # nothing is written to known_hosts.
    probe_opts=(
      -i "$HOME/.ssh/runpod_key"
      -o BatchMode=yes
      -o StrictHostKeyChecking=no
      -o UserKnownHostsFile=/dev/null
      -o ConnectTimeout=10
      -o ServerAliveInterval=30
      -o ServerAliveCountMax=6
      -p "$POD_SSH_PORT"
    )

    # Up to 60 probes, 10 seconds apart; succeed on the first login that works.
    remaining=60
    while (( remaining > 0 )); do
      if ssh "${probe_opts[@]}" "root@${POD_IP}" 'echo ssh-ready' >/dev/null 2>&1; then
        exit 0
      fi
      echo "waiting for ssh..."
      sleep 10
      remaining=$(( remaining - 1 ))
    done

    echo "SSH never became ready." >&2
    exit 1
- name: Run workload on pod (fails job on non-zero)
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    set -euo pipefail
    : "${POD_IP:?missing POD_IP}"
    : "${POD_SSH_PORT:?missing POD_SSH_PORT}"
    : "${POD_ID:?missing POD_ID}"
    : "${RUNPOD_API_KEY:?missing RUNPOD_API_KEY}"

    # Options shared by every ssh and scp call in this step.
    ssh_common_opts=(
      -i "$HOME/.ssh/runpod_key"
      -o BatchMode=yes
      -o StrictHostKeyChecking=no
      -o UserKnownHostsFile=/dev/null
      -o ConnectTimeout=15
      -o LogLevel=ERROR
    )

    # Re-read the pod's public IP / SSH port from the API and adopt them when
    # both are present. Returns non-zero if the API call fails; every caller
    # treats that as non-fatal.
    refresh_pod_endpoint() {
      local pod ip port
      pod="$(curl -sSf -H "Authorization: Bearer $RUNPOD_API_KEY" \
        "$RUNPOD_API_BASE/pods/$POD_ID")" || return 1
      ip="$(jq -r '.publicIp // empty' <<<"$pod")"
      port="$(jq -r '.portMappings["22"] // empty' <<<"$pod")"
      if [[ -n "$ip" && -n "$port" ]]; then
        POD_IP="$ip"
        POD_SSH_PORT="$port"
        export POD_IP POD_SSH_PORT
      fi
    }

    # Run a command on the pod as root (arguments are passed to ssh as-is).
    ssh_pod() {
      ssh "${ssh_common_opts[@]}" \
        -o ServerAliveInterval=30 \
        -o ServerAliveCountMax=6 \
        -p "$POD_SSH_PORT" \
        "root@${POD_IP}" "$@"
    }

    # Copy files to or from the pod; the caller supplies source and destination.
    scp_pod() {
      scp "${ssh_common_opts[@]}" -P "$POD_SSH_PORT" "$@"
    }

    # Build a remote job script so the workload survives transient SSH disconnects.
    # The heredoc delimiter is quoted, so nothing below is expanded on the runner.
    cat > pod_job.sh <<'POD_EOF'
    #!/usr/bin/env bash
    set -euo pipefail
    RC_FILE=/workspace/modded-nanogpt.rc
    LOG_ZIP=/workspace/modded-nanogpt-logs.zip
    JOB_LOG=/workspace/modded-nanogpt-job.log
    # EXIT trap: record the script's exit status in RC_FILE (the runner polls
    # for that file), then bundle the rc file, the job log and the training
    # logs directory (whichever exist) into LOG_ZIP.
    _finalize() {
      rc=$?
      set +e
      printf '%s' "$rc" > "$RC_FILE" 2>/dev/null || true
      cd /workspace || true
      zip_items=()
      if [ -f "$RC_FILE" ]; then
        zip_items+=("$(basename "$RC_FILE")")
      fi
      if [ -f "$JOB_LOG" ]; then
        zip_items+=("$(basename "$JOB_LOG")")
      fi
      if [ -d /workspace/modded-nanogpt/logs ]; then
        zip_items+=("modded-nanogpt/logs")
      fi
      rm -f "$LOG_ZIP"
      if [ "${#zip_items[@]}" -gt 0 ]; then
        zip -r "$LOG_ZIP" "${zip_items[@]}" >/dev/null 2>&1 || true
      fi
      exit "$rc"
    }
    trap _finalize EXIT
    cd /workspace
    export DEBIAN_FRONTEND=noninteractive
    export PIP_DISABLE_PIP_VERSION_CHECK=1
    export GIT_LFS_SKIP_SMUDGE=1
    echo "[job] installing apt deps..."
    apt-get update
    apt-get install -y git zip
    echo "[job] python:"
    python3 --version
    echo "[job] cloning repo..."
    : "${GITHUB_SERVER_URL:?missing GITHUB_SERVER_URL}"
    : "${GITHUB_REPOSITORY:?missing GITHUB_REPOSITORY}"
    : "${GITHUB_REF_NAME:?missing GITHUB_REF_NAME}"
    : "${GITHUB_SHA:?missing GITHUB_SHA}"
    REPO_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git"
    rm -rf /workspace/modded-nanogpt
    git clone --depth 1 --branch "${GITHUB_REF_NAME}" "$REPO_URL" /workspace/modded-nanogpt
    cd /workspace/modded-nanogpt
    # The branch tip may have moved since the workflow was triggered; pin the
    # checkout to the exact commit that triggered this run.
    if [[ "$(git rev-parse HEAD)" != "$GITHUB_SHA" ]]; then
      git fetch --depth 1 origin "$GITHUB_SHA"
      git checkout "$GITHUB_SHA"
    fi
    echo "[job] installing python deps..."
    pip install -r requirements.txt
    # Use Torch 2.10.x nightlies (2.10 dev builds are < 2.10 final per PEP 440, so we bound with .dev0).
    pip install --pre 'torch>=2.10.0.dev0,<2.11.0.dev0' --index-url https://download.pytorch.org/whl/nightly/cu126 --upgrade
    echo "[job] torch:"
    python3 - <<'PY'
    import torch
    print(torch.__version__)
    print("cuda:", torch.version.cuda)
    print("is_available:", torch.cuda.is_available())
    PY
    echo "[job] preparing data..."
    python3 data/cached_fineweb10B.py 9
    echo "[job] starting training..."
    ./run.sh
    POD_EOF
    chmod +x pod_job.sh

    # Upload the script.
    scp_pod ./pod_job.sh "root@${POD_IP}:/workspace/pod_job.sh"

    # Environment handed to the remote job. Each value is shell-quoted with
    # printf %q, so a branch name containing quotes, spaces or other shell
    # metacharacters cannot break (or inject into) the remote command line.
    remote_env="$(printf '%s=%q ' \
      GITHUB_SERVER_URL "$GITHUB_SERVER_URL" \
      GITHUB_REPOSITORY "$GITHUB_REPOSITORY" \
      GITHUB_REF_NAME "$GITHUB_REF_NAME" \
      GITHUB_SHA "$GITHUB_SHA")"

    # Start it in the background (log to file) and return immediately.
    # stdin is detached too (< /dev/null) so the job holds none of the SSH
    # session's streams open.
    ssh_pod "${remote_env}nohup /workspace/pod_job.sh > /workspace/modded-nanogpt-job.log 2>&1 < /dev/null & echo \$! > /workspace/modded-nanogpt.pid"

    # Poll for completion (RC file). Retry SSH with refreshed port mapping if needed.
    # One iteration per minute, 360 at most; a failed ssh simply counts as
    # "not finished yet".
    for i in $(seq 1 360); do
      if ssh_pod 'test -f /workspace/modded-nanogpt.rc' >/dev/null 2>&1; then
        break
      fi
      if (( i % 5 == 0 )); then
        echo "still running... (minute=$i)"
        ssh_pod 'tail -n 30 /workspace/modded-nanogpt-job.log' || true
      fi
      sleep 60
      if (( i % 2 == 0 )); then
        refresh_pod_endpoint || true
      fi
    done

    if ! ssh_pod 'test -f /workspace/modded-nanogpt.rc' >/dev/null 2>&1; then
      echo "Timed out waiting for remote job to finish." >&2
      ssh_pod 'tail -n 200 /workspace/modded-nanogpt-job.log' || true
      exit 1
    fi

    job_rc="$(ssh_pod 'cat /workspace/modded-nanogpt.rc' | tr -d '\r\n')"
    echo "Remote job rc=$job_rc"

    # Download one file from the pod: up to 8 attempts, refreshing the endpoint
    # between attempts. $1 = remote path, $2 = local path, $3 = optional label
    # used in messages (defaults to the remote path). Returns 1 if every
    # attempt fails.
    download_from_pod() {
      local remote_path="$1" local_path="$2"
      local label="${3:-$remote_path}"
      local attempt
      for attempt in $(seq 1 8); do
        if scp_pod "root@${POD_IP}:${remote_path}" "$local_path" >/dev/null 2>&1; then
          echo "downloaded: $label"
          return 0
        fi
        refresh_pod_endpoint || true
        sleep 5
      done
      echo "failed to download: $label" >&2
      return 1
    }

    # Artifacts are best effort; the job result is decided by job_rc below.
    download_from_pod /workspace/modded-nanogpt-job.log ./modded-nanogpt-job.log "job log" || true
    download_from_pod /workspace/modded-nanogpt.rc ./modded-nanogpt.rc "job rc" || true
    download_from_pod /workspace/modded-nanogpt-logs.zip ./modded-nanogpt-logs.zip "logs zip" || true

    if [[ "$job_rc" != "0" ]]; then
      # Show the end of the job log, preferring the local copy.
      if [[ -f ./modded-nanogpt-job.log ]]; then
        tail -n 200 ./modded-nanogpt-job.log || true
      else
        ssh_pod 'tail -n 200 /workspace/modded-nanogpt-job.log' || true
      fi
      exit 1
    fi
- name: Download logs zip (best effort)
  if: always()
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    set -euo pipefail
    # This step runs even when pod creation failed. With no pod there is
    # nothing to download, so skip quietly instead of failing.
    if [[ -z "${POD_ID:-}" ]]; then
      echo "No POD_ID, skipping download."
      exit 0
    fi
    : "${RUNPOD_API_KEY:?missing RUNPOD_API_KEY}"

    # Options shared by every ssh and scp call in this step.
    ssh_common_opts=(
      -i "$HOME/.ssh/runpod_key"
      -o BatchMode=yes
      -o StrictHostKeyChecking=no
      -o UserKnownHostsFile=/dev/null
      -o ConnectTimeout=15
      -o LogLevel=ERROR
    )

    # Run a command on the pod as root.
    ssh_pod() {
      ssh "${ssh_common_opts[@]}" -p "$POD_SSH_PORT" "root@${POD_IP}" "$@"
    }

    # Copy files to or from the pod; the caller supplies source and destination.
    scp_pod() {
      scp "${ssh_common_opts[@]}" -P "$POD_SSH_PORT" "$@"
    }

    # Re-read the pod's endpoint from the API. When both IP and port are
    # present, adopt them and persist them to GITHUB_ENV. Prints the pod
    # status when the API reports one. Returns non-zero if the API call fails;
    # callers treat that as non-fatal.
    refresh_pod_endpoint() {
      local pod ip port status
      pod="$(curl -sSf -H "Authorization: Bearer $RUNPOD_API_KEY" \
        "$RUNPOD_API_BASE/pods/$POD_ID")" || return 1
      ip="$(jq -r '.publicIp // empty' <<<"$pod")"
      port="$(jq -r '.portMappings["22"] // empty' <<<"$pod")"
      status="$(jq -r '.desiredStatus // empty' <<<"$pod")"
      if [[ -n "$ip" && -n "$port" ]]; then
        POD_IP="$ip"
        POD_SSH_PORT="$port"
        echo "POD_IP=$POD_IP" >> "$GITHUB_ENV"
        echo "POD_SSH_PORT=$POD_SSH_PORT" >> "$GITHUB_ENV"
      fi
      if [[ -n "${status:-}" ]]; then
        echo "pod status=$status ip=${ip:-} port=${port:-}"
      fi
    }

    refresh_pod_endpoint || true
    if [[ -z "${POD_IP:-}" || -z "${POD_SSH_PORT:-}" ]]; then
      echo "No POD_IP/POD_SSH_PORT, skipping download."
      exit 0
    fi

    # Give the pod up to 10 tries, 10 seconds apart, to produce the zip.
    for i in $(seq 1 10); do
      if ssh_pod 'test -f /workspace/modded-nanogpt-logs.zip'; then
        break
      fi
      refresh_pod_endpoint || true
      sleep 10
    done

    if ssh_pod 'test -f /workspace/modded-nanogpt-logs.zip'; then
      # Best effort: a failed copy is reported but must not fail the step.
      scp_pod "root@${POD_IP}:/workspace/modded-nanogpt-logs.zip" \
        ./modded-nanogpt-logs.zip \
        || echo "Failed to download /workspace/modded-nanogpt-logs.zip (continuing)." >&2
    else
      echo "No /workspace/modded-nanogpt-logs.zip found on pod."
    fi

    # Also try to download the job log + rc for debugging.
    scp_pod "root@${POD_IP}:/workspace/modded-nanogpt-job.log" \
      ./modded-nanogpt-job.log >/dev/null 2>&1 || true
    scp_pod "root@${POD_IP}:/workspace/modded-nanogpt.rc" \
      ./modded-nanogpt.rc >/dev/null 2>&1 || true
- name: Compute artifact name
  if: always()
  run: |
    set -euo pipefail
    # upload-artifact rejects names containing characters such as "/", and
    # this workflow runs for every branch (e.g. "feature/x"). Replace anything
    # outside [A-Za-z0-9._-] with "-"; names of simple branches are unchanged.
    safe_ref="${GITHUB_REF_NAME//[^A-Za-z0-9._-]/-}"
    echo "ARTIFACT_NAME=modded-nanogpt-logs-${safe_ref}-${GITHUB_RUN_ID}" >> "$GITHUB_ENV"
- name: Upload logs artifact
  # Runs regardless of earlier step results so that logs from failed runs
  # are kept; missing files only produce a warning.
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: ${{ env.ARTIFACT_NAME }}
    path: |
      modded-nanogpt-logs.zip
      modded-nanogpt-job.log
      modded-nanogpt.rc
    if-no-files-found: warn
- name: Terminate pod (always)
  if: always()
  continue-on-error: true
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    set -euo pipefail

    # Nothing was created (or its id was never recorded): nothing to clean up.
    [[ -n "${POD_ID:-}" ]] || {
      echo "No POD_ID, nothing to terminate."
      exit 0
    }

    # Issue one DELETE for the pod; succeeds only on a successful HTTP status.
    delete_pod() {
      curl -sSf -X DELETE "$RUNPOD_API_BASE/pods/$POD_ID" \
        -H "Authorization: Bearer $RUNPOD_API_KEY" >/dev/null
    }

    # Up to 10 attempts with a 5 second pause after each failure.
    tries=0
    while (( tries < 10 )); do
      if delete_pod; then
        echo "Pod $POD_ID deleted."
        exit 0
      fi
      echo "Delete failed, retrying..."
      sleep 5
      tries=$(( tries + 1 ))
    done

    # Report the failure, but exit 0 so this step itself never fails.
    echo "Failed to delete pod $POD_ID after retries."
    exit 0