w #11

Workflow file for this run

.github/workflows/runpod-nanogpt.yml at e64c180

	# Rents an 8x H100 SXM pod on RunPod, runs the repository's training job on it
	# over SSH, collects the logs as an artifact, and deletes the pod afterwards.
	name: runpod nanogpt (8x H100 SXM)

	# Triggered by every push to every branch.
	on:
	  push:
	    branches:
	      - "**"

	# No step writes to the repository through GITHUB_TOKEN, so keep it read-only.
	permissions:
	  contents: read

	jobs:
	  run:
	    runs-on: ubuntu-latest
	    timeout-minutes: 360

	    # Job-wide settings consumed by the shell steps below. Values are quoted
	    # because environment variables are always strings.
	    env:
	      RUNPOD_API_BASE: https://rest.runpod.io/v1

	      # Secure Cloud, 8x H100 SXM (RunPod GPU ID for H100 SXM is "NVIDIA H100 80GB HBM3")
	      RUNPOD_CLOUD_TYPE: SECURE
	      RUNPOD_GPU_TYPE_ID: "NVIDIA H100 80GB HBM3"
	      RUNPOD_GPU_COUNT: "8"

	      # Pick a RunPod PyTorch image that already works well on RunPod
	      RUNPOD_IMAGE: "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"

	      # Storage (adjust as needed)
	      RUNPOD_CONTAINER_DISK_GB: "200"
	      RUNPOD_VOLUME_GB: "500"
	      RUNPOD_VOLUME_MOUNT_PATH: /workspace

	    steps:
	- name: Install local deps
	  # jq is used by later steps to build and parse JSON; openssh-client
	  # supplies the ssh, scp and ssh-keygen commands they call.
	  run: |
	    set -euo pipefail
	    runner_packages=(jq openssh-client)
	    sudo apt-get update
	    sudo apt-get install -y "${runner_packages[@]}"

	- name: Prepare SSH key (private -> file, derive public key)
	  env:
	    # Hand the secret to the script through the environment instead of
	    # interpolating it into the script text, so its content is never
	    # re-parsed by the shell.
	    RUNPOD_SSH_KEY: ${{ secrets.RUNPOD_SSH_KEY }}
	  run: |
	    set -euo pipefail
	    : "${RUNPOD_SSH_KEY:?missing RUNPOD_SSH_KEY secret}"

	    # Everything created below is owner-only from the moment it exists.
	    umask 077
	    mkdir -p ~/.ssh

	    # Drop CRs (key pasted from Windows) and always end the file with a
	    # newline: OpenSSH refuses a private key whose last line is unterminated.
	    printf '%s\n' "$RUNPOD_SSH_KEY" | tr -d '\r' > ~/.ssh/runpod_key
	    chmod 600 ~/.ssh/runpod_key

	    # Derive the matching public key; a later step sends it to the pod.
	    ssh-keygen -y -f ~/.ssh/runpod_key > ~/.ssh/runpod_key.pub
	    chmod 644 ~/.ssh/runpod_key.pub

	- name: Create RunPod pod (Secure, 8x H100 SXM)
	  id: create_pod
	  env:
	    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
	  # Creates the pod through the RunPod REST API and exports its id as POD_ID
	  # (via GITHUB_ENV) for the following steps.
	  run: |
	    set -euo pipefail

	    POD_NAME="gh-${GITHUB_REPOSITORY##*/}-${GITHUB_REF_NAME}-${GITHUB_RUN_ID}"
	    PUBKEY="$(cat ~/.ssh/runpod_key.pub)"

	    # Build the request body with jq so every value is JSON-escaped.
	    # cloudType comes from RUNPOD_CLOUD_TYPE (falls back to SECURE when unset).
	    payload="$(jq -n \
	      --arg name "$POD_NAME" \
	      --arg image "$RUNPOD_IMAGE" \
	      --arg cloudType "${RUNPOD_CLOUD_TYPE:-SECURE}" \
	      --arg gpuType "$RUNPOD_GPU_TYPE_ID" \
	      --arg mountPath "$RUNPOD_VOLUME_MOUNT_PATH" \
	      --arg pubkey "$PUBKEY" \
	      --argjson gpuCount "$RUNPOD_GPU_COUNT" \
	      --argjson containerDisk "$RUNPOD_CONTAINER_DISK_GB" \
	      --argjson volumeGb "$RUNPOD_VOLUME_GB" \
	      '{
	        cloudType: $cloudType,
	        computeType: "GPU",
	        gpuCount: $gpuCount,
	        gpuTypeIds: [$gpuType],
	        imageName: $image,
	        name: $name,
	        interruptible: false,
	        supportPublicIp: true,
	        ports: ["22/tcp"],
	        containerDiskInGb: $containerDisk,
	        volumeInGb: $volumeGb,
	        volumeMountPath: $mountPath,

	        # Ensures the pod uses the public key that matches RUNPOD_SSH_KEY.
	        # This avoids relying on account-level SSH keys.
	        env: {
	          SSH_PUBLIC_KEY: $pubkey
	        }
	      }')"

	    # --fail-with-body keeps the API's error document on HTTP errors, and the
	    # explicit `if` stops `set -e` from aborting before it can be printed.
	    if ! resp="$(curl -sS --fail-with-body -X POST "$RUNPOD_API_BASE/pods" \
	      -H "Authorization: Bearer $RUNPOD_API_KEY" \
	      -H "Content-Type: application/json" \
	      -d "$payload")"; then
	      echo "Create pod request failed, response:" >&2
	      echo "$resp" >&2
	      exit 1
	    fi

	    # A missing/null id or a non-JSON body both count as failure.
	    pod_id="$(jq -r '.id // empty' <<<"$resp" 2>/dev/null || true)"
	    if [[ -z "$pod_id" ]]; then
	      echo "Create pod failed, response:" >&2
	      echo "$resp" >&2
	      exit 1
	    fi

	    echo "POD_ID=$pod_id" >> "$GITHUB_ENV"

	- name: Wait for RUNNING + public IP + SSH port mapping
	  env:
	    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
	  # Polls the pod every 10 s (up to 180 times) until the API reports
	  # desiredStatus RUNNING together with a public IP and a mapping for port 22,
	  # then exports POD_IP / POD_SSH_PORT via GITHUB_ENV.
	  run: |
	    set -euo pipefail
	    : "${POD_ID:?missing POD_ID}"

	    for i in $(seq 1 180); do
	      # One failed or stalled API call must not abort the whole wait, so the
	      # request is checked explicitly instead of being left to `set -e`.
	      if ! pod="$(curl -sSf --max-time 30 \
	        -H "Authorization: Bearer $RUNPOD_API_KEY" \
	        "$RUNPOD_API_BASE/pods/$POD_ID")"; then
	        echo "waiting: RunPod API request failed (attempt $i), retrying"
	        sleep 10
	        continue
	      fi

	      # `// empty` turns missing/null fields into empty strings.
	      status="$(jq -r '.desiredStatus // empty' <<<"$pod")"
	      ip="$(jq -r '.publicIp // empty' <<<"$pod")"
	      port="$(jq -r '.portMappings["22"] // empty' <<<"$pod")"

	      if [[ "$status" == "RUNNING" && -n "$ip" && -n "$port" ]]; then
	        echo "POD_IP=$ip" >> "$GITHUB_ENV"
	        echo "POD_SSH_PORT=$port" >> "$GITHUB_ENV"
	        exit 0
	      fi

	      echo "waiting: status=$status ip=$ip port=$port"
	      sleep 10
	    done

	    echo "Timed out waiting for pod to become reachable over exposed TCP (22/tcp)." >&2
	    exit 1

	- name: Wait for SSH handshake
	  # Tries a trivial remote command every 10 s, at most 60 times, and succeeds
	  # as soon as one attempt goes through.
	  run: |
	    set -euo pipefail
	    : "${POD_IP:?missing POD_IP}"
	    : "${POD_SSH_PORT:?missing POD_SSH_PORT}"

	    # Non-interactive connection; the pod's host key is not verified or stored.
	    probe_opts=(
	      -i "$HOME/.ssh/runpod_key"
	      -o BatchMode=yes
	      -o StrictHostKeyChecking=no
	      -o UserKnownHostsFile=/dev/null
	      -o ConnectTimeout=10
	      -o ServerAliveInterval=30
	      -o ServerAliveCountMax=6
	      -p "$POD_SSH_PORT"
	    )

	    for (( attempt = 1; attempt <= 60; attempt++ )); do
	      if ssh "${probe_opts[@]}" root@"$POD_IP" 'echo ssh-ready' >/dev/null 2>&1; then
	        exit 0
	      fi
	      echo "waiting for ssh..."
	      sleep 10
	    done

	    echo "SSH never became ready." >&2
	    exit 1

	- name: Run workload on pod (fails job on non-zero)
	  env:
	    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
	  # Uploads a job script to the pod, starts it detached, polls once a minute
	  # for the exit-code file it writes, downloads the results, and fails the
	  # step when the remote exit code is not 0.
	  run: |
	    set -euo pipefail
	    : "${POD_IP:?missing POD_IP}"
	    : "${POD_SSH_PORT:?missing POD_SSH_PORT}"
	    : "${POD_ID:?missing POD_ID}"
	    : "${RUNPOD_API_KEY:?missing RUNPOD_API_KEY}"

	    # Re-reads the pod's public IP and SSH port from the API and adopts them
	    # only when both are present.
	    refresh_pod_endpoint() {
	      local pod ip port
	      pod="$(curl -sSf -H "Authorization: Bearer $RUNPOD_API_KEY" \
	        "$RUNPOD_API_BASE/pods/$POD_ID")"

	      ip="$(jq -r '.publicIp // empty' <<<"$pod")"
	      port="$(jq -r '.portMappings["22"] // empty' <<<"$pod")"
	      if [[ -n "$ip" && -n "$port" ]]; then
	        POD_IP="$ip"
	        POD_SSH_PORT="$port"
	        export POD_IP POD_SSH_PORT
	      fi
	    }

	    # Runs a command on the pod as root using the current endpoint.
	    ssh_pod() {
	      ssh -i ~/.ssh/runpod_key \
	        -o BatchMode=yes \
	        -o StrictHostKeyChecking=no \
	        -o UserKnownHostsFile=/dev/null \
	        -o ConnectTimeout=15 \
	        -o LogLevel=ERROR \
	        -o ServerAliveInterval=30 \
	        -o ServerAliveCountMax=6 \
	        -p "$POD_SSH_PORT" \
	        root@"$POD_IP" "$@"
	    }

	    # scp with the same connection settings; the caller supplies source and
	    # destination, so it serves uploads and downloads alike.
	    scp_pod() {
	      scp -i ~/.ssh/runpod_key \
	        -o BatchMode=yes \
	        -o StrictHostKeyChecking=no \
	        -o UserKnownHostsFile=/dev/null \
	        -o ConnectTimeout=15 \
	        -o LogLevel=ERROR \
	        -P "$POD_SSH_PORT" \
	        "$@"
	    }

	    # Build a remote job script so the workload survives transient SSH disconnects.
	    # The heredoc delimiter is quoted: nothing inside is expanded on the runner.
	    cat > pod_job.sh <<'POD_EOF'
	    #!/usr/bin/env bash
	    set -euo pipefail

	    RC_FILE=/workspace/modded-nanogpt.rc
	    LOG_ZIP=/workspace/modded-nanogpt-logs.zip
	    JOB_LOG=/workspace/modded-nanogpt-job.log

	    # EXIT trap: records the script's exit code in RC_FILE and zips whichever
	    # of rc file / job log / training logs exist, then exits with that code.
	    _finalize() {
	      rc=$?
	      set +e
	      printf '%s' "$rc" > "$RC_FILE" 2>/dev/null || true
	      cd /workspace || true
	      zip_items=()
	      if [ -f "$RC_FILE" ]; then
	        zip_items+=("$(basename "$RC_FILE")")
	      fi
	      if [ -f "$JOB_LOG" ]; then
	        zip_items+=("$(basename "$JOB_LOG")")
	      fi
	      if [ -d /workspace/modded-nanogpt/logs ]; then
	        zip_items+=("modded-nanogpt/logs")
	      fi
	      rm -f "$LOG_ZIP"
	      if [ "${#zip_items[@]}" -gt 0 ]; then
	        zip -r "$LOG_ZIP" "${zip_items[@]}" >/dev/null 2>&1 || true
	      fi
	      exit "$rc"
	    }
	    trap _finalize EXIT

	    cd /workspace
	    export DEBIAN_FRONTEND=noninteractive
	    export PIP_DISABLE_PIP_VERSION_CHECK=1
	    export GIT_LFS_SKIP_SMUDGE=1

	    echo "[job] installing apt deps..."
	    apt-get update
	    apt-get install -y git zip

	    echo "[job] python:"
	    python3 --version

	    echo "[job] cloning repo..."
	    : "${GITHUB_SERVER_URL:?missing GITHUB_SERVER_URL}"
	    : "${GITHUB_REPOSITORY:?missing GITHUB_REPOSITORY}"
	    : "${GITHUB_REF_NAME:?missing GITHUB_REF_NAME}"
	    : "${GITHUB_SHA:?missing GITHUB_SHA}"

	    REPO_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git"
	    rm -rf /workspace/modded-nanogpt
	    git clone --depth 1 --branch "${GITHUB_REF_NAME}" "$REPO_URL" /workspace/modded-nanogpt
	    cd /workspace/modded-nanogpt
	    # The branch tip may have moved since the push; pin the exact commit.
	    if [[ "$(git rev-parse HEAD)" != "$GITHUB_SHA" ]]; then
	      git fetch --depth 1 origin "$GITHUB_SHA"
	      git checkout "$GITHUB_SHA"
	    fi

	    echo "[job] installing python deps..."
	    pip install -r requirements.txt
	    # Use Torch 2.10.x nightlies (2.10 dev builds are < 2.10 final per PEP 440, so we bound with .dev0).
	    pip install --pre 'torch>=2.10.0.dev0,<2.11.0.dev0' --index-url https://download.pytorch.org/whl/nightly/cu126 --upgrade

	    echo "[job] torch:"
	    python3 - <<'PY'
	    import torch
	    print(torch.__version__)
	    print("cuda:", torch.version.cuda)
	    print("is_available:", torch.cuda.is_available())
	    PY

	    echo "[job] preparing data..."
	    python3 data/cached_fineweb10B.py 9

	    echo "[job] starting training..."
	    ./run.sh
	    POD_EOF

	    chmod +x pod_job.sh

	    # Upload the script.
	    scp_pod ./pod_job.sh root@"$POD_IP":/workspace/pod_job.sh

	    # Shell-quote every value that ends up in the remote command line. The
	    # branch name is chosen by whoever pushes, so pasting it between plain
	    # single quotes would allow command injection on the pod.
	    # (%q emits bash quoting; assumes the pod's login shell is bash - TODO confirm.)
	    remote_env="$(printf '%s=%q ' \
	      GITHUB_SERVER_URL "$GITHUB_SERVER_URL" \
	      GITHUB_REPOSITORY "$GITHUB_REPOSITORY" \
	      GITHUB_REF_NAME "$GITHUB_REF_NAME" \
	      GITHUB_SHA "$GITHUB_SHA")"

	    # Start it in the background (log to file) and return immediately.
	    # stdin is detached as well so the SSH session has nothing left to wait on.
	    ssh_pod "${remote_env}nohup /workspace/pod_job.sh > /workspace/modded-nanogpt-job.log 2>&1 < /dev/null & echo \$! > /workspace/modded-nanogpt.pid"

	    # Poll for completion (RC file). Retry SSH with refreshed port mapping if needed.
	    # NOTE(review): 360 one-minute polls equal the job's timeout-minutes (360),
	    # so the job timeout will normally fire before this loop runs out.
	    for i in $(seq 1 360); do
	      if ssh_pod 'test -f /workspace/modded-nanogpt.rc' >/dev/null 2>&1; then
	        break
	      fi

	      if (( i % 5 == 0 )); then
	        echo "still running... (minute=$i)"
	        ssh_pod 'tail -n 30 /workspace/modded-nanogpt-job.log' || true
	      fi

	      sleep 60

	      if (( i % 2 == 0 )); then
	        refresh_pod_endpoint || true
	      fi
	    done

	    if ! ssh_pod 'test -f /workspace/modded-nanogpt.rc' >/dev/null 2>&1; then
	      echo "Timed out waiting for remote job to finish." >&2
	      ssh_pod 'tail -n 200 /workspace/modded-nanogpt-job.log' || true
	      exit 1
	    fi

	    # A transient SSH failure here must not abort before the logs are fetched;
	    # the downloaded rc file serves as a fallback below.
	    job_rc="$(ssh_pod 'cat /workspace/modded-nanogpt.rc' | tr -d '\r\n')" || job_rc=""

	    # Copies one file from the pod, retrying up to 8 times and refreshing the
	    # endpoint between attempts. Args: remote path, local path, optional label.
	    download_from_pod() {
	      local remote_path="$1"
	      local local_path="$2"
	      local label="${3:-$remote_path}"
	      local attempt

	      for attempt in $(seq 1 8); do
	        if scp_pod root@"$POD_IP":"$remote_path" "$local_path" >/dev/null 2>&1; then
	          echo "downloaded: $label"
	          return 0
	        fi
	        refresh_pod_endpoint || true
	        sleep 5
	      done

	      echo "failed to download: $label" >&2
	      return 1
	    }

	    download_from_pod /workspace/modded-nanogpt-job.log ./modded-nanogpt-job.log "job log" || true
	    download_from_pod /workspace/modded-nanogpt.rc ./modded-nanogpt.rc "job rc" || true
	    download_from_pod /workspace/modded-nanogpt-logs.zip ./modded-nanogpt-logs.zip "logs zip" || true

	    if [[ -z "$job_rc" && -f ./modded-nanogpt.rc ]]; then
	      job_rc="$(tr -d '\r\n' < ./modded-nanogpt.rc)"
	    fi
	    echo "Remote job rc=${job_rc:-unknown}"

	    # Anything other than a literal 0 (including an unreadable rc) fails the step.
	    if [[ "$job_rc" != "0" ]]; then
	      if [[ -f ./modded-nanogpt-job.log ]]; then
	        tail -n 200 ./modded-nanogpt-job.log || true
	      else
	        ssh_pod 'tail -n 200 /workspace/modded-nanogpt-job.log' || true
	      fi
	      exit 1
	    fi

	- name: Download logs zip (best effort)
	  if: always()
	  env:
	    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
	  # Runs even after earlier failures and tries to copy the logs zip, the job
	  # log and the rc file from the pod. Being best effort, it skips or warns
	  # instead of failing when the pod or the files are unavailable.
	  run: |
	    set -euo pipefail

	    # Nothing to fetch when pod creation or key preparation never happened.
	    if [[ -z "${POD_ID:-}" || -z "${RUNPOD_API_KEY:-}" ]]; then
	      echo "No POD_ID/RUNPOD_API_KEY, skipping download."
	      exit 0
	    fi
	    if [[ ! -f ~/.ssh/runpod_key ]]; then
	      echo "No SSH key file, skipping download."
	      exit 0
	    fi

	    # Re-reads the endpoint from the API; when both IP and port are present
	    # they are adopted and also persisted to GITHUB_ENV.
	    refresh_pod_endpoint() {
	      local pod ip port status
	      pod="$(curl -sSf -H "Authorization: Bearer $RUNPOD_API_KEY" \
	        "$RUNPOD_API_BASE/pods/$POD_ID")"

	      ip="$(jq -r '.publicIp // empty' <<<"$pod")"
	      port="$(jq -r '.portMappings["22"] // empty' <<<"$pod")"
	      status="$(jq -r '.desiredStatus // empty' <<<"$pod")"
	      if [[ -n "$ip" && -n "$port" ]]; then
	        POD_IP="$ip"
	        POD_SSH_PORT="$port"
	        echo "POD_IP=$POD_IP" >> "$GITHUB_ENV"
	        echo "POD_SSH_PORT=$POD_SSH_PORT" >> "$GITHUB_ENV"
	      fi
	      if [[ -n "${status:-}" ]]; then
	        echo "pod status=$status ip=${ip:-} port=${port:-}"
	      fi
	    }

	    refresh_pod_endpoint || true
	    if [[ -z "${POD_IP:-}" || -z "${POD_SSH_PORT:-}" ]]; then
	      echo "No POD_IP/POD_SSH_PORT, skipping download."
	      exit 0
	    fi

	    # Connection settings shared by every ssh/scp call below. The port is
	    # passed separately because a refresh may change it.
	    conn_opts=(
	      -i "$HOME/.ssh/runpod_key"
	      -o BatchMode=yes
	      -o StrictHostKeyChecking=no
	      -o UserKnownHostsFile=/dev/null
	      -o ConnectTimeout=15
	      -o LogLevel=ERROR
	    )

	    # True when the logs zip exists on the pod (and the pod is reachable).
	    zip_on_pod() {
	      ssh "${conn_opts[@]}" -p "$POD_SSH_PORT" \
	        root@"$POD_IP" 'test -f /workspace/modded-nanogpt-logs.zip'
	    }

	    # Copies /workspace/<name> from the pod to ./<name>.
	    fetch_from_pod() {
	      scp "${conn_opts[@]}" -P "$POD_SSH_PORT" \
	        root@"$POD_IP":"/workspace/$1" "./$1"
	    }

	    # Give the zip up to 10 attempts (10 s apart) to appear.
	    for i in $(seq 1 10); do
	      if zip_on_pod; then
	        break
	      fi
	      refresh_pod_endpoint || true
	      sleep 10
	    done

	    if zip_on_pod; then
	      # A failed copy is reported but does not fail this best-effort step.
	      fetch_from_pod modded-nanogpt-logs.zip \
	        || echo "::warning::Could not download modded-nanogpt-logs.zip from the pod."
	    else
	      echo "No /workspace/modded-nanogpt-logs.zip found on pod."
	    fi

	    # Also try to download the job log + rc for debugging.
	    fetch_from_pod modded-nanogpt-job.log >/dev/null 2>&1 || true
	    fetch_from_pod modded-nanogpt.rc >/dev/null 2>&1 || true

	# Artifact names may not contain "/" (and several other characters), but
	# branch names often do. Replace everything outside [A-Za-z0-9._-] with "-"
	# before using the ref name in the artifact name.
	- name: Compute artifact name
	  id: artifact_name
	  if: always()
	  env:
	    REF_NAME: ${{ github.ref_name }}
	  run: |
	    set -euo pipefail
	    safe_ref="$(printf '%s' "$REF_NAME" | tr -c 'A-Za-z0-9._-' '-')"
	    echo "name=modded-nanogpt-logs-${safe_ref}-${GITHUB_RUN_ID}" >> "$GITHUB_OUTPUT"

	# Publishes whichever of the three result files were downloaded; missing
	# files only produce a warning.
	- name: Upload logs artifact
	  if: always()
	  uses: actions/upload-artifact@v4
	  with:
	    name: ${{ steps.artifact_name.outputs.name }}
	    path: |
	      modded-nanogpt-logs.zip
	      modded-nanogpt-job.log
	      modded-nanogpt.rc
	    if-no-files-found: warn

	- name: Terminate pod (always)
	  if: always()
	  # A failure here is surfaced on the step but does not change the job result.
	  continue-on-error: true
	  env:
	    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
	  # Deletes the pod through the API, retrying up to 10 times, 5 s apart.
	  run: |
	    set -euo pipefail
	    if [[ -z "${POD_ID:-}" ]]; then
	      echo "No POD_ID, nothing to terminate."
	      exit 0
	    fi

	    for i in $(seq 1 10); do
	      # Capture the HTTP status instead of using -f so "already gone" can be
	      # told apart from a real failure. curl prints 000 when no response arrived.
	      http_code="$(curl -sS -o /dev/null -w '%{http_code}' --max-time 30 \
	        -X DELETE "$RUNPOD_API_BASE/pods/$POD_ID" \
	        -H "Authorization: Bearer $RUNPOD_API_KEY" || true)"

	      case "$http_code" in
	        2??)
	          echo "Pod $POD_ID deleted."
	          exit 0
	          ;;
	        404)
	          # Presumably the pod no longer exists - verify against the RunPod API docs.
	          echo "Pod $POD_ID not found (already deleted)."
	          exit 0
	          ;;
	      esac

	      echo "Delete failed (HTTP ${http_code:-none}), retrying..."
	      sleep 5
	    done

	    # A pod that was not deleted keeps running, so make this impossible to miss.
	    echo "::error::Failed to delete pod $POD_ID after retries; delete it manually."
	    exit 1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

w #11

Workflow file

w #11

Uh oh!

Workflow file for this run