Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 48 additions & 32 deletions tools/prologs-epilogs/imex_prolog
Original file line number Diff line number Diff line change
Expand Up @@ -16,48 +16,64 @@
# Slurm Prolog for NVIDIA Imex

# Nothing to configure on nodes where the nvidia-imex unit is not installed:
# leave the prolog successfully so the job can still start.
if ! systemctl list-unit-files --all | grep -Fq "nvidia-imex.service"; then
  exit 0
fi

#######################################
# Restart the nvidia-imex service with a peer list built from the job's nodes.
#
# Steps, in order:
#   1. truncate nodes_config.cfg,
#   2. stop (and, if needed, kill) any running nvidia-imex,
#   3. wait until the IMEX server port is no longer listening,
#   4. write the NodeAddr of every node in SLURM_NODELIST to nodes_config.cfg,
#   5. set the port / imex-ctl / connection-timeout keys in config.cfg,
#   6. start the service and wait until systemd reports it active.
#
# Globals:   SLURM_NODELIST (read)
# Arguments: none
# Outputs:   xtrace (set -x) and diagnostics on stdout/stderr
# Exits:     1 if the server port stays in use or the service does not become
#            active in time. `set -ex` is left enabled in the calling shell.
#######################################
activate_imex() {
  set -ex

  local -r nodes_cfg=/etc/nvidia-imex/nodes_config.cfg
  local -r main_cfg=/etc/nvidia-imex/config.cfg
  local -r stop_timeout=15
  local -r conn_wait_timeout=70
  local -r server_port=1101
  local -r cmd_port=1102
  local -r port_wait_max=60
  local -r start_wait_max=20
  local waited

  # Clean the config file in case the service gets started by accident
  : > "${nodes_cfg}"

  # Clean up the previous connection. Both commands are best effort: the
  # service may already be stopped and there may be no process left to kill,
  # so their failure must not trip `set -e`.
  timeout "${stop_timeout}" systemctl stop nvidia-imex || true
  pkill -9 nvidia-imex || true

  # Actively poll until nothing listens on the server port any more.
  for ((waited = 0; waited < port_wait_max; waited++)); do
    if ! ss -lnt | grep -q ":${server_port}[[:space:]]"; then
      break # Port is free
    fi
    sleep 1
  done
  if ((waited == port_wait_max)); then
    echo "IMEX Prolog Error: Port ${server_port} did not become free within ${port_wait_max}s." >&2
    exit 1
  fi

  # Update the peer list: one NodeAddr per node of the job.
  scontrol -a show node "${SLURM_NODELIST}" -o \
    | sed 's/^.* NodeAddr=\([^ ]*\).*/\1/' > "${nodes_cfg}"

  # Configure the ports and connection timeout, and enable imex-ctl on all
  # nodes so IMEX status can be queried with: nvidia-imex-ctl -a -q
  sed -i \
    -e "s/SERVER_PORT.*/SERVER_PORT=${server_port}/" \
    -e "s/IMEX_CMD_PORT.*/IMEX_CMD_PORT=${cmd_port}/" \
    -e "s/IMEX_CMD_ENABLED.*/IMEX_CMD_ENABLED=1/" \
    -e "s/IMEX_CONN_WAIT_TIMEOUT.*/IMEX_CONN_WAIT_TIMEOUT=${conn_wait_timeout}/" \
    "${main_cfg}"

  # Start the service and poll until systemd reports it active. The start
  # command is tested in an `if` so that a failure does not abort through
  # `set -e` before the journal excerpt below has been written to the log.
  local started=0
  if systemctl start nvidia-imex; then
    for ((waited = 0; waited < start_wait_max; waited++)); do
      if systemctl is-active --quiet nvidia-imex; then
        started=1
        break # Service is active
      fi
      sleep 1
    done
  fi
  if ((started == 0)); then
    echo "IMEX Prolog Error: nvidia-imex service did not start within ${start_wait_max}s." >&2
    # Check the journal for errors; never let journalctl mask the exit code.
    journalctl -u nvidia-imex --no-pager -n 20 || true
    exit 1
  fi
}

# Run the activation, appending its stdout and stderr to a log file whose name
# contains the job id (SLURM_JOB_ID).
activate_imex >> "/var/log/slurm/imex_prolog_${SLURM_JOB_ID}.log" 2>&1
Loading