jianhan-amd
diff --git a/‎launch.sh‎
Lines changed: 11 additions & 13 deletions b/‎launch.sh‎
Lines changed: 11 additions & 13 deletions
diff --git a/‎multi_node/README.md‎
Lines changed: 198 additions & 0 deletions b/‎multi_node/README.md‎
Lines changed: 198 additions & 0 deletions
diff --git a/‎multi_node/docker/jax_maxdiffusion_wan2.1_train_inference.ubuntu.amd.Dockerfile‎
100755100644
Lines changed: 1 addition & 36 deletions b/‎multi_node/docker/jax_maxdiffusion_wan2.1_train_inference.ubuntu.amd.Dockerfile‎
100755100644
Lines changed: 1 addition & 36 deletions
diff --git a/‎multi_node/run_multinode_train.sh‎
Lines changed: 32 additions & 0 deletions b/‎multi_node/run_multinode_train.sh‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎multi_node/wan_multinode_train.sbatch‎
Lines changed: 1 addition & 1 deletion b/‎multi_node/wan_multinode_train.sbatch‎
Lines changed: 1 addition & 1 deletion
@@ -16,14 +16,12 @@ done
 
 # Set default log file if not provided
 if [ -z "$LOG_PATH" ]; then
-  LOG_PATH="$PWD/output/output_$EXP_NAME.log"
+  LOG_PATH="$PWD/output/"
 fi
 
 export HF_TOKEN=""
 export HF_HOME="/app/hf_home/"
 
-# export ROCR_VISIBLE_DEVICES="4,5,6,7"
-
 export MIOPEN_CUSTOM_CACHE_DIR="/app/.cache/miopen/"
 export JAX_COMPILATION_CACHE_DIR="/app/.cache/jax/"
 export JAX_PERSISTENT_CACHE_ENABLE_XLA_CACHES="all"
@@ -54,7 +52,7 @@ export NVTE_CK_HOW_V3_BF16_CVT=1    # default
 export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1
 
 export NCCL_IB_HCA=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7
-export NCCL_SOCKET_IFNAME=ens51f1np1
+export NCCL_SOCKET_IFNAME=enp159s0np0
 export NCCL_IB_GID_INDEX=3
 export NCCL_PROTO=Simple
 
@@ -65,15 +63,14 @@ export GPU_MAX_HW_QUEUES=2
 export HIP_FORCE_DEV_KERNARG=1
 export HSA_NO_SCRATCH_RECLAIM=1
 # NCCL flags
-export NCCL_DEBUG=INFO  #WARN, INFO
+export NCCL_DEBUG=WARN  #WARN, INFO
 # export NCCL_DEBUG_SUBSYS=ALL
-# export RCCL_REPLAY_FILE=/shared_nfs/jianhan/slurm_logs-${SCALING_EXP}/cohere-${SLURM_JOB_NUM_NODES}N-8x22B-${SLURM_JOB_ID}-${timestamp}/mixtral_8x-22b_128N_run.bin
 export NCCL_PROTO=Simple
 export NCCL_IB_TIMEOUT=20
 export NCCL_IB_TC=41
 export NCCL_IB_SL=0
 
-export GLOO_SOCKET_IFNAME=ens51f1np1
+export GLOO_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME}
 export NCCL_CROSS_NIC=0
 export NCCL_CHECKS_DISABLE=1
 export NCCL_IB_QPS_PER_CONNECTION=1
@@ -100,22 +97,23 @@ export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enabl
 
 rm -rf /app/.cache/*
 python3 setup.py develop
+ulimit -n 4096
 
-EXP_NAME="WAN_train"
+EXP_NAME="train"
 LOG_FILE="$LOG_PATH/output_$HOST_NAME.log"
 
+
 # python -m src.maxdiffusion.train_flux src/maxdiffusion/configs/base_flux_dev.yml \
 python -m src.maxdiffusion.train_wan src/maxdiffusion/configs/base_wan_14b.yml \
-        run_name="run_$EXP_NAME" output_dir="$PWD/output" \
+        run_name="run_$EXP_NAME" output_dir="$LOG_PATH" \
         hardware=gpu \
         attention=cudnn_flash_te \
-        max_train_steps=10 \
-        dcn_data_parallelism=-1 \
-        dcn_fsdp_batch_parallelism=1 \
+        max_train_steps=20 \
+        dcn_data_parallelism=1 \
+        dcn_fsdp_parallelism=-1 \
         ici_data_parallelism=1 \
         ici_fsdp_parallelism=8 \
         per_device_batch_size=1 \
-        enable_ssim=False \
         "${FILTERED_ARGS[@]}" |& tee -a "$LOG_FILE"
 
 
 
@@ -0,0 +1,198 @@
+# Multi-Node WAN Training Guide
+
+Distributed WAN model training across multiple nodes with AMD ROCm GPUs.
+
+## Quick Start
+
+### Option 1: Using Helper Script (Recommended)
+
+```bash
+cd /home/amd/jianhan/github/maxdiffusion/multi_node
+
+# Edit run_multinode_train.sh to set configuration and enable/disable steps
+bash run_multinode_train.sh
+```
+
+### Option 2: Manual Execution
+
+```bash
+# Set ALL required environment variables (no defaults)
+export COORDINATOR_IP="172.29.0.73"
+export IMAGE_TAG="maxdiffusion-multinode-train:v1"
+export MULTI_NODES_LOG_DIR="/home/amd/jianhan/multi_node_log"
+export SHARE_DOCKERFILE_PATH="/home/amd/jianhan/github/maxdiffusion/multi_node/docker/jax_maxdiffusion_wan2.1_train_inference.ubuntu.amd.Dockerfile"
+export SHARED_CODE_BASE_PATH="/home/amd/jianhan/github/maxdiffusion"
+export MAXDIFFUSION_DIR_IN_DOCKER="/app/maxdiffusion"
+export RUN_NAME="WAN_14B_FSDP8"
+export REMOVE_IMAGES="n"
+export CHMOD_RUN="n"
+export REGISTRY_USERNAME="rocmshared"
+export REGISTRY_TOKEN="your_token"
+
+# Run commands
+bash wan_multinode_train.sh "node1,node2,node3,node4" clean
+bash wan_multinode_train.sh "node1,node2,node3,node4" build
+bash wan_multinode_train.sh "node1,node2,node3,node4" launch
+
+# Monitor training
+tail -f ${MULTI_NODES_LOG_DIR}/slurm_logs/${RUN_NAME}_*/node_*_rank_0.log
+```
+
+## Prerequisites
+
+- **Password-less SSH**: Set up SSH keys for all nodes
+  ```bash
+  ssh-keygen -t ed25519 -C "multinode-training"
+  for node in node1 node2 node3; do ssh-copy-id $node; done
+  ```
+- **Docker 20.10+** on all nodes
+- **AMD ROCm 5.7+** with MI250/MI300 GPUs
+- **Port 12345 open** between nodes (JAX coordinator)
+- **50GB+ disk space** per node
+
+## Environment Variables
+
+**All variables are required (no defaults):**
+
+| Variable | Description | Example |
+|----------|-------------|---------|
+| `COORDINATOR_IP` | JAX coordinator IP | `172.29.0.73` |
+| `IMAGE_TAG` | Docker image name | `maxdiffusion-multinode-train:v1` |
+| `MULTI_NODES_LOG_DIR` | Base log directory | `/home/amd/jianhan/multi_node_log` |
+| `SHARE_DOCKERFILE_PATH` | Path to Dockerfile | `/home/amd/.../jax_maxdiffusion_wan2.1...Dockerfile` |
+| `SHARED_CODE_BASE_PATH` | Codebase path | `/home/amd/jianhan/github/maxdiffusion` |
+| `MAXDIFFUSION_DIR_IN_DOCKER` | Docker mount path | `/app/maxdiffusion` |
+| `RUN_NAME` | Experiment name | `WAN_14B_FSDP8` or `WAN_1_3B_FSDP8` |
+| `REMOVE_IMAGES` | Remove images on clean | `y` or `n` |
+| `CHMOD_RUN` | Only fix permissions (skip training) | `y` or `n` (use `n` for a normal training run) |
+| `REGISTRY_USERNAME` | Docker Hub username | `rocmshared` |
+| `REGISTRY_TOKEN` | Docker Hub token | Your token |
+
+## Scripts Overview
+
+- **`run_multinode_train.sh`**: Helper script with pre-configured variables. Edit to set config and enable/disable steps
+- **`wan_multinode_train.sh`**: Main wrapper for clean/build/launch operations (requires all env vars)
+- **`wan_multinode_train_clean.sh`**: Cleans containers and syncs codebase via rsync
+- **`wan_multinode_train_build_docker.sh`**: Builds Docker images in parallel (5 retries)
+- **`wan_multinode_train_launch.sh`**: Launches distributed training with JAX
+
+## Directory Structure
+
+```
+multi_node_log/
+├── slurm_logs/
+│   ├── CLEAN_*N_*/              # Cleanup logs
+│   ├── BUILD_DOCKER_*N_*/       # Build logs
+│   └── ${RUN_NAME}_*N_*/        # Training logs (e.g., WAN_14B_FSDP8_4N_20260204-141300)
+│       ├── node_*_rank_0.log    # Primary logs
+│       └── host_output.{out,err}
+└── output/
+    └── ${RUN_NAME}_*N_*/        # Checkpoints
+```
+
+## Typical Workflow
+
+```bash
+# First time: Run all steps
+bash run_multinode_train.sh
+
+# Code changes: Skip build (edit run_multinode_train.sh, comment out build line)
+# Dockerfile changes: Run build only (comment out clean and launch)
+# Quick iteration: Run clean + launch only (comment out build)
+```
+
+## Common Commands
+
+```bash
+# Change model
+export RUN_NAME="WAN_1_3B_FSDP8"  # or WAN_14B_FSDP8
+
+# Remove Docker images (free disk space)
+export REMOVE_IMAGES="y"
+
+# Fix permissions only (no training) - useful for permission issues
+export CHMOD_RUN="y"
+bash wan_multinode_train.sh "node1,node2,node3,node4" launch
+
+# Monitor latest run
+LATEST=$(ls -td ${MULTI_NODES_LOG_DIR}/slurm_logs/${RUN_NAME}_* | head -1)
+tail -f ${LATEST}/node_*_rank_0.log
+
+# Average step time (exclude warmup)
+grep "seconds:" ${LATEST}/node_*_rank_0.log | tail -n +2 | \
+    awk -F'seconds: ' '{print $2}' | awk '{sum+=$1; count++} END {printf "Avg: %.2fs\n", sum/count}'
+
+# Check GPU utilization
+for node in core42-5-a08u01 core42-1-a08u07 core42-3-a08u19 core42-4-a08u25; do
+    ssh $node "rocm-smi --showuse"
+done
+
+# Check containers
+for node in core42-5-a08u01 core42-1-a08u07; do
+    ssh $node "docker ps"
+done
+```
+
+## Performance (WAN 14B, 4 nodes × 8 GPUs)
+
+- **Batch size/device**: 1
+- **Resolution**: 1280×720 × 85 frames
+- **Speed**: ~82-83s/step (after warmup)
+- **Throughput**: ~255 TFLOP/s/device
+- **FPS/device**: ~1.03
+- **First step**: ~300s (JIT compilation)
+
+**Single Node**: For testing, pass an empty node list (defaults to a single node): `bash wan_multinode_train.sh "" launch`  
+Or name the node explicitly: `bash wan_multinode_train.sh "core42-4-a08u25" launch`
+
+## Troubleshooting
+
+```bash
+# SSH issues
+ssh -vvv node1  # Test connectivity
+eval "$(ssh-agent -s)" && ssh-add ~/.ssh/id_ed25519
+
+# Docker issues
+ssh node1 "docker ps"  # Check Docker
+ssh node1 "sudo usermod -aG docker $USER"  # Add to docker group
+
+# JAX timeout
+ssh node1 "hostname -I"  # Get coordinator IP
+export COORDINATOR_IP="172.29.0.XX"
+ssh node2 "nc -zv $COORDINATOR_IP 12345"  # Test port
+
+# GPU not visible
+ssh node1 "rocm-smi"  # Check GPUs
+ssh node1 "docker run --rm --privileged -e HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ${IMAGE_TAG} rocm-smi"
+
+# Build failures
+cat ${MULTI_NODES_LOG_DIR}/slurm_logs/BUILD_DOCKER_*/build_*.log
+ssh node1 "docker system prune -af"  # Clean cache
+
+# Permission issues (codebase not writable)
+export CHMOD_RUN="y"
+bash wan_multinode_train.sh "node1,node2" launch  # Fix perms only
+
+# Debug mode
+bash -x wan_multinode_train.sh "node1,node2" clean  # Verbose
+```
+
+## Log Analysis
+
+```bash
+# Find latest run
+LOG_DIR=$(ls -td ${MULTI_NODES_LOG_DIR}/slurm_logs/${RUN_NAME}_* | head -1)
+
+# View metrics
+grep "seconds:\|loss:\|TFLOP/s" ${LOG_DIR}/node_*_rank_0.log
+
+# Calculate stats
+grep "seconds:" ${LOG_DIR}/node_*_rank_0.log | tail -n +2 | \
+    awk -F'seconds: ' '{print $2}' | awk '{sum+=$1; count++} END {printf "Mean: %.2fs, Total: %d steps\n", sum/count, count}'
+```
+
+## Resources
+
+- [JAX Distributed](https://jax.readthedocs.io/en/latest/multi_process.html)
+- [AMD ROCm](https://rocmdocs.amd.com/)
+- [MaxDiffusion](https://github.com/google/maxdiffusion)
@@ -25,8 +25,7 @@
 #
 #################################################################################
 
-ARG BASE_DOCKER=rocm/pyt-megatron-lm-jax-nightly-private:jax_rocm7.1_jax_0.7.1_20251215
-# ARG BASE_DOCKER=rocm/jax-training:maxtext-v25.11
+ARG BASE_DOCKER=rocm/jax-training:maxtext-v25.11
 FROM $BASE_DOCKER
 USER root
 ENV WORKSPACE_DIR=/workspace
@@ -65,44 +64,10 @@ RUN pip install \
     typeguard==2.13.3 \
     qwix==0.1.5 --no-deps
 
-#Download MaxDiffusion
-# RUN cd ${WORKSPACE_DIR} && \
-#     git clone https://github.com/AI-Hypercomputer/maxdiffusion.git && \
-#     cd maxdiffusion && \
-#     git reset --hard "07b4d29c4a9bbdaafa501299275dcb15b5365034" && \
-#     python3 setup.py develop
-# RUN cd ${WORKSPACE_DIR} && \
-#     git clone https://github.com/cpersson-amd/maxdiffusion.git && \
-#     cd maxdiffusion && \
-#     git reset --hard "07b4d29c4a9bbdaafa501299275dcb15b5365034" && \
-#     python3 setup.py develop
-
 # Display installed packages for verification
 RUN pip list
 
-# libaries for IB fabric
-RUN apt-get update
-RUN apt-get install -y libelf-dev unzip
-RUN apt-get install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core strace libibmad5 libibnetdisc5 ibverbs-providers libibumad-dev libibumad3 libibverbs1 libnl-3-dev libnl-route-3-dev
-
-WORKDIR $WORKSPACE_DIR/
-
-# The drivers should upgrade with each release and match the host version
-RUN wget https://docs.broadcom.com/docs-and-downloads/ethernet-network-adapters/NXE/Thor2/GCA1/bcm5760x_230.2.52.0a.zip
-RUN unzip bcm5760x_230.2.52.0a.zip
-RUN cd bcm5760x_230.2.52.0a/drivers_linux/bnxt_rocelib/ && \
-    results=$(find -name "libbnxt*.tar.gz") && tar -xf $results && \
-    untar_dir=$(find . -maxdepth 1 -type d -name "libbnxt*" ! -name "*.tar.gz" | head -n 1) && \
-    cd $untar_dir && sh autogen.sh && ./configure && make && \
-    find /usr/lib64/ /usr/lib -name "libbnxt_re-rdmav*.so" -exec mv {} {}.inbox \; && \
-    make install all && sudo sh -c "echo /usr/local/lib >> /etc/ld.so.conf" && \
-    sudo ldconfig && \
-    cp -f bnxt_re.driver /etc/libibverbs.d/ && \
-    find . -name "*.so" -exec md5sum {} \; && \
-    BUILT_MD5SUM=$(find . -name "libbnxt_re-rdmav*.so" -exec md5sum {} \; | cut -d " " -f 1) && \
-    echo -e "\n\nmd5sum of the built libbnxt_re is $BUILT_MD5SUM"
 
-RUN ibv_devices
 
 
 
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Required environment variables for wan_multinode_train.sh
+
+# Coordinator node (hostname:IP): core42-4-a08u25:172.29.0.73
+export COORDINATOR_IP=172.29.0.73
+export IMAGE_TAG=your-name-wan-multinode-train:v1
+# Keep MULTI_NODES_LOG_DIR outside SHARED_CODE_BASE_PATH: the whole SHARED_CODE_BASE_PATH is synced during the clean step.
+export MULTI_NODES_LOG_DIR=/home/amd/your_dir/multi_node_log
+export SHARE_DOCKERFILE_PATH=/home/amd/your_dir/maxdiffusion/multi_node/docker/jax_maxdiffusion_wan2.1_train_inference.ubuntu.amd.Dockerfile
+export SHARED_CODE_BASE_PATH=/home/amd/your_dir/maxdiffusion
+export MAXDIFFUSION_DIR_IN_DOCKER=/app/maxdiffusion
+export RUN_NAME=WAN_14B_FSDP8
+export REMOVE_IMAGES=n
+export REGISTRY_USERNAME=""
+export REGISTRY_TOKEN=""
+export CHMOD_RUN=n
+
+# Define node list
+# List the JAX coordinator node first. It is launched before the others so that every node can connect to the JAX coordinator service.
+# core42-4-a08u25:172.29.0.73
+NODES="core42-4-a08u25,core42-1-a08u07,core42-3-a08u19,core42-5-a08u01"
+
+# 1. Clean and sync codebase
+# To also remove Docker images during cleanup, set REMOVE_IMAGES=y above.
+bash wan_multinode_train.sh "$NODES" clean
+
+# 2. Build Docker images (only needed when the Dockerfile changes)
+bash wan_multinode_train.sh "$NODES" build
+
+# 3. Launch training
+bash wan_multinode_train.sh "$NODES" launch
@@ -104,7 +104,7 @@ echo "Building the container image on all nodes"
 srun bash -c '
     MAX_RETRIES=5
     INITIAL_DELAY=30 # seconds
-    MAX_DELAY=1800 # seconds
+    MAX_DELAY=180 # seconds
     RETRY_COUNT=0
 
     while true; do