Skip to content

Update distributed example tests in run_python_examples.sh #1250

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
May 3, 2024
42 changes: 8 additions & 34 deletions run_distributed_examples.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,8 @@
# to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files.
# Expects pytorch, torchvision to be installed.

# Absolute directory containing this script. Resolved with cd + pwd so the
# result is valid for relative AND absolute invocations (prefixing the current
# directory to an absolute dirname would produce a non-existent path).
# CDPATH is cleared so cd neither searches elsewhere nor prints to stdout.
BASE_DIR="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"
# Example list taken from the first argument with all whitespace removed.
# Pure parameter expansion: the raw argument is never word-split or globbed.
EXAMPLES=${1//[[:space:]]/}

# Shadow `python` with a shell function so every call in this script runs the
# python3 executable; `command` bypasses this function and avoids recursion.
function python {
  command python3 "${@}"
}
# Absolute directory containing this script. Resolved with cd + pwd so the
# result is valid for relative AND absolute invocations (prefixing the current
# directory to an absolute dirname would produce a non-existent path).
# CDPATH is cleared so cd neither searches elsewhere nor prints to stdout.
BASE_DIR="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"
# Load utils.sh from the script's own directory; quoted so that a directory
# name containing spaces is passed to `source` as a single path.
source "$BASE_DIR/utils.sh"

USE_CUDA=$(python -c "import torch; print(torch.cuda.is_available())")
case $USE_CUDA in
Expand All @@ -35,33 +30,12 @@ case $USE_CUDA in
;;
esac

# Accumulated failure messages; each entry is prefixed with the two-character
# sequence "\n" (backslash, n), not a real newline.
ERRORS=""

# Record a failure and report it immediately.
# Globals:   ERRORS (appended to)
# Arguments: $1 - failure message
# Outputs:   the message, verbatim, on stdout
function error() {
  local message=$1
  ERRORS="$ERRORS\n$message"
  # printf '%s' prints the message exactly: no word-splitting, no glob
  # expansion, and no special treatment of messages that look like echo options.
  printf '%s\n' "$message"
}

# Install the union of every example's requirements with pip.
# Globals:   BASE_DIR (read)
# Outputs:   progress line on stdout, plus whatever pip prints
# Exits:     status 1 (after calling error) when pip fails
function install_deps() {
  echo "installing requirements"
  # Merge all requirements files, de-duplicate, and drop the bare `torch`
  # line: the already-installed torch is what is under test, so it must not be
  # reinstalled. BASE_DIR is quoted so paths containing spaces still glob.
  cat "$BASE_DIR"/*/requirements.txt \
    | sort -u \
    | grep -vE '^torch$' \
    | pip install -r /dev/stdin \
    || { error "failed to install dependencies"; exit 1; }
}

# Enter the directory of the example being run and announce it.
# The example name is the name of the calling function (FUNCNAME[1]).
# Globals:   BASE_DIR (read), EXAMPLE (written)
# Outputs:   "Running example: <name>" on stdout
# Returns:   non-zero, without the announcement, if the directory cannot be entered
function start() {
  EXAMPLE=${FUNCNAME[1]}
  # Quoted so a BASE_DIR containing spaces stays a single argument.
  cd "$BASE_DIR/$EXAMPLE" || return
  echo "Running example: $EXAMPLE"
}

# Run the distributed examples in sequence. Every step records its own failure
# through error() so one failing example does not stop the ones after it.
function distributed() {
  start
  # Previously a torchrun failure was silently ignored; record it like the others.
  torchrun --standalone --nnodes=1 --nproc_per_node=4 tensor_parallelism/fsdp_tp_example.py || error "fsdp_tp torchrun example failed"
  bash tensor_parallelism/run_example.sh tensor_parallelism/tensor_parallel_example.py || error "tensor parallel example failed"
  bash tensor_parallelism/run_example.sh tensor_parallelism/sequence_parallel_example.py || error "sequence parallel example failed"
  bash tensor_parallelism/run_example.sh tensor_parallelism/fsdp_tp_example.py || error "2D parallel example failed"
  python ddp/main.py || error "ddp example failed"
}

function clean() {
Expand All @@ -88,8 +62,8 @@ fi
if [ "" == "$ERRORS" ]; then
echo "Completed successfully with status $?"
else
echo "Some examples failed:"
printf "$ERRORS"
echo "Some distributed examples failed:"
printf "$ERRORS\n"
#Exit with error (0-255) in case of failure in one of the tests.
exit 1

Expand Down
49 changes: 6 additions & 43 deletions run_python_examples.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,8 @@
# to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files.
# Expects pytorch, torchvision to be installed.

# Absolute directory containing this script. Resolved with cd + pwd so the
# result is valid for relative AND absolute invocations (prefixing the current
# directory to an absolute dirname would produce a non-existent path).
# CDPATH is cleared so cd neither searches elsewhere nor prints to stdout.
BASE_DIR="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"
# Example list taken from the first argument with all whitespace removed.
# Pure parameter expansion: the raw argument is never word-split or globbed.
EXAMPLES=${1//[[:space:]]/}

# Shadow `python` with a shell function so every call in this script runs the
# python3 executable; `command` bypasses this function and avoids recursion.
function python {
  command python3 "${@}"
}
# Absolute directory containing this script. Resolved with cd + pwd so the
# result is valid for relative AND absolute invocations (prefixing the current
# directory to an absolute dirname would produce a non-existent path).
# CDPATH is cleared so cd neither searches elsewhere nor prints to stdout.
BASE_DIR="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"
# Load utils.sh from the script's own directory; quoted so that a directory
# name containing spaces is passed to `source` as a single path.
source "$BASE_DIR/utils.sh"

USE_CUDA=$(python -c "import torchvision, torch; print(torch.cuda.is_available())")
case $USE_CUDA in
Expand All @@ -35,43 +30,11 @@ case $USE_CUDA in
;;
esac

# Accumulated failure messages; each entry is prefixed with the two-character
# sequence "\n" (backslash, n), not a real newline.
ERRORS=""

# Record a failure and report it immediately.
# Globals:   ERRORS (appended to)
# Arguments: $1 - failure message
# Outputs:   the message, verbatim, on stdout
function error() {
  local message=$1
  ERRORS="$ERRORS\n$message"
  # printf '%s' prints the message exactly: no word-splitting, no glob
  # expansion, and no special treatment of messages that look like echo options.
  printf '%s\n' "$message"
}

# Install the union of every example's requirements with pip.
# Globals:   BASE_DIR (read)
# Outputs:   progress line on stdout, plus whatever pip prints
# Exits:     status 1 (after calling error) when pip fails
function install_deps() {
  echo "installing requirements"
  # Merge all requirements files, de-duplicate, and drop the bare `torch`
  # line: the already-installed torch is what is under test, so it must not be
  # reinstalled. BASE_DIR is quoted so paths containing spaces still glob.
  cat "$BASE_DIR"/*/requirements.txt \
    | sort -u \
    | grep -vE '^torch$' \
    | pip install -r /dev/stdin \
    || { error "failed to install dependencies"; exit 1; }
}

# Enter the directory of the example being run and announce it.
# The example name is the name of the calling function (FUNCNAME[1]).
# Globals:   BASE_DIR (read), EXAMPLE (written)
# Outputs:   "Running example: <name>" on stdout
# Returns:   non-zero, without the announcement, if the directory cannot be entered
function start() {
  EXAMPLE=${FUNCNAME[1]}
  # Quoted so a BASE_DIR containing spaces stays a single argument.
  cd "$BASE_DIR/$EXAMPLE" || return
  echo "Running example: $EXAMPLE"
}

# Run the dcgan example with the fake dataset in dry-run mode and record a
# failure through error() if it exits non-zero.
function dcgan() {
  start
  # CUDA_FLAG is deliberately unquoted: when empty it must vanish from the
  # argument list instead of being passed as an empty argument.
  if ! python main.py --dataset fake $CUDA_FLAG --mps --dry-run; then
    error "dcgan failed"
  fi
}

# Run the four distributed example scripts one after another. A failing script
# is recorded through error() and the remaining scripts still run.
function distributed() {
  start
  if ! python tensor_parallelism/tensor_parallel_example.py; then
    error "tensor parallel example failed"
  fi
  if ! python tensor_parallelism/sequence_parallel_example.py; then
    error "sequence parallel example failed"
  fi
  if ! python tensor_parallelism/fsdp_tp_example.py; then
    error "2D parallel example failed"
  fi
  if ! python ddp/main.py; then
    error "ddp example failed"
  fi
}

function fast_neural_style() {
start
if [ ! -d "saved_models" ]; then
Expand Down Expand Up @@ -223,9 +186,9 @@ function clean() {
}

function run_all() {
# cpp
# cpp moved to `run_cpp_examples.sh`
dcgan
distributed
# distributed moved to `run_distributed_examples.sh`
fast_neural_style
imagenet
language_translation
Expand Down Expand Up @@ -261,8 +224,8 @@ fi
if [ "" == "$ERRORS" ]; then
echo "Completed successfully with status $?"
else
echo "Some examples failed:"
printf "$ERRORS"
echo "Some python examples failed:"
printf "$ERRORS\n"
#Exit with error (0-255) in case of failure in one of the tests.
exit 1

Expand Down
38 changes: 38 additions & 0 deletions utils.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# This script contains utility functions and initializes the example scripts.
# E.g.: run_python_examples.sh, run_distributed_examples.sh

# Absolute directory containing the running script ($0 is still the invoking
# script when this file is sourced). Resolved with cd + pwd so the result is
# valid for relative AND absolute invocations (prefixing the current directory
# to an absolute dirname would produce a non-existent path). CDPATH is cleared
# so cd neither searches elsewhere nor prints to stdout.
BASE_DIR="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"
# Example list taken from the first argument with all whitespace removed.
# Pure parameter expansion: the raw argument is never word-split or globbed.
EXAMPLES=${1//[[:space:]]/}

# Shadow `python` with a shell function so every call made after this file is
# loaded runs the python3 executable; `command` bypasses this function and
# avoids recursion.
function python {
  command python3 "${@}"
}

# Accumulated failure messages. A value already present in ERRORS is kept; the
# variable is only initialised to the empty string when it is unset.
ERRORS=${ERRORS-""}

# Record a failure message without printing it.
# Messages are joined with the two-character sequence "\n" (backslash, n), not
# a real newline; presumably the reporting code expands it via printf.
# Globals:   ERRORS (appended to)
# Arguments: $1 - failure message
function error() {
  # local: the message must not leak into the caller's scope as a global.
  local message=$1
  if [ -z "$ERRORS" ]; then
    ERRORS="$message"
  else
    ERRORS="$ERRORS\n$message"
  fi
}

# Install the union of every example's requirements with pip.
# Globals:   BASE_DIR (read)
# Outputs:   progress line on stdout, plus whatever pip prints; on failure a
#            diagnostic on stderr
# Exits:     status 1 (after calling error) when pip fails
function install_deps() {
  echo "installing requirements"
  # Merge all requirements files, de-duplicate, and drop the bare `torch`
  # line: the already-installed torch is what is under test, so it must not be
  # reinstalled. BASE_DIR is quoted so paths containing spaces still glob.
  cat "$BASE_DIR"/*/requirements.txt \
    | sort -u \
    | grep -vE '^torch$' \
    | pip install -r /dev/stdin \
    || {
      error "failed to install dependencies"
      # error() in this file only records the message; since the script exits
      # right here, print it now or the reason for the exit is never shown.
      printf '%s\n' "failed to install dependencies" >&2
      exit 1
    }
}

# Enter the directory of the example being run and announce it.
# The example name is the name of the calling function (FUNCNAME[1]).
# Globals:   BASE_DIR (read), EXAMPLE (written)
# Outputs:   "Running example: <name>" on stdout
# Returns:   non-zero, without the announcement, if the directory cannot be entered
function start() {
  EXAMPLE=${FUNCNAME[1]}
  # Quoted so a BASE_DIR containing spaces stays a single argument.
  cd "$BASE_DIR/$EXAMPLE" || return
  echo "Running example: $EXAMPLE"
}