From 36860c24c4859f0139bb4386ed22f805d0c114c3 Mon Sep 17 00:00:00 2001 From: Huanyu He Date: Thu, 19 Jun 2025 18:02:21 -0700 Subject: [PATCH] fix validate nightly binaries Summary: # context * The nightly binary validation fails at the import smoke test with `ModuleNotFoundError: No module named 'click'`, raised when `fbgemm_gpu/tbe/bench/bench_config.py` imports `click`. * Fix: run `pip install -r requirements.txt` before the import test, replacing the hand-written installs of `importlib-metadata` and `torchmetrics==1.0.3`. * Also quote the workflow `paths` filters and point the script filter at the real file name, `.github/scripts/validate_binaries.sh` (underscore, not hyphen), so that changes to the script trigger the workflow. * Failing log: ``` +++ conda run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec' +++ local cmd=run +++ case "$cmd" in +++ __conda_exe run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec' +++ /opt/conda/bin/conda run -n build_binary python -c 'import torch; import fbgemm_gpu; import torchrec' WARNING: overwriting environment variables set in the machine overwriting variable {'LD_LIBRARY_PATH'} Traceback (most recent call last): File "<string>", line 1, in <module> File "/pytorch/torchrec/torchrec/__init__.py", line 10, in <module> import torchrec.distributed # noqa File "/pytorch/torchrec/torchrec/distributed/__init__.py", line 38, in <module> from torchrec.distributed.model_parallel import DistributedModelParallel # noqa File "/pytorch/torchrec/torchrec/distributed/model_parallel.py", line 18, in <module> from fbgemm_gpu.split_table_batched_embeddings_ops_training import ( File "/opt/conda/envs/build_binary/lib/python3.9/site-packages/fbgemm_gpu/split_table_batched_embeddings_ops_training.py", line 54, in <module> from fbgemm_gpu.tbe.stats import TBEBenchmarkParamsReporter File "/opt/conda/envs/build_binary/lib/python3.9/site-packages/fbgemm_gpu/tbe/stats/__init__.py", line 10, in <module> from .bench_params_reporter import TBEBenchmarkParamsReporter # noqa F401 File "/opt/conda/envs/build_binary/lib/python3.9/site-packages/fbgemm_gpu/tbe/stats/bench_params_reporter.py", line 19, in <module> from fbgemm_gpu.tbe.bench.tbe_data_config import ( File "/opt/conda/envs/build_binary/lib/python3.9/site-packages/fbgemm_gpu/tbe/bench/__init__.py", line 12, in <module> from .bench_config import ( # noqa F401 Traceback (most recent call last): File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 102, in <module> File 
"/opt/conda/envs/build_binary/lib/python3.9/site-packages/fbgemm_gpu/tbe/bench/bench_config.py", line 14, in <module> import click ModuleNotFoundError: No module named 'click' ERROR conda.cli.main_run:execute(47): `conda run python -c import torch; import fbgemm_gpu; import torchrec` failed. (See above for error) main() File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main run_cmd_or_die(f"docker exec -t {container_name} /exec") File "/home/ec2-user/actions-runner/_work/torchrec/torchrec/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}") RuntimeError: Command docker exec -t 96827edf14ff626b7bc16b6cfaa56aa27b4b660029e1fd7755d14bf20a3c4e96 /exec failed with exit code 1 Error: Process completed with exit code 1. ``` Differential Revision: D76875546 --- .github/scripts/validate_binaries.sh | 8 +++----- .github/workflows/validate-nightly-binaries.yml | 12 ++++++------ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/scripts/validate_binaries.sh b/.github/scripts/validate_binaries.sh index 42c382ac9..b92ada91c 100755 --- a/.github/scripts/validate_binaries.sh +++ b/.github/scripts/validate_binaries.sh @@ -73,22 +73,20 @@ conda env config vars set -n ${CONDA_ENV} \ # export PYTORCH_CUDA_PKG="pytorch-cuda=${MATRIX_GPU_ARCH_VERSION}" # fi -conda run -n "${CONDA_ENV}" pip install importlib-metadata - conda run -n "${CONDA_ENV}" pip install torch --index-url "$PYTORCH_URL" # install fbgemm conda run -n "${CONDA_ENV}" pip install fbgemm-gpu --index-url "$PYTORCH_URL" -# install requirements from pypi -conda run -n "${CONDA_ENV}" pip install torchmetrics==1.0.3 - # install tensordict from pypi conda run -n "${CONDA_ENV}" pip install tensordict==0.8.1 # install torchrec conda run -n "${CONDA_ENV}" pip install torchrec --index-url "$PYTORCH_URL" +# install other requirements +conda run -n 
"${CONDA_ENV}" pip install -r requirements.txt + # Run small import test conda run -n "${CONDA_ENV}" python -c "import torch; import fbgemm_gpu; import torchrec" diff --git a/.github/workflows/validate-nightly-binaries.yml b/.github/workflows/validate-nightly-binaries.yml index 6d6369495..0cc067912 100644 --- a/.github/workflows/validate-nightly-binaries.yml +++ b/.github/workflows/validate-nightly-binaries.yml @@ -11,14 +11,14 @@ on: branches: - main paths: - - .github/workflows/validate-nightly-binaries.yml - - .github/workflows/validate-binaries.yml - - .github/scripts/validate-binaries.sh + - '.github/workflows/validate-nightly-binaries.yml' + - '.github/workflows/validate-binaries.yml' + - '.github/scripts/validate_binaries.sh' pull_request: paths: - - .github/workflows/validate-nightly-binaries.yml - - .github/workflows/validate-binaries.yml - - .github/scripts/validate-binaries.sh + - '.github/workflows/validate-nightly-binaries.yml' + - '.github/workflows/validate-binaries.yml' + - '.github/scripts/validate_binaries.sh' jobs: nightly: uses: ./.github/workflows/validate-binaries.yml