add hybrid model and update readme #21
Workflow file for this run
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Quamba CI

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:
    runs-on: self-hosted
    # Use the specified Docker image
    container:
      image: hychiang/quamba-cuda-12.1:latest
      options: --gpus all --shm-size=64G --memory=128g --memory-swap=128g
    steps:
    - uses: actions/checkout@v4
      with:
        submodules: recursive
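    # Note: the recursive checkout presumably pulls in the 3rdparty/ submodules
    # (Megatron-LM, fast-hadamard-transform, mamba) that are built in the steps below.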
    # - name: Set up Python 3.10
    #   uses: actions/setup-python@v3
    #   with:
    #     python-version: "3.10"
    # - name: Cache pip dependencies
    #   uses: actions/cache@v3
    #   with:
    #     path: ~/.cache/pip
    #     key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
    #     restore-keys: |
    #       ${{ runner.os }}-pip-
    # - name: Create virtual environment
    #   run: |
    #     python -m venv venv
    #     source venv/bin/activate
    # - name: Configure include path
    #   run: |
    #     CPATH=$pythonLocation/include/python3.10
    #     echo "CPATH=$CPATH" >> $GITHUB_ENV
    - name: Use system Python
      shell: bash # ensures support for `source`
      run: |
        python --version
        python -m venv venv
        source venv/bin/activate
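    # Note: `source venv/bin/activate` only applies to this step's shell; each
    # `run:` block starts a fresh shell, so later steps fall back to the system
    # Python. A minimal sketch of how the venv could be kept active across
    # steps (not used here) is to prepend its bin/ directory to $GITHUB_PATH:
    # - name: Put venv on PATH for later steps
    #   run: echo "$PWD/venv/bin" >> $GITHUB_PATH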
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install wheel flake8 pytest
    # - name: Lint with flake8
    #   run: |
    #     # stop the build if there are Python syntax errors or undefined names
    #     flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
    #     # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
    #     flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    # Megatron-LM forces an install of PyTorch 2.5.1 and the CUDA libs (reason unclear),
    # so we install Megatron-LM first and then run `pip install -r requirements.txt`
    - name: Build 3rdparty dependencies
      run: |
        pip install -e 3rdparty/Megatron-LM
        pip install -r requirements.txt
        export FAST_HADAMARD_TRANSFORM_FORCE_BUILD=TRUE
        pip install -e 3rdparty/fast-hadamard-transform --no-build-isolation
        export MAMBA_FORCE_BUILD=TRUE
        pip install -e 3rdparty/mamba
        bash build_cutlass.sh
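    # An untested alternative to the install-order workaround above: skip
    # Megatron-LM's dependency resolution and let requirements.txt pin the
    # torch/CUDA versions instead, e.g.
    #   pip install --no-deps -e 3rdparty/Megatron-LM
    #   pip install -r requirements.txt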
    - name: Check Environment
      run: |
        pwd
        find /usr/include -name "Python.h"
        which python
        which pip
        pip list
        python -c "import torch; print(torch.__version__, torch.cuda.is_available());"
        python -c "from sysconfig import get_paths; info = get_paths(); print(info)"
        python -c "import fast_hadamard_transform; print(fast_hadamard_transform.__version__);"
        python -c "import mamba_ssm; print(mamba_ssm.__version__);"
        python -c "import megatron.core; print(megatron.core.__version__);"
    - name: Build Quamba
      run: |
        # remove stale build artifacts from previous runs before reinstalling
        rm -f *.so
        rm -rf build
        rm -rf quamba.egg-info
        pip install -e .
    - name: Test with pytest
      run: |
        # pin the job to GPU 7 on the multi-GPU self-hosted runner
        export CUDA_VISIBLE_DEVICES=7
        pytest quamba/tests --disable-warnings -v
    # test model quantization with generate.py and store the quantized models
    - name: Test generate.py
      run: |
        export CUDA_VISIBLE_DEVICES=7
        python generate.py state-spaces/mamba-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 8 --a_bits 8
        python generate.py state-spaces/mamba-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 8 --apply_gptq
        python generate.py state-spaces/mamba-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 16 --apply_gptq
        python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 8 --a_bits 8 --group_heads
        python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 8 --apply_gptq --group_heads
        python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 16 --apply_gptq
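    # Note: the quantized checkpoints are written under pretrained_models/ and
    # are reloaded below by their ut-enyac/quamba-130m-w{W}a{A} and
    # ut-enyac/quamba2-130m-w{W}a{A} names.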
    # test generate.py with the w4ax hybrid model and store the w4ax hybrid models
    # as a hack, we apply the mamba2-8B hybrid config (hybrid_blocks_config.json) to state-spaces/mamba2-130m
    - name: Test generate.py with w4ax hybrid model
      run: |
        export CUDA_VISIBLE_DEVICES=7
        python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --apply_gptq --group_heads --hybrid_blocks --hybrid_blocks_config configs/hybrid/mamba2-8b/hybrid_blocks_config.json
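    # Note: this presumably stores the hybrid checkpoint that is reloaded below
    # as ut-enyac/quamba2-130m-w4aX-hybrid_blocks_config.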
    # test loading the stored quantized models with generate.py
    - name: Test loading quantized models
      run: |
        export CUDA_VISIBLE_DEVICES=7
        python generate.py ut-enyac/quamba-130m-w8a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
        python generate.py ut-enyac/quamba-130m-w4a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
        python generate.py ut-enyac/quamba-130m-w4a16 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
        python generate.py ut-enyac/quamba2-130m-w8a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
        python generate.py ut-enyac/quamba2-130m-w4a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
        python generate.py ut-enyac/quamba2-130m-w4a16 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
    # test loading the stored w4ax hybrid model with generate.py
    # as above, this relies on the mamba2-8B hybrid config (hybrid_blocks_config.json) applied to state-spaces/mamba2-130m
    - name: Test loading w4ax hybrid model
      run: |
        export CUDA_VISIBLE_DEVICES=7
        python generate.py ut-enyac/quamba2-130m-w4aX-hybrid_blocks_config --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
    - name: Clean up pretrained models
      run: |
        rm -rf pretrained_models/ut-enyac/*