Add Quamba CI #6

Workflow file for this run

# This workflow installs Python dependencies, builds Quamba and its 3rd-party dependencies, and runs the tests with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Quamba CI
on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
permissions:
  contents: read
jobs:
  build:
    runs-on: self-hosted
    # Use the specified Docker image
    container:
      image: hychiang/quamba-cuda-12.1:latest
      options: --gpus all  # Enable GPU access if needed
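      # Optional (not enabled): since the test steps pin CUDA_VISIBLE_DEVICES=7,
      # the container could instead be restricted to that GPU at the Docker level:
      # options: --gpus '"device=7"'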
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      # We run with Docker, so we don't need the pip cache:
      # - name: Cache pip dependencies
      #   uses: actions/cache@v3
      #   with:
      #     path: ~/.cache/pip
      #     key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
      #     restore-keys: |
      #       ${{ runner.os }}-pip-
      # We run with the Python in the Docker image, so we don't need these steps to set up Python:
      # - name: Set up Python 3.10
      #   uses: actions/setup-python@v3
      #   with:
      #     python-version: "3.10"
      # - name: Create virtual environment
      #   run: |
      #     python -m venv venv
      #     source venv/bin/activate
      # - name: Configure include path
      #   run: |
      #     CPATH=$pythonLocation/include/python3.10
      #     echo "CPATH=$CPATH" >> $GITHUB_ENV
      - name: Use system Python
        shell: bash  # `source` is not available in the default `sh`, which causes a `source: not found` error
        run: |
          python --version
          python -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
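          # Note: `source venv/bin/activate` only affects this step's shell.
          # To reuse the venv in later steps, one option (sketch, not used here)
          # is to put it on the PATH for the rest of the job:
          # echo "$PWD/venv/bin" >> "$GITHUB_PATH"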
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install wheel flake8 pytest
      # - name: Lint with flake8
      #   run: |
      #     # stop the build if there are Python syntax errors or undefined names
      #     flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
      #     # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
      #     flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      # Megatron-LM forces an install of PyTorch 2.6.0 and CUDA libs (unclear why),
      # so we install Megatron-LM first and then run `pip install -r requirements.txt`
      # again here to restore the versions pinned there.
      - name: Build 3rdparty dependencies
        run: |
          pip install -e 3rdparty/Megatron-LM
          pip install -r requirements.txt
          export FAST_HADAMARD_TRANSFORM_FORCE_BUILD=TRUE
          pip install -e 3rdparty/fast-hadamard-transform --no-build-isolation
          export MAMBA_FORCE_BUILD=TRUE
          pip install -e 3rdparty/mamba
          bash build_cutlass.sh
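          # Note: `export` is scoped to this step. If a later step needed these
          # variables, they would have to go through $GITHUB_ENV instead, e.g.:
          # echo "MAMBA_FORCE_BUILD=TRUE" >> "$GITHUB_ENV"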
      - name: Check Environment
        run: |
          pwd
          find /usr/include -name "Python.h"
          which python
          which pip
          pip list
          python -c "import torch; print(torch.__version__, torch.cuda.is_available());"
          python -c "from sysconfig import get_paths; info = get_paths(); print(info)"
          python -c "import fast_hadamard_transform; print(fast_hadamard_transform.__version__);"
          python -c "import mamba_ssm; print(mamba_ssm.__version__);"
          python -c "import megatron.core; print(megatron.core.__version__);"
      - name: Build Quamba
        run: |
          # Remove stale build artifacts before the editable install
          rm -f *.so
          rm -rf build
          rm -rf quamba.egg-info
          pip install -e .
      - name: Test with pytest
        run: |
          export CUDA_VISIBLE_DEVICES=7
          pytest quamba/tests --disable-warnings -v
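          # To debug a single failing test, pytest's keyword filter can be used
          # (sketch; substitute a real test name):
          # pytest quamba/tests -k "<test_name>" --disable-warnings -v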
      # Test model quantization with generate.py and store the quantized models
      - name: Test generate.py
        run: |
          export CUDA_VISIBLE_DEVICES=7
          python generate.py state-spaces/mamba-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 8 --a_bits 8
          python generate.py state-spaces/mamba-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 8 --apply_gptq
          python generate.py state-spaces/mamba-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 16 --apply_gptq
          python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 8 --a_bits 8 --group_heads
          python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 8 --apply_gptq --group_heads
          python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 16 --apply_gptq
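          # (Sketch, assuming bash) the six commands above differ only in model and
          # quantization flags; the shared flags could be factored into an array:
          # COMMON=(--prompt "My cat wrote all this CUDA code for a new language model and"
          #         --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph
          #         --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head)
          # python generate.py state-spaces/mamba-130m "${COMMON[@]}" --w_bits 8 --a_bits 8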
      # Test generate.py with the w4ax hybrid model and store the w4ax hybrid models.
      # We hack and apply the mamba2-8B hybrid config (searched_1400_v3.json) to state-spaces/mamba2-130m.
      # - name: Test w4ax hybrid generate.py
      #   run: |
      #     export CUDA_VISIBLE_DEVICES=7
      #     python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --apply_gptq --group_heads --hybrid_blocks --hybrid_blocks_config configs/hybrid/mamba2-8b/searched_1400_v3.json
      # Test loading the stored quantized models with generate.py
      - name: Test loading quantized models
        run: |
          export CUDA_VISIBLE_DEVICES=7
          python generate.py ut-enyac/quamba-130m-w8a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
          python generate.py ut-enyac/quamba-130m-w4a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
          python generate.py ut-enyac/quamba-130m-w4a16 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
          python generate.py ut-enyac/quamba2-130m-w8a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
          python generate.py ut-enyac/quamba2-130m-w4a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
          python generate.py ut-enyac/quamba2-130m-w4a16 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
      # Test loading the stored w4ax hybrid model with generate.py.
      # We hack and apply the mamba2-8B hybrid config (searched_1400_v3.json) to state-spaces/mamba2-130m.
      # - name: Test loading w4ax hybrid generate.py
      #   run: |
      #     export CUDA_VISIBLE_DEVICES=7
      #     python generate.py ut-enyac/quamba2-130m-w4aX-searched_1400_v3 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
      - name: Clean up pretrained models
        run: |
          rm -rf pretrained_models/ut-enyac/*