Add Quamba CI #6

Workflow file for this run

# This workflow installs Python dependencies, builds Quamba and its 3rd-party dependencies, and runs the tests with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Quamba CI
on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
permissions:
  contents: read
jobs:
  build:
    runs-on: self-hosted
    # Use the specified Docker image
    container:
      image: hychiang/quamba-cuda-12.1:latest
      options: --gpus all  # Enable GPU access if needed
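      # Optional (not enabled): since the test steps pin CUDA_VISIBLE_DEVICES=7,
      # the container could instead be restricted to that GPU at the Docker level:
      # options: --gpus '"device=7"'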
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      # We run with Docker, so we don't need the pip cache:
      # - name: Cache pip dependencies
      #   uses: actions/cache@v3
      #   with:
      #     path: ~/.cache/pip
      #     key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
      #     restore-keys: |
      #       ${{ runner.os }}-pip-
      # We run with the Python in the Docker image, so we don't need these steps to set up Python:
      # - name: Set up Python 3.10
      #   uses: actions/setup-python@v3
      #   with:
      #     python-version: "3.10"
      # - name: Create virtual environment
      #   run: |
      #     python -m venv venv
      #     source venv/bin/activate
      # - name: Configure include path
      #   run: |
      #     CPATH=$pythonLocation/include/python3.10
      #     echo "CPATH=$CPATH" >> $GITHUB_ENV
      - name: Use system Python
        shell: bash  # `source` is not available in the default `sh`, which causes a `source: not found` error
        run: |
          python --version
          python -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
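          # Note: `source venv/bin/activate` only affects this step's shell.
          # To reuse the venv in later steps, one option (sketch, not used here)
          # is to put it on the PATH for the rest of the job:
          # echo "$PWD/venv/bin" >> "$GITHUB_PATH"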
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install wheel flake8 pytest
      # - name: Lint with flake8
      #   run: |
      #     # stop the build if there are Python syntax errors or undefined names
      #     flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
      #     # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
      #     flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      # Megatron-LM forces an install of PyTorch 2.6.0 and CUDA libs (unclear why),
      # so we install Megatron-LM first and then run `pip install -r requirements.txt`
      # again here to restore the versions pinned there.
      - name: Build 3rdparty dependencies
        run: |
          pip install -e 3rdparty/Megatron-LM
          pip install -r requirements.txt
          export FAST_HADAMARD_TRANSFORM_FORCE_BUILD=TRUE
          pip install -e 3rdparty/fast-hadamard-transform --no-build-isolation
          export MAMBA_FORCE_BUILD=TRUE
          pip install -e 3rdparty/mamba
          bash build_cutlass.sh
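          # Note: `export` is scoped to this step. If a later step needed these
          # variables, they would have to go through $GITHUB_ENV instead, e.g.:
          # echo "MAMBA_FORCE_BUILD=TRUE" >> "$GITHUB_ENV"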
      - name: Check Environment
        run: |
          pwd
          find /usr/include -name "Python.h"
          which python
          which pip
          pip list
          python -c "import torch; print(torch.__version__, torch.cuda.is_available());"
          python -c "from sysconfig import get_paths; info = get_paths(); print(info)"
          python -c "import fast_hadamard_transform; print(fast_hadamard_transform.__version__);"
          python -c "import mamba_ssm; print(mamba_ssm.__version__);"
          python -c "import megatron.core; print(megatron.core.__version__);"
      - name: Build Quamba
        run: |
          # Remove stale build artifacts before the editable install
          rm -f *.so
          rm -rf build
          rm -rf quamba.egg-info
          pip install -e .
      - name: Test with pytest
        run: |
          export CUDA_VISIBLE_DEVICES=7
          pytest quamba/tests --disable-warnings -v
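          # To debug a single failing test, pytest's keyword filter can be used
          # (sketch; substitute a real test name):
          # pytest quamba/tests -k "<test_name>" --disable-warnings -v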
      # Test model quantization with generate.py and store the quantized models
      - name: Test generate.py
        run: |
          export CUDA_VISIBLE_DEVICES=7
          python generate.py state-spaces/mamba-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 8 --a_bits 8
          python generate.py state-spaces/mamba-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 8 --apply_gptq
          python generate.py state-spaces/mamba-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 16 --apply_gptq
          python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 8 --a_bits 8 --group_heads
          python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 8 --apply_gptq --group_heads
          python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --a_bits 16 --apply_gptq
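          # (Sketch, assuming bash) the six commands above differ only in model and
          # quantization flags; the shared flags could be factored into an array:
          # COMMON=(--prompt "My cat wrote all this CUDA code for a new language model and"
          #         --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph
          #         --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head)
          # python generate.py state-spaces/mamba-130m "${COMMON[@]}" --w_bits 8 --a_bits 8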
      # Test generate.py with the w4ax hybrid model and store the w4ax hybrid models.
      # We hack and apply the mamba2-8B hybrid config (searched_1400_v3.json) to state-spaces/mamba2-130m.
      # - name: Test w4ax hybrid generate.py
      #   run: |
      #     export CUDA_VISIBLE_DEVICES=7
      #     python generate.py state-spaces/mamba2-130m --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/ --quantize --quantize_embedding --quantize_lm_head --w_bits 4 --apply_gptq --group_heads --hybrid_blocks --hybrid_blocks_config configs/hybrid/mamba2-8b/searched_1400_v3.json
      # Test loading the stored quantized models with generate.py
      - name: Test loading quantized models
        run: |
          export CUDA_VISIBLE_DEVICES=7
          python generate.py ut-enyac/quamba-130m-w8a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
          python generate.py ut-enyac/quamba-130m-w4a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
          python generate.py ut-enyac/quamba-130m-w4a16 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
          python generate.py ut-enyac/quamba2-130m-w8a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
          python generate.py ut-enyac/quamba2-130m-w4a8 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
          python generate.py ut-enyac/quamba2-130m-w4a16 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
      # Test loading the stored w4ax hybrid model with generate.py.
      # We hack and apply the mamba2-8B hybrid config (searched_1400_v3.json) to state-spaces/mamba2-130m.
      # - name: Test loading w4ax hybrid generate.py
      #   run: |
      #     export CUDA_VISIBLE_DEVICES=7
      #     python generate.py ut-enyac/quamba2-130m-w4aX-searched_1400_v3 --prompt "My cat wrote all this CUDA code for a new language model and" --topp 0.9 --temperature 0.7 --repetition_penalty 1.2 --cache_graph --pretrained_dir pretrained_models/
      - name: Clean up pretrained models
        run: |
          rm -rf pretrained_models/ut-enyac/*