
Commit d833a5f

Initial commit
0 parents · commit d833a5f

34 files changed: +5907 −0 lines

.github/workflows/lint.yml

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
name: Format & Lint

on:
  push:
    branches: [main]
  pull_request:

jobs:
  format-and-lint:
    if: github.repository == 'CentML/flexible-inference-bench'
    concurrency:
      group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10"]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt
      - name: Format with black
        run: |
          # stop the build if format is not correct
          echo "Running with " $(pip freeze | grep "black")
          bash scripts/lint/format.sh --check
      - name: Lint with pylint
        run: |
          echo "Running with" $(pip freeze | grep "pylint")
          bash scripts/lint/lint.sh
      - name: Type checking with mypy
        run: |
          echo "Running with" $(pip freeze | grep "mypy")
          bash scripts/lint/mypy.sh

.github/workflows/run-tests.yml

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
name: Unit Tests

on:
  push:
    branches: [main]
  pull_request:

jobs:
  unit-tests:
    if: github.repository == 'CentML/flexible-inference-bench'
    runs-on:
      group: arc-a100-80
    container:
      image: nvidia/cuda:12.1.0-devel-ubuntu22.04
      options: --gpus all
    strategy:
      matrix:
        python-version: ["3.10"]
    steps:
      - name: Fetch repository
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-dev.txt
          pip install -e .

      - name: Run tests
        run: |
          bash scripts/unit_test/test.sh

.gitignore

Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
# Protobuf generated files
python/cserve/protos/*pb2*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# NSYS Files
*.nsys-rep

README.md

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
# Flexible Inference Benchmarker

A modular, extensible LLM inference benchmarking framework that supports multiple serving frameworks and benchmarking paradigms.

This benchmarking framework operates entirely outside of any serving framework and is intended to be fully featured, providing a variety of statistics and profiling modes while remaining easy to extend and modify.

## Installation
```
cd flexible-inference-benchmark
pip install .
```

## Usage
After installing with the instructions above, the benchmarker can be invoked with `inference-benchmark <args>`.

After you have your output (written with `--output-file`), you can invoke one of the data postprocessors in `data_postprocessors`.

### Parameters
| argument | description |
| --- | --- |
| `--seed` | Seed for reproducibility. |
| `--backend` | Backend options: `tgi`, `vllm`, `cserve`, `cserve-debug`, `lmdeploy`, `deepspeed-mii`, `openai`, `openai-chat`, `tensorrt-llm`. <br> **For tensorrt-llm, temperature is set to 0.01, since NGC containers >= 24.06 do not support 0.0.** |
| `--base-url` | Server or API base URL, without the endpoint. |
| `--endpoint` | API endpoint. |
| one of <br> `--num-of-req` **or** <br> `--max-time-for-reqs` | <br> Total number of requests to send. <br> Time window for sending requests **(in seconds)**. |
| `--request-distribution` | Distribution for sending requests, e.g. `exponential 5` (requests follow an exponential distribution with an average of **5 seconds** between requests). Options: <br> `poisson rate` <br> `uniform min_val max_val` <br> `normal mean std` <br> (see the sketch below this table). |
| `--input-token-distribution` | Distribution for prompt length, e.g.: <br> `uniform min_val max_val` <br> `normal mean std`. |
| `--output-token-distribution` | Distribution for output token length, e.g.: <br> `uniform min_val max_val` <br> `normal mean std`. |
| one of: <br> `--prefix-text` <br> `--prefix-len` <br> `--no-prefix` | <br> Text to use as a prefix for all requests. <br> Length of the prefix to use for all requests. <br> No prefix for requests. |
| `--dataset-name` | Name of the dataset to benchmark on: {`sharegpt`, `other`, `random`}. |
| `--dataset-path` | Path to the dataset. |
| `--model` | Name of the model. |
| `--tokenizer` | Name or path of the tokenizer, if not using the default tokenizer. |
| `--disable-tqdm` | Disable the tqdm progress bar. |
| `--best-of` | Number of best completions to return. |
| `--use-beam-search` | Use beam search for completions. |
| `--output-file` | Output JSON file to save the results. |
| `--debug` | Log debug messages. |
| `--disable-ignore-eos` | Disables the ignore-EOS behavior. <br> **Note:** not a valid argument for TensorRT-LLM. |
| `--disable-stream` | Send requests with `stream: false` (for APIs without a streaming option). |
| `--cookies` | Include cookies in the request. |
| `--config-file` | Path to a configuration file. |

**For ease of use, we recommend passing a configuration file with all the required parameters for your use case. Examples are provided in `examples/`.**
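
The distribution specs above only describe how values are sampled; the benchmarker's own implementation is not shown here. As a rough illustration of what `--request-distribution` specs such as `exponential 5`, `uniform 1 3`, or `normal 2 0.5` mean, the sketch below samples inter-request gaps with numpy (the `sample_gaps` helper and the example specs are made up for this illustration):

```
# Illustrative sketch only -- not the benchmarker's code.
# Shows what specs like "exponential 5", "uniform 1 3", or "normal 2 0.5"
# mean when interpreted as inter-request gaps in seconds.
import numpy as np

rng = np.random.default_rng(0)

def sample_gaps(spec: str, n: int) -> np.ndarray:
    """Sample n inter-request gaps (seconds) from a distribution spec string."""
    name, *params = spec.split()
    p = [float(x) for x in params]
    if name == "exponential":  # p[0] = average gap between requests
        return rng.exponential(scale=p[0], size=n)
    if name == "uniform":      # p = [min_val, max_val]
        return rng.uniform(p[0], p[1], size=n)
    if name == "normal":       # p = [mean, std]; clip so gaps stay non-negative
        return np.clip(rng.normal(p[0], p[1], size=n), 0.0, None)
    raise ValueError(f"unknown distribution: {name}")

for spec in ("exponential 5", "uniform 1 3", "normal 2 0.5"):
    gaps = sample_gaps(spec, 20)
    print(f"{spec:>16}: mean gap {gaps.mean():.2f}s for 20 requests")
```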

### Output
The output JSON file is an array of objects with the following fields:
* `backend`: Backend used
* `time`: Total time
* `outputs`:
  * `text`: Generated text
  * `success`: Whether the request was successful
  * `latency`: End-to-end time for the request
  * `ttft`: Time to first token
  * `itl`: Inter-token latency
  * `prompt_len`: Length of the prompt
  * `error`: Error message
* `inputs`: List of `[prompt string, input tokens, expected output tokens]`
* `tokenizer`: Tokenizer name
* `stream`: Whether the stream argument was used

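As a minimal sketch of how this output can be consumed (assuming the layout described above, i.e. an array of run objects whose `outputs` list holds per-request records, and with `results.json` as a placeholder for the file passed to `--output-file`):

```
# Minimal sketch, assuming the output layout described above.
# "results.json" is a placeholder for the file passed via --output-file;
# TTFT and latency are assumed to be reported in seconds.
import json
import statistics

with open("results.json") as f:
    runs = json.load(f)

for run in runs:
    ok = [o for o in run["outputs"] if o.get("success")]
    if not ok:
        print(f"{run.get('backend')}: no successful requests")
        continue
    mean_ttft = statistics.mean(o["ttft"] for o in ok)
    mean_latency = statistics.mean(o["latency"] for o in ok)
    print(f"backend={run.get('backend')} requests={len(ok)} "
          f"mean TTFT={mean_ttft * 1000:.1f} ms "
          f"mean latency={mean_latency:.2f} s")
```
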
### Data Postprocessors
Below is a description of the data postprocessors.

#### `performance.py`
Prints the following summary for a given run, in the same format as vLLM's benchmark script.

```
============ Serving Benchmark Result ============
Successful requests: 20
Benchmark duration (s): 19.39
Total input tokens: 407
Total generated tokens: 5112
Request throughput (req/s): 1.03
Input token throughput (tok/s): 20.99
Output token throughput (tok/s): 263.66
---------------Time to First Token----------------
Mean TTFT (ms): 24.66
Median TTFT (ms): 24.64
P99 TTFT (ms): 34.11
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 2295.86
Median TPOT (ms): 2362.54
P99 TPOT (ms): 2750.76
==================================================
```

Supports the following args:

| argument | description |
| --- | --- |
| `--datapath` | Path to the output JSON file produced by the benchmark. |

#### `itl.py`

Returns a plot of inter-token latencies for a specific request. Takes the following args:

| argument | description |
| --- | --- |
| `--datapath` | Path to the output JSON file produced by the benchmark. |
| `--output` | Path to save the figure (any format supported by matplotlib). |
| `--request-num` | Which request to produce the ITL plot for. |

#### `ttft.py`

Generates a simple CDF plot of **time to first token** across requests. You can pass a single file or a list of files generated by the benchmark to make a comparison.

| argument | description |
| --- | --- |
| `--files` | File(s) to generate the plot from. |

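For reference, the sketch below shows roughly how such a CDF can be produced by hand. It is an illustration, not the actual `ttft.py` script, and it assumes the output layout from the Output section (file names are placeholders):

```
# Illustrative sketch of a TTFT CDF plot -- not the actual ttft.py script.
# Assumes the output layout from the Output section; file names are placeholders.
import json
import numpy as np
import matplotlib.pyplot as plt

files = ["vllm-benchmark.json"]  # pass several files to compare runs

for path in files:
    with open(path) as f:
        runs = json.load(f)
    ttfts = [o["ttft"] for run in runs for o in run["outputs"] if o.get("success")]
    xs = np.sort(ttfts)
    ys = np.arange(1, len(xs) + 1) / len(xs)  # empirical CDF
    plt.plot(xs, ys, label=path)

plt.xlabel("Time to first token")
plt.ylabel("Fraction of requests")
plt.legend()
plt.savefig("ttft_cdf.png")
```
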
## Example

Let's take vLLM as the backend for our benchmark.
You can install vLLM with: <br>
`pip install vllm`

We will use gpt2 as the model: <br>
`python -m vllm.entrypoints.openai.api_server --model gpt2`

Once the backend is up and running, go to the `examples` folder and run the benchmark using the `vllm_args.json` config file: <br>
`cd examples` <br>
`inference-benchmark --config-file vllm_args.json --output-file vllm-benchmark.json`

Then go to the `data_postprocessors` folder and check the performance with `performance.py`: <br>
`cd ../data_postprocessors` <br>
`python performance.py --datapath ../examples/vllm-benchmark.json`

```
============ Serving Benchmark Result ============
Successful requests: 20
Benchmark duration (s): 4.15
Total input tokens: 3836
Total generated tokens: 4000
Request throughput (req/s): 4.82
Input token throughput (tok/s): 925.20
Output token throughput (tok/s): 964.76
---------------Time to First Token----------------
Mean TTFT (ms): 19.91
Median TTFT (ms): 22.11
P99 TTFT (ms): 28.55
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 6.73
Median TPOT (ms): 7.96
P99 TPOT (ms): 8.41
---------------Inter-token Latency----------------
Mean ITL (ms): 6.73
Median ITL (ms): 7.40
P99 ITL (ms): 20.70
==================================================
```
