MahmoudAshraf97 · jrhe · Dec 15, 2024
diff --git a/.github/workflows/test_run.yml b/.github/workflows/test_run.yml
@@ -10,7 +10,6 @@ on:
 
 jobs:
   build:
-
     strategy:
       fail-fast: false
       matrix:
@@ -24,6 +23,7 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
+
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:
@@ -41,13 +41,13 @@ jobs:
       if: runner.os == 'Windows'
       run: choco install ffmpeg
 
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip cython uv
-        # This is to avoid installing cuda dependencies
-        python -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
-        uv pip install --system -c constraints.txt -r requirements.txt
+    - name: Install uv
+      uses: astral-sh/setup-uv@v4
+      with:
+        version: "0.5.9"
+
+    - name: Install the project
+      run: uv sync
 
     - name: Test running a file
-      run: |
-        python diarize.py -a "./tests/assets/test.opus" --whisper-model tiny.en
+      run: uv run diarize.py -a "./tests/assets/test.opus" --whisper-model tiny.en
diff --git a/README.md b/README.md
@@ -38,16 +38,9 @@ This repository combines Whisper ASR capabilities with Voice Activity Detection
 
 Whisper and NeMo parameters are coded into diarize.py and helpers.py, I will add the CLI arguments to change them later
 ## Installation
-Python >= `3.10` is needed, `3.9` will work but you'll need to manually install the requirements one by one.
+### Install dependencies
 
-`FFMPEG` and `Cython` are needed as prerequisites to install the requirements
-```
-pip install cython
-```
-or
-```
-sudo apt update && sudo apt install cython3
-```
+#### `FFMPEG`
 ```
 # on Ubuntu or Debian
 sudo apt update && sudo apt install ffmpeg
@@ -67,14 +60,32 @@ scoop install ffmpeg
 # on Windows using WinGet (https://github.com/microsoft/winget-cli)
 winget install ffmpeg
 ```
+
+#### `uv` Python package manager
+```
+# on Linux or MacOS
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# on Windows
+powershell -c "irm https://astral.sh/uv/install.ps1 | iex"
+```
+
+See the [uv installation docs](https://docs.astral.sh/uv/getting-started/installation/) for more installation methods.
+
+#### PyTorch
+If you want to use a `torch` and `torchaudio` distribution other than the default one provided by PyPI (which is CUDA 12.4 on Linux, CPU on Windows and CPU/MPS on macOS), you can install it using the pip instructions from [pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/), using `uv pip ...` instead of `pip ...`/`pip3 ...`.
+
+For example, to install torch and torchaudio for CUDA 12.1 on Linux:
 ```
-pip install -c constraints.txt -r requirements.txt
+uv pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
 ```
+
 ## Usage 
 
 ```
-python diarize.py -a AUDIO_FILE_NAME
+uv run diarize.py -a AUDIO_FILE_NAME
 ```
+By default, `uv` will ensure a correct version of Python is used, set up a virtual environment, and install the required Python dependencies. If you wish to customise this behaviour, please see the [uv documentation](https://docs.astral.sh/uv/).
 
 If your system has enough VRAM (>=10GB), you can use `diarize_parallel.py` instead, the difference is that it runs NeMo in parallel with Whisper, this can be beneficial in some cases and the result is the same since the two models are nondependent on each other. This is still experimental, so expect errors and sharp edges. Your feedback is welcome.
 

diff --git a/constraints.txt b/constraints.txt
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,26 @@
+[project]
+name = "whisper-diarization"
+version = "0.1.0"
+description = "Speaker Diarization pipeline based on OpenAI Whisper"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "ctc-forced-aligner",
+    "deepmultilingualpunctuation",
+    "demucs",
+    "wget",
+    "nemo_toolkit[asr]==2.0.0rc0",
+    "nltk",
+    "faster-whisper>=1.1.0",
+]
+
+[tool.uv.sources]
+ctc-forced-aligner = { git = "https://github.com/MahmoudAshraf97/ctc-forced-aligner.git" }
+deepmultilingualpunctuation = { git = "https://github.com/oliverguhr/deepmultilingualpunctuation.git" }
+demucs = { git = "https://github.com/MahmoudAshraf97/demucs.git" }
+
+[tool.uv]
+constraint-dependencies = [
+    "huggingface_hub<0.24",
+    "numpy<2",
+]
diff --git a/requirements.txt b/requirements.txt