18 changes: 18 additions & 0 deletions .github/workflows/lint_and_format.yml
@@ -0,0 +1,18 @@
name: Ruff
on: pull_request
jobs:
  lint:
    name: Lint, Format, and Commit
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: chartboost/ruff-action@v1
        name: Lint
        with:
          version: 0.3.5
          args: "check --output-format=full --statistics"
      - uses: chartboost/ruff-action@v1
        name: Format
        with:
          version: 0.3.5
          args: "format --check"
7 changes: 7 additions & 0 deletions README.md
@@ -255,3 +255,10 @@ If you would like to contribute to this project, we recommend following the "for
5. Submit a **Pull request** so that we can review your changes

NOTE: Be sure to merge the latest from "upstream" before making a pull request!

### Checklist Before Pull Request (Optional)

1. Use `ruff check --fix` to check and fix lint errors
2. Use `ruff format` to apply formatting

NOTE: Ruff linting and formatting checks run automatically via GitHub Actions when a PR is raised. Before raising a PR, it is good practice to check and fix lint errors and to apply formatting locally.
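For contributors who want these checks to run automatically before each commit, a minimal sketch of a `.pre-commit-config.yaml` using the `astral-sh/ruff-pre-commit` hooks is shown below. This file is not part of this PR, and the pinned `rev` is an assumption chosen to mirror the 0.3.5 version used in CI:

```yaml
# Hypothetical .pre-commit-config.yaml (not part of this PR).
# Enable it locally with: pre-commit install
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.3.5  # assumed to match the Ruff version pinned in CI
    hooks:
      - id: ruff         # runs `ruff check`
        args: [--fix]
      - id: ruff-format  # runs `ruff format`
```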
14 changes: 5 additions & 9 deletions llmtune/data/dataset_generator.py
@@ -1,10 +1,10 @@
import os
from os.path import join, exists
import pickle
import re
from functools import partial
from os.path import exists, join
from typing import Tuple, Union
import pickle

import re
from datasets import Dataset

from llmtune.data.ingestor import Ingestor, get_ingestor
@@ -61,12 +61,8 @@ def _format_one_prompt(self, example, is_test: bool = False):
return example

def _format_prompts(self):
self.dataset["train"] = self.dataset["train"].map(
partial(self._format_one_prompt, is_test=False)
)
self.dataset["test"] = self.dataset["test"].map(
partial(self._format_one_prompt, is_test=True)
)
self.dataset["train"] = self.dataset["train"].map(partial(self._format_one_prompt, is_test=False))
self.dataset["test"] = self.dataset["test"].map(partial(self._format_one_prompt, is_test=True))

def get_dataset(self) -> Tuple[Dataset, Dataset]:
self._train_test_split()
9 changes: 3 additions & 6 deletions llmtune/data/ingestor.py
@@ -1,9 +1,8 @@
import csv
from abc import ABC, abstractmethod
from functools import partial

import ijson
import csv
from datasets import Dataset, load_dataset, concatenate_datasets
from datasets import Dataset, concatenate_datasets, load_dataset


def get_ingestor(data_type: str):
@@ -14,9 +13,7 @@ def get_ingestor(data_type: str):
elif data_type == "huggingface":
return HuggingfaceIngestor
else:
raise ValueError(
f"'type' must be one of 'json', 'csv', or 'huggingface', you have {data_type}"
)
raise ValueError(f"'type' must be one of 'json', 'csv', or 'huggingface', you have {data_type}")


class Ingestor(ABC):
33 changes: 13 additions & 20 deletions llmtune/finetune/lora.py
@@ -1,31 +1,26 @@
from os.path import join, exists
from typing import Tuple

import torch
from os.path import join

import bitsandbytes as bnb
import torch
from datasets import Dataset
from peft import (
LoraConfig,
get_peft_model,
prepare_model_for_kbit_training,
)
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
BitsAndBytesConfig,
TrainingArguments,
AutoTokenizer,
BitsAndBytesConfig,
ProgressCallback,
)
from peft import (
prepare_model_for_kbit_training,
get_peft_model,
LoraConfig,
TrainingArguments,
)
from trl import SFTTrainer
from rich.console import Console


from llmtune.pydantic_models.config_model import Config
from llmtune.utils.save_utils import DirectoryHelper
from llmtune.finetune.generics import Finetune
from llmtune.pydantic_models.config_model import Config
from llmtune.ui.rich_ui import RichUI
from llmtune.utils.save_utils import DirectoryHelper


class LoRAFinetune(Finetune):
@@ -99,9 +94,7 @@ def _inject_lora(self):
self.model = get_peft_model(self.model, self._lora_config)

if not self.config.accelerate:
self.optimizer = bnb.optim.Adam8bit(
self.model.parameters(), lr=self._training_args.learning_rate
)
self.optimizer = bnb.optim.Adam8bit(self.model.parameters(), lr=self._training_args.learning_rate)
self.lr_scheduler = torch.optim.lr_scheduler.ConstantLR(self.optimizer)
if self.config.accelerate:
self.model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
@@ -132,7 +125,7 @@ def finetune(self, train_dataset: Dataset):
**self._sft_args.model_dump(),
)

trainer_stats = self._trainer.train()
self._trainer.train()

def save_model(self) -> None:
self._trainer.model.save_pretrained(self._weights_path)
46 changes: 14 additions & 32 deletions llmtune/inference/lora.py
@@ -1,20 +1,18 @@
import csv
import os
from os.path import join
from threading import Thread
import csv

from transformers import TextIteratorStreamer
from rich.text import Text
import torch
from datasets import Dataset
from transformers import AutoTokenizer, BitsAndBytesConfig
from peft import AutoPeftModelForCausalLM
import torch

from rich.text import Text
from transformers import AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

from llmtune.pydantic_models.config_model import Config
from llmtune.utils.save_utils import DirectoryHelper
from llmtune.inference.generics import Inference
from llmtune.pydantic_models.config_model import Config
from llmtune.ui.rich_ui import RichUI
from llmtune.utils.save_utils import DirectoryHelper


# TODO: Add type hints please!
@@ -35,9 +33,7 @@ def __init__(
self.device_map = self.config.model.device_map
self._weights_path = dir_helper.save_paths.weights

self.model, self.tokenizer = self._get_merged_model(
dir_helper.save_paths.weights
)
self.model, self.tokenizer = self._get_merged_model(dir_helper.save_paths.weights)

def _get_merged_model(self, weights_path: str):
# purge VRAM
@@ -47,20 +43,14 @@ def _get_merged_model(self, weights_path: str):
dtype = (
torch.float16
if self.config.training.training_args.fp16
else (
torch.bfloat16
if self.config.training.training_args.bf16
else torch.float32
)
else (torch.bfloat16 if self.config.training.training_args.bf16 else torch.float32)
)

self.model = AutoPeftModelForCausalLM.from_pretrained(
weights_path,
torch_dtype=dtype,
device_map=self.device_map,
quantization_config=(
BitsAndBytesConfig(**self.config.model.bitsandbytes.model_dump())
),
quantization_config=(BitsAndBytesConfig(**self.config.model.bitsandbytes.model_dump())),
)

"""TODO: figure out multi-gpu
@@ -70,9 +60,7 @@ def _get_merged_model(self, weights_path: str):

model = self.model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(
self._weights_path, device_map=self.device_map
)
tokenizer = AutoTokenizer.from_pretrained(self._weights_path, device_map=self.device_map)

return model, tokenizer

@@ -83,13 +71,11 @@ def infer_all(self):

# inference loop
for idx, (prompt, label) in enumerate(zip(prompts, labels)):
RichUI.inference_ground_truth_display(
f"Generating on test set: {idx+1}/{len(prompts)}", prompt, label
)
RichUI.inference_ground_truth_display(f"Generating on test set: {idx+1}/{len(prompts)}", prompt, label)

try:
result = self.infer_one(prompt)
except:
except Exception:
continue
results.append((prompt, label, result))

@@ -103,9 +89,7 @@ def infer_all(self):
writer.writerow(row)

def infer_one(self, prompt: str) -> str:
input_ids = self.tokenizer(
prompt, return_tensors="pt", truncation=True
).input_ids.cuda()
input_ids = self.tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()

# stream processor
streamer = TextIteratorStreamer(
@@ -115,9 +99,7 @@ def infer_one(self, prompt: str) -> str:
timeout=60, # 60 sec timeout for generation; to handle OOM errors
)

generation_kwargs = dict(
input_ids=input_ids, streamer=streamer, **self.config.inference.model_dump()
)
generation_kwargs = dict(input_ids=input_ids, streamer=streamer, **self.config.inference.model_dump())

thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
thread.start()
80 changes: 23 additions & 57 deletions llmtune/pydantic_models/config_model.py
@@ -1,27 +1,21 @@
from typing import Literal, Union, List, Dict, Optional
from pydantic import BaseModel, FilePath, validator, Field

from huggingface_hub.utils import validate_repo_id
from typing import List, Literal, Optional, Union

import torch
from pydantic import BaseModel, Field, FilePath, validator


# TODO: Refactor this into multiple files...
HfModelPath = str


class QaConfig(BaseModel):
llm_tests: Optional[List[str]] = Field([], description = "list of tests that needs to be connected")
llm_tests: Optional[List[str]] = Field([], description="list of tests that needs to be connected")


class DataConfig(BaseModel):
file_type: Literal["json", "csv", "huggingface"] = Field(
None, description="File type"
)
path: Union[FilePath, HfModelPath] = Field(
None, description="Path to the file or HuggingFace model"
)
prompt: str = Field(
None, description="Prompt for the model. Use {} brackets for column name"
)
file_type: Literal["json", "csv", "huggingface"] = Field(None, description="File type")
path: Union[FilePath, HfModelPath] = Field(None, description="Path to the file or HuggingFace model")
prompt: str = Field(None, description="Prompt for the model. Use {} brackets for column name")
prompt_stub: str = Field(
None,
description="Stub for the prompt; this is injected during training. Use {} brackets for column name",
@@ -48,9 +42,7 @@ class DataConfig(BaseModel):


class BitsAndBytesConfig(BaseModel):
load_in_8bit: Optional[bool] = Field(
False, description="Enable 8-bit quantization with LLM.int8()"
)
load_in_8bit: Optional[bool] = Field(False, description="Enable 8-bit quantization with LLM.int8()")
llm_int8_threshold: Optional[float] = Field(
6.0, description="Outlier threshold for outlier detection in 8-bit quantization"
)
@@ -61,9 +53,7 @@ class BitsAndBytesConfig(BaseModel):
False,
description="Enable splitting model parts between int8 on GPU and fp32 on CPU",
)
llm_int8_has_fp16_weight: Optional[bool] = Field(
False, description="Run LLM.int8() with 16-bit main weights"
)
llm_int8_has_fp16_weight: Optional[bool] = Field(False, description="Run LLM.int8() with 16-bit main weights")

load_in_4bit: Optional[bool] = Field(
True,
@@ -86,14 +76,10 @@ class ModelConfig(BaseModel):
"NousResearch/Llama-2-7b-hf",
description="Path to the model (huggingface repo or local path)",
)
device_map: Optional[str] = Field(
"auto", description="device onto which to load the model"
)
device_map: Optional[str] = Field("auto", description="device onto which to load the model")

quantize: Optional[bool] = Field(False, description="Flag to enable quantization")
bitsandbytes: BitsAndBytesConfig = Field(
None, description="Bits and Bytes configuration"
)
bitsandbytes: BitsAndBytesConfig = Field(None, description="Bits and Bytes configuration")

# @validator("hf_model_ckpt")
# def validate_model(cls, v, **kwargs):
@@ -116,22 +102,12 @@ def set_device_map_to_none(cls, v, values, **kwargs):

class LoraConfig(BaseModel):
r: Optional[int] = Field(8, description="Lora rank")
task_type: Optional[str] = Field(
"CAUSAL_LM", description="Base Model task type during training"
)
task_type: Optional[str] = Field("CAUSAL_LM", description="Base Model task type during training")

lora_alpha: Optional[int] = Field(
16, description="The alpha parameter for Lora scaling"
)
bias: Optional[str] = Field(
"none", description="Bias type for Lora. Can be 'none', 'all' or 'lora_only'"
)
lora_dropout: Optional[float] = Field(
0.1, description="The dropout probability for Lora layers"
)
target_modules: Optional[List[str]] = Field(
None, description="The names of the modules to apply Lora to"
)
lora_alpha: Optional[int] = Field(16, description="The alpha parameter for Lora scaling")
bias: Optional[str] = Field("none", description="Bias type for Lora. Can be 'none', 'all' or 'lora_only'")
lora_dropout: Optional[float] = Field(0.1, description="The dropout probability for Lora layers")
target_modules: Optional[List[str]] = Field(None, description="The names of the modules to apply Lora to")
fan_in_fan_out: Optional[bool] = Field(
False,
description="Flag to indicate if the layer to replace stores weight like (fan_in, fan_out)",
@@ -140,9 +116,7 @@ class LoraConfig(BaseModel):
None,
description="List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint",
)
layers_to_transform: Optional[Union[List[int], int]] = Field(
None, description="The layer indexes to transform"
)
layers_to_transform: Optional[Union[List[int], int]] = Field(None, description="The layer indexes to transform")
layers_pattern: Optional[str] = Field(None, description="The layer pattern name")
# rank_pattern: Optional[Dict[str, int]] = Field(
# {}, description="The mapping from layer names or regexp expression to ranks"
@@ -155,15 +129,9 @@ class LoraConfig(BaseModel):
# TODO: Get comprehensive Args!
class TrainingArgs(BaseModel):
num_train_epochs: Optional[int] = Field(1, description="Number of training epochs")
per_device_train_batch_size: Optional[int] = Field(
1, description="Batch size per training device"
)
gradient_accumulation_steps: Optional[int] = Field(
1, description="Number of steps for gradient accumulation"
)
gradient_checkpointing: Optional[bool] = Field(
True, description="Flag to enable gradient checkpointing"
)
per_device_train_batch_size: Optional[int] = Field(1, description="Batch size per training device")
gradient_accumulation_steps: Optional[int] = Field(1, description="Number of steps for gradient accumulation")
gradient_checkpointing: Optional[bool] = Field(True, description="Flag to enable gradient checkpointing")
optim: Optional[str] = Field("paged_adamw_32bit", description="Optimizer")
logging_steps: Optional[int] = Field(100, description="Number of logging steps")
learning_rate: Optional[float] = Field(2.0e-4, description="Learning rate")
Expand All @@ -172,9 +140,7 @@ class TrainingArgs(BaseModel):
fp16: Optional[bool] = Field(False, description="Flag to enable fp16")
max_grad_norm: Optional[float] = Field(0.3, description="Maximum gradient norm")
warmup_ratio: Optional[float] = Field(0.03, description="Warmup ratio")
lr_scheduler_type: Optional[str] = Field(
"constant", description="Learning rate scheduler type"
)
lr_scheduler_type: Optional[str] = Field("constant", description="Learning rate scheduler type")


# TODO: Get comprehensive Args!