[Single File] Add GGUF support #9964
Changes from 14 commits
@@ -17,8 +17,10 @@
from contextlib import nullcontext
from typing import Optional

import torch
from huggingface_hub.utils import validate_hf_hub_args

from ..quantizers import DiffusersAutoQuantizer
from ..utils import deprecate, is_accelerate_available, logging
from .single_file_utils import (
    SingleFileComponentError,

@@ -202,6 +204,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
        subfolder = kwargs.pop("subfolder", None)
        revision = kwargs.pop("revision", None)
        torch_dtype = kwargs.pop("torch_dtype", None)
        quantization_config = kwargs.pop("quantization_config", None)

        if isinstance(pretrained_model_link_or_path_or_dict, dict):
            checkpoint = pretrained_model_link_or_path_or_dict

@@ -215,6 +218,11 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
                local_files_only=local_files_only,
                revision=revision,
            )
        if quantization_config is not None:
            hf_quantizer = DiffusersAutoQuantizer.from_config(quantization_config)

        else:
            hf_quantizer = None

        mapping_functions = SINGLE_FILE_LOADABLE_CLASSES[mapping_class_name]

@@ -295,8 +303,29 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
        with ctx():
            model = cls.from_config(diffusers_model_config)

        # Check if `_keep_in_fp32_modules` is not None
        use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and (
            (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules")
        )
        if use_keep_in_fp32_modules:
            keep_in_fp32_modules = cls._keep_in_fp32_modules
            if not isinstance(keep_in_fp32_modules, list):
                keep_in_fp32_modules = [keep_in_fp32_modules]

        else:
            keep_in_fp32_modules = []

        if hf_quantizer is not None:
            hf_quantizer.preprocess_model(model=model, device_map=None, keep_in_fp32_modules=keep_in_fp32_modules)

        if is_accelerate_available():
-           unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype)
+           unexpected_keys = load_model_dict_into_meta(
+               model,
+               diffusers_format_checkpoint,
+               dtype=torch_dtype,
+               hf_quantizer=hf_quantizer,
+               keep_in_fp32_modules=keep_in_fp32_modules,
+           )

        else:
            _, unexpected_keys = model.load_state_dict(diffusers_format_checkpoint, strict=False)

@@ -310,6 +339,9 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =
                f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
            )

        if hf_quantizer is not None:
            hf_quantizer.postprocess_model(model)

        if torch_dtype is not None:
            model.to(torch_dtype)
Reviewer comment: We don't cast the model when …
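To make the new loading path concrete, here is a hedged usage sketch. It assumes a `GGUFQuantizationConfig` with a `compute_dtype` argument is exposed alongside `GGUFQuantizer` (the config class is not shown in this diff), and the checkpoint URL is only an illustration.

```python
# Usage sketch only: GGUFQuantizationConfig and the checkpoint URL are assumptions,
# not taken verbatim from this diff.
import torch
from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

ckpt_url = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q4_0.gguf"  # illustrative
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_url,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)
```

Passing `quantization_config` is what triggers the `DiffusersAutoQuantizer.from_config(...)` branch added above; without it, `hf_quantizer` stays `None` and loading proceeds as before.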
@@ -17,26 +17,30 @@
import importlib
import inspect
import os
from array import array
from collections import OrderedDict
from pathlib import Path
from typing import List, Optional, Union

import safetensors
import torch
from huggingface_hub.utils import EntryNotFoundError
from tqdm import tqdm

from ..quantizers.quantization_config import QuantizationMethod
from ..utils import (
    GGUF_FILE_EXTENSION,
    SAFE_WEIGHTS_INDEX_NAME,
    SAFETENSORS_FILE_EXTENSION,
    WEIGHTS_INDEX_NAME,
    _add_variant,
    _get_model_file,
    deprecate,
    is_accelerate_available,
    is_torch_available,
    is_torch_version,
    logging,
)
from ..utils.import_utils import is_gguf_available

Reviewer comment: Might make sense to add the method to the …


logger = logging.get_logger(__name__)

@@ -140,6 +144,8 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[
    file_extension = os.path.basename(checkpoint_file).split(".")[-1]
    if file_extension == SAFETENSORS_FILE_EXTENSION:
        return safetensors.torch.load_file(checkpoint_file, device="cpu")
    elif file_extension == GGUF_FILE_EXTENSION:
        return load_gguf_checkpoint(checkpoint_file)
    else:
        weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {}
        return torch.load(

@@ -176,11 +182,9 @@ def load_model_dict_into_meta(
    hf_quantizer=None,
    keep_in_fp32_modules=None,
) -> List[str]:
-   if hf_quantizer is None:
-       device = device or torch.device("cpu")
+   device = device or torch.device("cpu")

Review discussion:
- This might have some consequences. If …
- @a-r-r-o-w has an open PR for this #10069

    dtype = dtype or torch.float32
    is_quantized = hf_quantizer is not None
-   is_quant_method_bnb = getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES

    accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
    empty_state_dict = model.state_dict()

@@ -211,14 +215,15 @@ def load_model_dict_into_meta(
                set_module_kwargs["dtype"] = dtype

            # bnb params are flattened.
+           # gguf quants have a different shape based on the type of quantization applied
            if empty_state_dict[param_name].shape != param.shape:
                if (
-                   is_quant_method_bnb
+                   is_quantized
                    and hf_quantizer.pre_quantized
                    and hf_quantizer.check_if_quantized_param(model, param, param_name, state_dict, param_device=device)
                ):
-                   hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name].shape, param.shape)
-               elif not is_quant_method_bnb:
+                   hf_quantizer.check_quantized_param_shape(param_name, empty_state_dict[param_name], param)
+               else:
                    model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else ""
                    raise ValueError(
                        f"Cannot load {model_name_or_path_str} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."

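With the extension dispatch added to `load_state_dict`, a `.gguf` path now routes transparently through `load_gguf_checkpoint`. A minimal sketch of the intended call, with hypothetical file names (the value bound to `GGUF_FILE_EXTENSION` is not part of this excerpt):

```python
# Sketch: both calls return a flat state dict; the .gguf one contains GGUFParameter
# tensors for quantized entries, while safetensors entries stay plain torch tensors.
sd_safetensors = load_state_dict("model.safetensors")  # hypothetical path
sd_gguf = load_state_dict("model-Q4_0.gguf")           # hypothetical path
```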
@@ -396,3 +401,77 @@ def _fetch_index_file_legacy(
        index_file = None

    return index_file


def _gguf_parse_value(_value, data_type):
    if not isinstance(data_type, list):
        data_type = [data_type]
    if len(data_type) == 1:
        data_type = data_type[0]
        array_data_type = None
    else:
        if data_type[0] != 9:
            raise ValueError("Received multiple types, therefore expected the first type to indicate an array.")
        data_type, array_data_type = data_type

    if data_type in [0, 1, 2, 3, 4, 5, 10, 11]:
        _value = int(_value[0])
    elif data_type in [6, 12]:
        _value = float(_value[0])
    elif data_type in [7]:
        _value = bool(_value[0])
    elif data_type in [8]:
        _value = array("B", list(_value)).tobytes().decode()
    elif data_type in [9]:
        _value = _gguf_parse_value(_value, array_data_type)
    return _value


def read_field(reader, field):
    value = reader.fields[field]
    return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data]
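As a quick illustration of how these metadata helpers are meant to be used (the file name is hypothetical, and `general.architecture` is a standard GGUF key that is not referenced in this diff):

```python
# Sketch: read a metadata field from a GGUF file with the helpers above.
# Requires the gguf package (>= 0.10.0) and a local .gguf file.
from gguf import GGUFReader

reader = GGUFReader("flux1-dev-Q4_0.gguf")  # hypothetical path
print(read_field(reader, "general.architecture"))  # e.g. ["flux"]
```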

def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):

Reviewer comment: Same for these two. Additionally, …

    """
    Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed tokenizer and config
    attributes.

    Args:
        gguf_checkpoint_path (`str`):
            The path to the GGUF file to load.
        return_tensors (`bool`, defaults to `False`):
            Whether to read the tensors from the file and return them. Not doing so is faster and only loads the
            metadata in memory.
    """

    if is_gguf_available() and is_torch_available():
        import gguf
        from gguf import GGUFReader

        from ..quantizers.gguf.utils import GGUFParameter
    else:
        logger.error(
            "Loading a GGUF checkpoint in PyTorch requires both PyTorch and GGUF>=0.10.0 to be installed. Please see "
            "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions."
        )
        raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.")

Review discussion:
- do we need to check the gguf version as well? (in addition to is_gguf_available)
- Agree. Let's always suggest installing the latest stable build of …

    reader = GGUFReader(gguf_checkpoint_path)
    fields = reader.fields
    reader_keys = list(fields.keys())

    parsed_parameters = {}
    for tensor in tqdm(reader.tensors):
        name = tensor.name
        quant_type = tensor.tensor_type

        # if the tensor is a torch supported dtype do not use GGUFParameter
        is_gguf_quant = quant_type not in [gguf.GGMLQuantizationType.F32, gguf.GGMLQuantizationType.F16]

Reviewer comment: We could create a …

        weights = torch.from_numpy(tensor.data.copy())
        parsed_parameters[name] = GGUFParameter(weights, quant_type=quant_type) if is_gguf_quant else weights

    if len(reader_keys) > 0:
        logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}")

Review discussion:
- trying to understand this check here, I think maybe when we iterate through the tensors we also remove the names from the …
- Copied this from transformers. But these aren't tensor keys. They're metadata keys. This can probably just be removed since the info isn't too relevant.

    return parsed_parameters
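To see what this loader produces, a small inspection sketch. The file path is hypothetical, and it assumes `GGUFParameter` behaves like a `torch.Tensor` that exposes `quant_type`, as the loop above implies:

```python
# Sketch: quantized tensors come back as GGUFParameter carrying their quant_type,
# while F16/F32 tensors stay as plain torch tensors.
state_dict = load_gguf_checkpoint("flux1-dev-Q4_0.gguf")  # hypothetical path
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape), getattr(tensor, "quant_type", None))
```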
@@ -204,7 +204,10 @@ def create_quantized_param(

        module._parameters[tensor_name] = new_value

-   def check_quantized_param_shape(self, param_name, current_param_shape, loaded_param_shape):
+   def check_quantized_param_shape(self, param_name, current_param, loaded_param):

Review discussion:
- GGUF needs to access the tensor quant type to run a shape check. So this needs to change from passing in shapes to passing in params directly.
- Why not add this method to the …
- I see you're already adding this to the GGUF quantizer class. So, maybe okay to not modify this?
- definitely makes sense here to make sure this method has the same signature across all quantizers, it will be confusing otherwise
- I think no deprecation is fine since this method is called from …

+       current_param_shape = current_param.shape
+       loaded_param_shape = loaded_param.shape

        n = current_param_shape.numel()
        inferred_shape = (n,) if "bias" in param_name else ((n + 1) // 2, 1)
        if loaded_param_shape != inferred_shape:

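A quick worked instance of the inferred-shape rule above (a sketch for intuition, not library code): bitsandbytes 4-bit packs two 4-bit values per byte and stores the packed weight as a flat column, while biases stay 1-D and unpacked.

```python
# For a (3072, 3072) weight quantized to 4-bit:
n = 3072 * 3072
print(((n + 1) // 2, 1))  # (4718592, 1) -> expected shape of the loaded packed weight
# For a bias with 3072 elements, the expected loaded shape is simply (3072,)
```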
@@ -0,0 +1 @@
from .gguf_quantizer import GGUFQuantizer |
@@ -0,0 +1,96 @@
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from ...utils import get_module_from_name
from ..base import DiffusersQuantizer
from .utils import GGUFParameter, _quant_shape_from_byte_shape, _replace_with_gguf_linear


if TYPE_CHECKING:
    from ...models.modeling_utils import ModelMixin

from ...utils import (
    is_gguf_available,
    is_torch_available,
    logging,
)


if is_torch_available():
    import torch

if is_gguf_available():
    import gguf

logger = logging.get_logger(__name__)


class GGUFQuantizer(DiffusersQuantizer):
    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)

        self.compute_dtype = quantization_config.compute_dtype
        self.pre_quantized = True

Review discussion:
- so gguf will always be pre_quantized? it does not make sense to support converting it (like we do for bnb)?
- I would rather take this from the config and then default it to …
- Hmm seems like it's always going to be prequantized for this PR.

    def check_quantized_param_shape(self, param_name, current_param, loaded_param):
        loaded_param_shape = loaded_param.shape
        current_param_shape = current_param.shape
        quant_type = loaded_param.quant_type

        block_size, type_size = gguf.GGML_QUANT_SIZES[quant_type]

        inferred_shape = _quant_shape_from_byte_shape(loaded_param_shape, type_size, block_size)
        if inferred_shape != current_param_shape:
            raise ValueError(
                f"{param_name} has an expected quantized shape of: {inferred_shape}, but received shape: {loaded_param_shape}"
            )

        return True
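For intuition, here is a minimal sketch of the packing arithmetic this check relies on. The helper name mirrors `_quant_shape_from_byte_shape`, but the body is an assumption based on the standard GGML block layout (for Q4_0, 32 weights are packed into 18 bytes), not the actual library code:

```python
import gguf

def quant_shape_from_byte_shape_sketch(shape, type_size, block_size):
    # The loaded tensor is byte-shaped; expand the last dim back to weight counts.
    return (*shape[:-1], shape[-1] // type_size * block_size)

block_size, type_size = gguf.GGML_QUANT_SIZES[gguf.GGMLQuantizationType.Q4_0]  # (32, 18)
print(quant_shape_from_byte_shape_sketch((3072, 1728), type_size, block_size))  # (3072, 3072)
```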

    def check_if_quantized_param(
        self,
        model: "ModelMixin",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ) -> bool:
        if isinstance(param_value, GGUFParameter):
            return True

        return False

    def create_quantized_param(

Review discussion:
- For bitsandbytes, the bias is often left untouched and is kept as the original …
- The default logic in llama.cpp is here; the weights quantized with the comfy repo follow this logic, so biases and 1d tensors are left unquantized. They're FP32 internally in llama.cpp since they're fairly small (and the operation for numpy/gguf FP32 -> torch FP32 doesn't require any dequantization, so doing that might be ever so slightly faster. Same goes for FP16, but not BF16 due to not being a native numpy datatype).
- Checking with @DN6 if we're following the same.
- Biases and 1D tensors are unquantized. @city96 created the GGUF quants for Flux and SD3.* so they're the expert here :)
- Thanks! I don't remember any specific code / tests for checking that.
- Ah seems like this: … Maybe having a test for this would be nice!

        self,
        model: "ModelMixin",
        param_value: "torch.Tensor",

Reviewer comment: It can be a union of GGUF tensor and …

        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
        unexpected_keys: Optional[List[str]] = None,

Review discussion:
- Should …
- My bad. Not needed here. I think we could just set them to …

    ):
        module, tensor_name = get_module_from_name(model, param_name)
        if tensor_name not in module._parameters:
            raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")

Reviewer comment: You should check for buffers also in …

        module._parameters[tensor_name] = param_value

    def _process_model_before_weight_loading(
        self,
        model: "ModelMixin",
        device_map,
        keep_in_fp32_modules: List[str] = [],
        **kwargs,
    ):
        _replace_with_gguf_linear(model, self.compute_dtype)

    def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs):
        return model

    @property
    def is_serializable(self):
        return False

    @property
    def is_trainable(self) -> bool:
        # Because we're mandating `bitsandbytes` 0.43.3.
        return False
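`_replace_with_gguf_linear` itself is not shown in this diff. As a rough illustration of the approach (an assumption based on how similar quantizers swap layers), the idea is to walk the module tree and replace each `nn.Linear` with a GGUF-aware subclass whose forward pass dequantizes its `GGUFParameter` weight to `compute_dtype` before the matmul:

```python
# Illustrative sketch only, not the library implementation.
import torch
import torch.nn as nn

class GGUFLinearSketch(nn.Linear):
    def __init__(self, in_features, out_features, bias=True, compute_dtype=torch.float32):
        super().__init__(in_features, out_features, bias)
        self.compute_dtype = compute_dtype
    # A real implementation would override forward() to dequantize the stored
    # GGUFParameter weight to compute_dtype before calling F.linear.

def replace_with_gguf_linear_sketch(model: nn.Module, compute_dtype: torch.dtype) -> nn.Module:
    # Recursively swap nn.Linear children for the GGUF-aware variant.
    for name, child in model.named_children():
        if isinstance(child, nn.Linear):
            new_layer = GGUFLinearSketch(
                child.in_features, child.out_features, child.bias is not None, compute_dtype
            )
            setattr(model, name, new_layer)
        else:
            replace_with_gguf_linear_sketch(child, compute_dtype)
    return model
```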