
[Single File] Add GGUF support #9964


Merged · 51 commits · Dec 17, 2024 (diff below shows changes from 2 commits)

Commits
b5eeaa4
update
DN6 Oct 21, 2024
71897b1
update
DN6 Oct 21, 2024
89ea1ee
update
DN6 Oct 24, 2024
f0bcd94
update
DN6 Oct 24, 2024
60d1385
update
DN6 Oct 29, 2024
22ed0b0
update
DN6 Oct 31, 2024
2e6d340
update
DN6 Nov 3, 2024
b5f927c
update
DN6 Nov 11, 2024
b9666c7
Merge branch 'main' into gguf-support
DN6 Nov 11, 2024
6dc5d22
update
DN6 Nov 13, 2024
428e44b
update
DN6 Nov 15, 2024
d7f09f2
update
DN6 Nov 19, 2024
1649936
update
DN6 Nov 19, 2024
28d3a64
update
DN6 Nov 19, 2024
c34a451
update
DN6 Nov 21, 2024
84493db
update
DN6 Nov 21, 2024
50bd784
update
DN6 Nov 21, 2024
8f604b3
Merge branch 'main' into gguf-support
DN6 Dec 3, 2024
afd5d7d
update
DN6 Dec 4, 2024
e1b964a
Merge branch 'main' into gguf-support
sayakpaul Dec 4, 2024
0ed31bc
update
DN6 Dec 4, 2024
af381ad
update
DN6 Dec 4, 2024
52a1bcb
update
DN6 Dec 4, 2024
66ae46e
Merge branch 'gguf-support' of https://github.com/huggingface/diffuse…
DN6 Dec 4, 2024
67f1700
update
DN6 Dec 4, 2024
8abfa55
update
DN6 Dec 5, 2024
d4b88d7
update
DN6 Dec 5, 2024
30f13ed
update
DN6 Dec 5, 2024
9310035
update
DN6 Dec 5, 2024
e9303a0
update
DN6 Dec 5, 2024
e56c266
update
DN6 Dec 5, 2024
1209c3a
Update src/diffusers/quantizers/gguf/utils.py
DN6 Dec 5, 2024
db9b6f3
update
DN6 Dec 5, 2024
4c0360a
Merge branch 'gguf-support' of https://github.com/huggingface/diffuse…
DN6 Dec 5, 2024
aa7659b
Merge branch 'main' into gguf-support
DN6 Dec 5, 2024
78c7861
update
DN6 Dec 5, 2024
33eb431
update
DN6 Dec 5, 2024
9651ddc
update
DN6 Dec 5, 2024
746fd2f
update
DN6 Dec 5, 2024
e027d46
update
DN6 Dec 5, 2024
9db2396
update
DN6 Dec 6, 2024
7ee89f4
update
DN6 Dec 6, 2024
edf3e54
update
DN6 Dec 6, 2024
d3eb54f
update
DN6 Dec 6, 2024
82606cb
Merge branch 'main' into gguf-support
sayakpaul Dec 9, 2024
4f34f14
Update docs/source/en/quantization/gguf.md
DN6 Dec 11, 2024
090efdb
update
DN6 Dec 11, 2024
391b5a9
Merge branch 'main' into gguf-support
DN6 Dec 17, 2024
e67c25a
update
DN6 Dec 17, 2024
e710bde
update
DN6 Dec 17, 2024
f59e07a
update
DN6 Dec 17, 2024
11 changes: 10 additions & 1 deletion docs/source/en/quantization/gguf.md
@@ -13,10 +13,19 @@ specific language governing permissions and limitations under the License.

# GGUF

The GGUF file format is typically used to store models for inference with [GGML]() and supports a variety of block wise quantization options. Diffusers supports loading checkpoints prequantized and saved in the GGUF format via `from_single_file` loading with Model classes. Support for loading GGUF checkpoint via Pipelines is currently not supported. The dequantizatation functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF)
The GGUF file format is typically used to store models for inference with [GGML](https://github.com/ggerganov/ggml) and supports a variety of block wise quantization options. Diffusers supports loading checkpoints prequantized and saved in the GGUF format via `from_single_file` loading with Model classes. Loading GGUF checkpoints via Pipelines is currently not supported.

The following example will load the [FLUX.1 DEV](https://huggingface.co/black-forest-labs/FLUX.1-dev) transformer model using the GGUF Q2_K quantization variant.

Before starting, please install gguf in your environment:

```shell
pip install -U gguf
```

Since GGUF is a single-file format, we use `from_single_file` to load the model, passing in a `GGUFQuantizationConfig` at load time.

When using GGUF checkpoints, the quantized weights remain in a low-memory `dtype`, typically `torch.uint8`, and are dynamically dequantized and cast to the configured `compute_dtype` when running a forward pass through each module in the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype` for the forward pass of each module. The functions used for dynamic dequantization are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF).

Review comment:
A lot of the pytorch dequantization code is based on the numpy code from llama.cpp written by @compilade - I believe he should be credited here as well :)


```python
import torch
# … (the rest of the example is collapsed in the diff view)
```
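The example body is collapsed in the diff above. A minimal sketch of the full flow described in this section, assuming `FluxTransformer2DModel` as the model class (it is not shown in this hunk) and reusing the Q2_K checkpoint referenced in the tests added by this PR, might look like the following; the prompt and output filename are illustrative.

```python
import torch

from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig

# GGUF checkpoint referenced in the tests added by this PR.
ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"

# Load the prequantized transformer from the single GGUF file. `compute_dtype`
# is the dtype the weights are dequantized to during the forward pass.
transformer = FluxTransformer2DModel.from_single_file(
    ckpt_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
)

# Use the quantized transformer inside the regular FLUX pipeline.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()

image = pipe("A cat holding a sign that says hello world").images[0]
image.save("flux-gguf.png")
```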
1 change: 1 addition & 0 deletions src/diffusers/loaders/single_file_model.py
@@ -351,6 +351,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] =

        if hf_quantizer is not None:
            hf_quantizer.postprocess_model(model)
            model.hf_quantizer = hf_quantizer

        if torch_dtype is not None and hf_quantizer is None:
            model.to(torch_dtype)
14 changes: 14 additions & 0 deletions src/diffusers/quantizers/gguf/gguf_quantizer.py
@@ -24,6 +24,7 @@
from .utils import (
    GGML_QUANT_SIZES,
    GGUFParameter,
    _dequantize_gguf_and_restore_linear,
    _quant_shape_from_byte_shape,
    _replace_with_gguf_linear,
)
@@ -143,3 +144,16 @@ def is_serializable(self):
    @property
    def is_trainable(self) -> bool:
        return False

    def _dequantize(self, model):
        is_model_on_cpu = model.device.type == "cpu"
        if is_model_on_cpu:
            logger.info(
                "Model was found to be on CPU (could happen as a result of `enable_model_cpu_offload()`). So, moving it to GPU. After dequantization, will move the model back to CPU again to preserve the previous device."
            )
            model.to(torch.cuda.current_device())

        model = _dequantize_gguf_and_restore_linear(model, self.modules_to_not_convert)
        if is_model_on_cpu:
            model.to("cpu")
        return model
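For context, this `_dequantize` path is reached through the model-level `dequantize()` call exercised by the new test below. A minimal usage sketch, assuming the same FLUX transformer class and GGUF checkpoint as in the docs example:

```python
import torch

from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

model = FluxTransformer2DModel.from_single_file(
    "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf",
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
)
# Replaces GGUFLinear modules with plain nn.Linear holding dequantized weights.
# If the model sits on CPU, _dequantize temporarily moves it to the current CUDA
# device for dequantization and then moves it back.
model.dequantize()
```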
57 changes: 57 additions & 0 deletions src/diffusers/quantizers/gguf/utils.py
@@ -13,6 +13,7 @@
# # limitations under the License.


import inspect
from contextlib import nullcontext

import gguf
@@ -23,7 +24,27 @@


if is_accelerate_available():
    import accelerate
    from accelerate import init_empty_weights
    from accelerate.hooks import add_hook_to_module, remove_hook_from_module


# Copied from diffusers.quantizers.bitsandbytes.utils._create_accelerate_new_hook
def _create_accelerate_new_hook(old_hook):
    r"""
    Creates a new hook based on the old hook. Use it only if you know what you are doing ! This method is a copy of:
    https://github.com/huggingface/peft/blob/748f7968f3a31ec06a1c2b0328993319ad9a150a/src/peft/utils/other.py#L245 with
    some changes
    """
    old_hook_cls = getattr(accelerate.hooks, old_hook.__class__.__name__)
    old_hook_attr = old_hook.__dict__
    filtered_old_hook_attr = {}
    old_hook_init_signature = inspect.signature(old_hook_cls.__init__)
    for k in old_hook_attr.keys():
        if k in old_hook_init_signature.parameters:
            filtered_old_hook_attr[k] = old_hook_attr[k]
    new_hook = old_hook_cls(**filtered_old_hook_attr)
    return new_hook


def _replace_with_gguf_linear(model, compute_dtype, state_dict, prefix="", modules_to_not_convert=[]):
@@ -59,6 +80,42 @@ def _should_convert_to_gguf(state_dict, prefix):
    return model


def _dequantize_gguf_and_restore_linear(model, modules_to_not_convert=[]):
    for name, module in model.named_children():
        if isinstance(module, GGUFLinear) and name not in modules_to_not_convert:
            device = module.weight.device
            bias = getattr(module, "bias", None)

            ctx = init_empty_weights if is_accelerate_available() else nullcontext
            with ctx():
                new_module = nn.Linear(
                    module.in_features,
                    module.out_features,
                    module.bias is not None,
                    device=device,
                )
            new_module.weight = nn.Parameter(dequantize_gguf_tensor(module.weight))
            if bias is not None:
                new_module.bias = bias

            # Create a new hook and attach it in case we use accelerate
            if hasattr(module, "_hf_hook"):
                old_hook = module._hf_hook
                new_hook = _create_accelerate_new_hook(old_hook)

                remove_hook_from_module(module)
                add_hook_to_module(new_module, new_hook)

            new_module.to(device)
            model._modules[name] = new_module

        has_children = list(module.children())
        if has_children:
            _dequantize_gguf_and_restore_linear(module, modules_to_not_convert)

    return model


# dequantize operations based on torch ports of GGUF dequantize_functions
# from City96
# more info: https://github.com/city96/ComfyUI-GGUF/blob/main/dequant.py
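The ported dequantize functions themselves are collapsed in this diff. As a rough illustration only, not the actual GGUF kernels (quant types such as Q2_K use more elaborate per-block layouts), block-wise dequantization boils down to scaling each block of integer codes by its per-block scale:

```python
import torch


def dequantize_blocks_sketch(qweight: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
    # qweight: integer codes per block, shape (n_blocks, block_size)
    # scales: one floating-point scale per block, shape (n_blocks, 1)
    # Broadcast each block's scale over its codes, then flatten back to 1D.
    return (qweight.to(torch.float32) * scales).reshape(-1)
```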
21 changes: 20 additions & 1 deletion tests/quantization/gguf/test_gguf.py
@@ -3,6 +3,7 @@

import numpy as np
import torch
import torch.nn as nn

from diffusers import (
    FluxPipeline,
@@ -23,7 +24,7 @@


if is_gguf_available():
    from diffusers.quantizers.gguf.utils import GGUFParameter
    from diffusers.quantizers.gguf.utils import GGUFLinear, GGUFParameter


@nightly
@@ -112,6 +113,24 @@ def test_dtype_assignment(self):
        # This should work
        model.to("cuda")

    def test_dequantize_model(self):
        quantization_config = GGUFQuantizationConfig(compute_dtype=self.torch_dtype)
        model = self.model_cls.from_single_file(self.ckpt_path, quantization_config=quantization_config)
        model.dequantize()

        def _check_for_gguf_linear(model):
            has_children = list(model.children())
            if not has_children:
                return

            for name, module in model.named_children():
                if isinstance(module, nn.Linear):
                    assert not isinstance(module, GGUFLinear), f"{name} is still GGUFLinear"
                    assert not isinstance(module.weight, GGUFParameter), f"{name} weight is still GGUFParameter"

            for name, module in model.named_children():
                _check_for_gguf_linear(module)


class FluxGGUFSingleFileTests(GGUFSingleFileTesterMixin, unittest.TestCase):
    ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"