
Delete deprecated stuff #38838


Merged — 27 commits, merged Jul 10, 2025

Changes from all commits

Commits (27)
60b4cbe
delete deprecated stuff
zucchini-nlp Jun 13, 2025
98184f9
fix copies
zucchini-nlp Jun 16, 2025
325cde8
remove unused tests
zucchini-nlp Jun 16, 2025
26a9c28
fix modernbert and fuyu
zucchini-nlp Jun 17, 2025
68a501e
Merge branch 'main' into remove-deprecations-4.52
zucchini-nlp Jun 20, 2025
2a53654
Update src/transformers/cache_utils.py
zucchini-nlp Jun 23, 2025
1f81714
bye bye `seen_tokens`
zucchini-nlp Jun 23, 2025
ca78f07
address comments
zucchini-nlp Jun 23, 2025
4c9bd33
update typings
zucchini-nlp Jun 23, 2025
fcbd79e
encoder decoder models follow same pattern as whisper
zucchini-nlp Jul 1, 2025
00dcc6d
merge main
zucchini-nlp Jul 1, 2025
c8b7099
fix copies
zucchini-nlp Jul 1, 2025
86d470d
why is it set to False?
zucchini-nlp Jul 1, 2025
f19d166
merge main
zucchini-nlp Jul 1, 2025
ab7fac4
fix switch transformers
zucchini-nlp Jul 2, 2025
d9ee03f
fix encoder decoder models shared weight
zucchini-nlp Jul 7, 2025
f06327f
fix copies and RAG
zucchini-nlp Jul 7, 2025
5080a86
Merge branch 'main' into remove-deprecations-4.52
zucchini-nlp Jul 7, 2025
31c5937
remove `next_cache`
zucchini-nlp Jul 8, 2025
cb8be3c
fix gptj/git
zucchini-nlp Jul 8, 2025
7247eed
merge main
zucchini-nlp Jul 8, 2025
48fd132
fix copies
zucchini-nlp Jul 8, 2025
37850e4
fix copies
zucchini-nlp Jul 8, 2025
44e7125
style...
zucchini-nlp Jul 8, 2025
d1f9915
another forgotten docstring
zucchini-nlp Jul 8, 2025
c769d41
Merge branch 'main' into remove-deprecations-4.52
zucchini-nlp Jul 9, 2025
486b21d
Merge branch 'main' into remove-deprecations-4.52
zucchini-nlp Jul 10, 2025
2 changes: 0 additions & 2 deletions docs/source/en/cache_explanation.md
Expand Up @@ -99,8 +99,6 @@ self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_stat

2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.

3. The cache maintains a count of seen tokens through `self._seen_tokens`. This is updated when the first layer processes a new token.

The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.

```py
Expand Down
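
For readers following the docs change above, here is a minimal sketch of the generation loop that page describes. It is not part of the diff; the checkpoint name ("Qwen/Qwen2.5-0.5B"), greedy decoding, and the 10-step loop are illustrative assumptions.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
model.eval()

inputs = tokenizer("The key-value cache", return_tensors="pt")
input_ids = inputs.input_ids
attention_mask = inputs.attention_mask

past_key_values = DynamicCache()
cache_position = torch.arange(input_ids.shape[1])
generated = []

for _ in range(10):
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            cache_position=cache_position,
            use_cache=True,
        )
    next_token = outputs.logits[:, -1].argmax(dim=-1, keepdim=True)  # greedy pick
    generated.append(next_token)
    # The cache already holds every past key/value, so only the new token is fed back in.
    input_ids = next_token
    # The attention mask is the concatenation of past and current token values...
    attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
    # ...and `1` is added to the cache position for the next token.
    cache_position = cache_position[-1:] + 1

print(tokenizer.decode(torch.cat(generated, dim=-1)[0]))
```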
68 changes: 24 additions & 44 deletions src/transformers/cache_utils.py
Expand Up @@ -185,17 +185,6 @@ def reorder_cache(self, beam_idx: torch.LongTensor):
device = self.value_cache[layer_idx].device
self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))

@property
def seen_tokens(self):
logger.warning_once(
"The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` "
"model input instead."
)
if hasattr(self, "_seen_tokens"):
return self._seen_tokens
else:
return None

def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
"""
Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
Expand Down Expand Up @@ -472,7 +461,6 @@ class DynamicCache(Cache):

def __init__(self, _distributed_cache_data: Optional[Iterable] = None) -> None:
super().__init__()
self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
self.key_cache: list[torch.Tensor] = []
self.value_cache: list[torch.Tensor] = []

Expand Down Expand Up @@ -535,10 +523,6 @@ def update(
Return:
A tuple containing the updated key and value states.
"""
# Update the number of seen tokens
if layer_idx == 0:
self._seen_tokens += key_states.shape[-2]

# Update the cache
if key_states is not None:
if len(self.key_cache) <= layer_idx:
Expand Down Expand Up @@ -605,7 +589,6 @@ def crop(self, max_length: int):
if self.get_seq_length() <= max_length:
return

self._seen_tokens = max_length
for idx in range(len(self.key_cache)):
if self.key_cache[idx].numel():
self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
Expand All @@ -617,7 +600,6 @@ def batch_split(self, full_batch_size: int, split_size: int) -> list["DynamicCac
out = []
for i in range(0, full_batch_size, split_size):
current_split = DynamicCache()
current_split._seen_tokens = self._seen_tokens
current_split.key_cache = [tensor[i : i + split_size] for tensor in self.key_cache]
current_split.value_cache = [tensor[i : i + split_size] for tensor in self.value_cache]
out.append(current_split)
Expand Down Expand Up @@ -815,10 +797,6 @@ def update(
Return:
A tuple containing the updated key and value states.
"""
# Update the number of seen tokens
if layer_idx == 0:
self._seen_tokens += key_states.shape[-2]

# Update the cache
if len(self.key_cache) < layer_idx:
raise ValueError("OffloadedCache does not support model usage where layers are skipped. Use DynamicCache.")
Expand Down Expand Up @@ -857,6 +835,9 @@ class QuantizedCache(DynamicCache):

def __init__(self, cache_config: QuantizedCacheConfig) -> None:
super().__init__()

# Used only for QuantCache where the seq-length can't be inferred easily from cache contents
self._seen_tokens = 0
self._quantized_key_cache: list[torch.Tensor] = []
self._quantized_value_cache: list[torch.Tensor] = []

Expand Down Expand Up @@ -1412,6 +1393,19 @@ def __init__(self, self_attention_cache: Cache, cross_attention_cache: Cache):
for layer_idx in range(len(cross_attention_cache.key_cache)):
self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0)

def __iter__(self):
"""
Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
keys and values
"""
for layer_idx in range(len(self)):
yield (
self.self_attention_cache.key_cache[layer_idx],
self.self_attention_cache.value_cache[layer_idx],
self.cross_attention_cache.key_cache[layer_idx],
self.cross_attention_cache.value_cache[layer_idx],
)

def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
Expand Down Expand Up @@ -2313,10 +2307,6 @@ def __init__(
self._device_key_cache.append(key_cache)
self._device_value_cache.append(value_cache)

# For backwards compatibility.
# TODO(gante): Remove this.
self._seen_tokens = 0

# Create new CUDA stream for parallel prefetching.
self._prefetch_stream = torch.cuda.Stream() if self.device.type == "cuda" else None

Expand Down Expand Up @@ -2350,10 +2340,6 @@ def update(
value_states = value_states.to(self.value_cache[layer_idx].dtype)

if layer_idx == 0:
# Update seen tokens.
# TODO(gante): Remove this.
self._seen_tokens += key_states.shape[-2]

# Always there.
k_out = self.key_cache[0]
v_out = self.value_cache[0]
Expand Down Expand Up @@ -2407,10 +2393,14 @@ def update(
return k_out, v_out

def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
"""Returns the sequence length of the cached states that were seen by the model."""

# TODO(gante): Remove this.
return self._seen_tokens
"""Returns the sequence length of the cached states. A layer index can be optionally passed."""
is_empty_layer = (
len(self.key_cache) == 0 # no cache in any layer
or len(self.key_cache) <= layer_idx # hasn't run a layer with cache after it
or not self.key_cache[layer_idx].numel() # the layer has no cache
)
layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
return layer_seq_length

def get_max_cache_shape(self) -> Optional[int]:
"""Returns the maximum sequence length of the cached states."""
Expand All @@ -2420,22 +2410,12 @@ def get_max_cache_shape(self) -> Optional[int]:
def reset(self) -> None:
"""Resets the cache values while preserving the objects."""

# For backwards compatibility.
# TODO(gante): Remove this.
self._seen_tokens = 0

# Zero out cache.
for layer_idx in range(len(self.key_cache)):
# In-place ops prevent breaking the static address.
self.key_cache[layer_idx].zero_()
self.value_cache[layer_idx].zero_()

@property
def seen_tokens(self) -> int:
# For backwards compatibility.
# TODO(gante): Remove this.
return self._seen_tokens

def _create_key_value_cache_tensors(
self, shape: tuple[int, ...], device: torch.device
) -> tuple[torch.Tensor, torch.Tensor]:
Expand Down
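
Since this file drops `seen_tokens` everywhere, here is a hedged sketch of the replacement pattern the old deprecation warning pointed to: read the length from the cache itself via `get_seq_length()` (or use the `cache_position` model input) instead of the removed attribute. The tensor shapes below are illustrative, not taken from the PR.

```py
import torch
from transformers import DynamicCache

cache = DynamicCache()
# (batch, num_heads, seq_len, head_dim) — illustrative shapes only
key = torch.zeros(1, 8, 5, 64)
value = torch.zeros(1, 8, 5, 64)
cache.update(key, value, layer_idx=0)

# Before this PR: num_seen = cache.seen_tokens (deprecated, now removed)
num_seen = cache.get_seq_length()  # 5, inferred from the cached tensors themselves

# cache_position for the next single-token decoding step
next_cache_position = torch.arange(num_seen, num_seen + 1)
```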
7 changes: 4 additions & 3 deletions src/transformers/integrations/executorch.py
Expand Up @@ -15,7 +15,7 @@

import torch

from ..cache_utils import DynamicCache, HybridCache, StaticCache
from ..cache_utils import DynamicCache, EncoderDecoderCache, HybridCache, StaticCache
from ..generation.configuration_utils import GenerationConfig
from ..masking_utils import (
ALL_MASK_ATTENTION_FUNCTIONS,
Expand Down Expand Up @@ -548,14 +548,15 @@ def __init__(self, model, max_static_cache_length, batch_size):
self.lm_head = model.lm_head
self.config = model.config

# Initialize static cache
# Initialize static cache for decoder and DynamicCache for encoder
self.static_cache = StaticCache(
config=self.config,
max_batch_size=batch_size,
max_cache_len=max_static_cache_length,
device="cpu",
dtype=torch.float32,
)
self.cache = EncoderDecoderCache(self.static_cache, DynamicCache())
Member:

This export recipe only uses the decoder part, so it should not need this change, no?

zucchini-nlp (Member Author), Jul 8, 2025:

The model is encoder-decoder and needs a cache for both. Previously, we would wrap the static cache in an EncoderDecoderCache in the model's forward, but as of this PR we don't do any legacy hacks. Instead, we expect users to pass the correct new cache class.

Export doesn't really care about the encoder cache, indeed; it's needed for the generation code to work.

Member:

So the decoder-only part of the model also expects an EncoderDecoderCache?

zucchini-nlp (Member Author):

Yes, because it does cross-attention and still looks for a cache to store encoder_hidden_states. Most of these models actually have no option to be run as a "CausalLM" type without cross-attention as per the code (e.g. T5 had failing tests).


# Register cache buffers to make them exportable
for i in range(len(self.static_cache.key_cache)):
Expand All @@ -567,7 +568,7 @@ def forward(self, decoder_input_ids, encoder_hidden_states, cache_position):
outputs = self.decoder(
input_ids=decoder_input_ids,
encoder_hidden_states=encoder_hidden_states,
past_key_values=self.static_cache,
past_key_values=self.cache,
use_cache=True,
cache_position=cache_position,
)
Expand Down
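
A minimal sketch of the pattern discussed in the review thread above: wrap a static decoder self-attention cache and a dynamic cross-attention cache in `EncoderDecoderCache` and pass it explicitly, rather than relying on the model's forward to wrap it. The `t5-small` checkpoint, cache length, and `generate` call are illustrative assumptions and presume the checkpoint supports a static self-attention cache.

```py
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DynamicCache, EncoderDecoderCache, StaticCache

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Static cache for decoder self-attention; the cross-attention cache stays dynamic.
self_attn_cache = StaticCache(
    config=model.config,
    max_batch_size=1,
    max_cache_len=32,
    device="cpu",
    dtype=torch.float32,
)
past_key_values = EncoderDecoderCache(self_attn_cache, DynamicCache())

inputs = tokenizer("translate English to German: Hello", return_tensors="pt")
outputs = model.generate(**inputs, past_key_values=past_key_values, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```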