reserve space for random_roll creating extra noise span

jyc · jyc · commit f92e701399fd · 2025-11-12T20:52:59.000-08:00
diff --git a/src/levanter/data/ul2r.py b/src/levanter/data/ul2r.py
@@ -286,9 +286,9 @@ def apply_roll(m):
 @functools.partial(jax.jit, static_argnames=["force_initial_sentinel"])
 def noise_span_to_unique_sentinel(
     tokens: jnp.ndarray,
+    length: int,
     noise_mask: jnp.ndarray,
     sentinel_tokens: jnp.ndarray,
-    length: int,
     force_initial_sentinel: bool,
 ) -> jnp.ndarray:
     """
@@ -369,6 +369,7 @@ def to_ul2r_rx_tokens(
     mask_prob: float,
     mean_noise_span_length: float,
     random_roll: bool,
+    pad_token_id: int,
     sentinel_token_ids: jnp.ndarray,
     max_length: int,
 ) -> tuple[jnp.ndarray, jnp.ndarray]:
@@ -384,8 +385,8 @@ def to_ul2r_rx_tokens(
         - The length of `inputs` (before `targets`).
           For use when generating the loss mask / PrefixLM attention mask.
         - A tensor with the same shape as `tokens` containing
-          `inputs targets 0...` where `inputs targets` is truncated to
-          fit `max_length`. There is no leading padding.
+          `inputs targets pad_token_id...` where `inputs targets` is truncated to fit `max_length`.
+          There is no leading padding.
     """
 
     padded_length = tokens.shape[0]
@@ -407,16 +408,16 @@ def to_ul2r_rx_tokens(
 
     inputs = noise_span_to_unique_sentinel(
         tokens,
+        length,
         noise_mask,
         sentinel_token_ids,
-        length,
         force_initial_sentinel=False,
     )
     targets = noise_span_to_unique_sentinel(
         tokens,
+        target_in_len,
         ~noise_mask,
         sentinel_token_ids,
-        target_in_len,
         force_initial_sentinel=True,
     )
 
@@ -440,7 +441,7 @@ def to_ul2r_rx_tokens(
     trunc_target_len = jnp.maximum(target_len - drop_targets, 0)
 
     # Truncate `targets` to the new length; `inputs` are gated by `new_input_len` below
-    targets = jnp.where(indices < trunc_target_len, targets, 0)
+    targets = jnp.where(indices < trunc_target_len, targets, pad_token_id)
     targets = typing.cast(jnp.ndarray, targets)
 
     targets = jnp.roll(targets, trunc_input_len)
@@ -512,6 +513,7 @@ def to_ul2r_tokens(
     task_params: jnp.ndarray,
     tokens: jnp.ndarray,
     length: int,
+    pad_token_id: int,
     sentinel_token_ids: jnp.ndarray,
     # TODO maybe we don't actually need the truncation logic in
     # to_ul2r_rx_tokens given that we truncate while packing
@@ -547,6 +549,7 @@ def rx_tokens():
             noise_density,
             mean_noise_span_length,
             True,
+            pad_token_id,
             sentinel_token_ids,
             max_length - 1,
         )
@@ -616,13 +619,18 @@ def ul2r_loss_mask(
 def compute_denoising_length(
     task_params: jnp.ndarray,
     length: jnp.ndarray,
+    random_roll: bool,
 ) -> jnp.ndarray:
     def rx_length() -> jnp.ndarray:
         noise_density = RXDenoisingConfig.mask_prob_from_task_params(task_params)
         mean_noise_span_length = RXDenoisingConfig.mean_span_length_from_task_params(task_params)
         _num_noise_tokens, num_noise_spans, _num_nonnoise_tokens = num_noise_spans_tokens_and_spans(
             length, noise_density, mean_noise_span_length
         )
+        # When random_roll is True, we might create an additional noise span by
+        # rolling a noise span so that it is cut by the beginning/end. Reserve
+        # space for it.
+        num_noise_spans = jax.lax.select(random_roll, num_noise_spans + 1, num_noise_spans)
         # [task_token] one <sentinel_0> three <sentinel_0> two
         return 1 + 2 * num_noise_spans + length
 
@@ -678,7 +686,7 @@ def prepare_segment(id: int) -> tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray]:
         # batch independently (but in a way that matches how we computed
         # lengths for packing).
         task_idx = task_indices[id]
-        out_length = compute_denoising_length(task_params[task_idx], in_length)
+        out_length = compute_denoising_length(task_params[task_idx], in_length, False)
 
         return in_start, in_length, out_length
 
@@ -711,8 +719,11 @@ def process_segment(key: PRNGKeyArray, id: int) -> tuple[jnp.ndarray, jnp.ndarra
         out_start = typing.cast(int, jnp.squeeze(out_starts[idx]))
 
         segment = jnp.roll(tokens.array, -in_start)
+        # TODO this should return the actual length, not just out_length which
+        # might include an extra token? Or we could just use padding. Loss
+        # shouldn't be computed on padding anyway.
         inputs_len, denoising_tokens = to_ul2r_tokens(
-            key, task_params[task_idx], segment, in_length, sentinel_token_ids, QPos.size
+            key, task_params[task_idx], segment, in_length, pad_token_id, sentinel_token_ids, QPos.size
         )
 
         n_tokens = tokens.array.shape[0]
@@ -825,7 +836,7 @@ def diff_offsets(offsets: np.ndarray):
         # to turn each input batch into a denoising batch while still staying
         # under the max sequence length for the model.
         def _compute_length(task_idx: jnp.ndarray, length: jnp.ndarray) -> int:
-            return compute_denoising_length(task_params[task_idx], length)
+            return compute_denoising_length(task_params[task_idx], length, False)
 
         out_token_counts = jax.vmap(_compute_length)(task_indices, in_token_counts)
         out_lengths = {**in_lengths, "input_ids": out_token_counts}
diff --git a/tests/test_ul2r.py b/tests/test_ul2r.py
@@ -18,6 +18,7 @@
     preprocessor_for_format,
 )
 from levanter.data.ul2r import (
+    RX_TASK_KIND,
     TokenizedDict,
     compute_denoising_length,
     noise_span_to_unique_sentinel,
@@ -291,7 +292,7 @@ def test_noise_span_to_unique_sentinel():
     )
     noise_mask = jnp.pad(noise_mask, (0, padded_length - 10), constant_values=False)
 
-    result = noise_span_to_unique_sentinel(tokens, noise_mask, sentinel_tokens, 10, force_initial_sentinel=False)
+    result = noise_span_to_unique_sentinel(tokens, 10, noise_mask, pad_token_id, sentinel_tokens, force_initial_sentinel=False)
 
     expected = jnp.array([100, 13, 14, 15, 16, 17, 18, 19])
     np.testing.assert_array_equal(result[:8], expected)
@@ -323,7 +324,7 @@ def test_noise_span_to_unique_sentinel():
     )
     noise_mask = jnp.pad(noise_mask, (0, padded_length - 15), constant_values=False)
 
-    result = noise_span_to_unique_sentinel(tokens, noise_mask, sentinel_tokens, 15, force_initial_sentinel=True)
+    result = noise_span_to_unique_sentinel(tokens, 15, noise_mask, pad_token_id, sentinel_tokens, force_initial_sentinel=True)
 
     # Should still start with sentinel
     expected = jnp.array([100, 10, 101, 13, 14, 15, 102, 17, 18, 19, 103, 23, 24])
@@ -335,7 +336,7 @@ def test_noise_span_to_unique_sentinel():
     tokens = jnp.pad(tokens, (0, padded_length - 10), constant_values=pad_token_id)
     noise_mask = jnp.zeros(padded_length, dtype=jnp.bool_)
 
-    result = noise_span_to_unique_sentinel(tokens, noise_mask, sentinel_tokens, 10, force_initial_sentinel=False)
+    result = noise_span_to_unique_sentinel(tokens, 10, noise_mask, pad_token_id, sentinel_tokens, force_initial_sentinel=False)
 
     # Should be unchanged except for padding
     np.testing.assert_array_equal(result[:10], jnp.arange(10, 20))
@@ -405,16 +406,18 @@ def test_to_ul2r_rx_tokens():
 
     inputs = noise_span_to_unique_sentinel(
         tokens,
+        length,
         noise_mask,
+        pad_token_id,
         sentinel_tokens,
-        length,
         force_initial_sentinel=False,
     )
     targets = noise_span_to_unique_sentinel(
         tokens,
+        length,
         ~noise_mask,
+        pad_token_id,
         sentinel_tokens,
-        length,
         force_initial_sentinel=True,
     )
 
@@ -430,6 +433,7 @@ def test_to_ul2r_rx_tokens():
         mask_prob=0.3,
         mean_noise_span_length=3.0,
         random_roll=False,
+        pad_token_id=pad_token_id,
         sentinel_token_ids=sentinel_tokens,
         max_length=max_length,
     )
@@ -502,6 +506,7 @@ def test_to_ul2r_rx_tokens_roll():
             mask_prob=0.3,
             mean_noise_span_length=3.0,
             random_roll=True,
+            pad_token_id=pad_token_id,
             sentinel_token_ids=sentinel_ids,
             max_length=max_length,
         )
@@ -530,6 +535,51 @@ def test_to_ul2r_rx_tokens_roll():
         assert jnp.any(jnp.isin(sentinel_ids, targets))
 
 
+def test_compute_denoising_length_rx_random_roll():
+    """
+    Test that compute_denoising_length with random_roll=True reserves enough
+    space and sets pad_token_id when it doesn't create an extra span.
+
+    When random_roll=True, we reserve space for an extra span. However, rolling doesn't
+    always create an additional span, so we should see both cases:
+    - no pad_token_id (rolling created an extra span)
+    - 2 pad_token_ids (rolling did not create an extra span)
+    """
+    max_length = 16
+    pad_token_id = 999  # Use non-zero pad token to verify it's actually being used
+    sentinel_ids = jnp.arange(100, 120)
+
+    length = 12
+    tokens = jnp.arange(1, length + 1)
+    tokens = jnp.pad(tokens, (0, max_length - length), constant_values=pad_token_id)
+
+    mask_prob = 0.3
+    mean_noise_span_length = 3.0
+    task_params = jnp.array([RX_TASK_KIND, R_TASK_TOKEN_ID, mask_prob, mean_noise_span_length])
+
+    predicted_length = compute_denoising_length(task_params, length, random_roll=True)
+
+    padding_counts = []
+    for i in range(16):
+        key = jax.random.PRNGKey(i)
+        _input_length, result = to_ul2r_rx_tokens(
+            key, tokens, length, mask_prob, mean_noise_span_length, True,
+            pad_token_id, sentinel_ids, max_length
+        )
+
+        # Subtract 1 because `result` doesn't include the task token.
+        # print(result[:predicted_length - 1])
+        num_padding = jnp.sum(result[:predicted_length - 1] == pad_token_id)
+        padding_counts.append(int(num_padding))
+
+        actual_length = jnp.sum(result != pad_token_id)
+        assert actual_length <= predicted_length
+
+    assert any(p == 0 for p in padding_counts)
+    assert any(p == 2 for p in padding_counts)
+    assert all(p == 0 or p == 2 for p in padding_counts)
+
+
 def test_ul2r_loss_mask():
     # Test case 1: Simple single segment
     input_masks = jnp.array([1, 1, 0, 0])  # First 2 are inputs
@@ -597,6 +647,7 @@ def test_to_ul2r_rx_tokens_truncates_both_sections_and_contains_sentinels():
         mask_prob=mask_prob,
         mean_noise_span_length=mean_noise_span_length,
         random_roll=random_roll,
+        pad_token_id=pad_token_id,
         sentinel_token_ids=sentinel_tokens,
         max_length=padded_length,
     )
@@ -612,6 +663,7 @@ def test_to_ul2r_rx_tokens_truncates_both_sections_and_contains_sentinels():
         mask_prob=mask_prob,
         mean_noise_span_length=mean_noise_span_length,
         random_roll=random_roll,
+        pad_token_id=pad_token_id,
         sentinel_token_ids=sentinel_tokens,
         max_length=max_length,
     )
@@ -687,9 +739,9 @@ def test_create_ul2r_example():
     in_len_s = 5
     in_len = in_len_r + in_len_x + in_len_s
 
-    out_len_r = compute_denoising_length(task_params[0], in_len_r)
-    out_len_x = compute_denoising_length(task_params[1], in_len_x)
-    out_len_s = compute_denoising_length(task_params[2], in_len_s)
+    out_len_r = compute_denoising_length(task_params[0], in_len_r, False)
+    out_len_x = compute_denoising_length(task_params[1], in_len_x, False)
+    out_len_s = compute_denoising_length(task_params[2], in_len_s, False)
 
     tokens = jnp.concatenate(
         [
@@ -754,7 +806,7 @@ def prepare_segment(id: int) -> tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray]:
         # batch independently (but in a way that matches how we computed
         # lengths for packing).
         task_idx = task_indices[id]
-        out_length = compute_denoising_length(task_params[task_idx], in_length)
+        out_length = compute_denoising_length(task_params[task_idx], in_length, False)
 
         return in_start, in_length, out_length
 
@@ -790,7 +842,7 @@ def process_segment(key, id: int) -> tuple[jnp.ndarray, jnp.ndarray, int, int]:
 
         segment = jnp.roll(tokens.array, -in_start)
         print(key, task_params[task_idx], segment, in_length, QPos.size)
-        inputs_len, denoising_tokens = to_ul2r_tokens(key, task_params[task_idx], segment, in_length, SENTINEL_TOKEN_IDS, QPos.size)
+        inputs_len, denoising_tokens = to_ul2r_tokens(key, task_params[task_idx], segment, in_length, pad_token_id, SENTINEL_TOKEN_IDS, QPos.size)
 
         n_tokens = tokens.array.shape[0]
         input_mask = jnp.arange(n_tokens) < inputs_len