[None][fix] Fix batching bug in Mistral3 model (NVIDIA#6841)

2ez4bz · web-flow · commit 7ebb770dcedb · 2025-08-14T02:15:44.000-04:00
Prior to this commit, if multiple requests with images were in the same
batch, the batching logic for the images would fail.

This commit fixes it, and adds unit tests for it that were verified to
fail prior to the fix.

Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/models/modeling_mistral.py b/tensorrt_llm/_torch/models/modeling_mistral.py
@@ -373,7 +373,7 @@ def forward(
                     f"Expected as many `pixel_values` ({len(pixel_values)}) and "
                     f"`image_sizes` ({len(image_sizes)}) as number of multimodal parameters "
                     f"({multimodal_params_len}).")
-            batched_pixel_values, batched_image_sizes = self._batch_pixel_values(
+            batched_pixel_values, batched_image_sizes = self.batch_pixel_values(
                 pixel_values=pixel_values, image_sizes=image_sizes)
             mm_embeds = [
                 self._get_image_features(pixel_values=batched_pixel_values,
@@ -440,21 +440,38 @@ def _get_image_features(
     # (the transformers one expected numpy arrays).
     @staticmethod
     @torch.inference_mode()
-    def _batch_pixel_values(
+    def batch_pixel_values(
         pixel_values: List[torch.Tensor],
         image_sizes: List[torch.Tensor],
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        # NOTES:
+        # * `pixel_values` is a list of `[B_idx, C, H_idx, W_idx]` tensors, i.e. a batch of images as
+        #   padded + batched by the input processor.
+        #   The height (H_idx) and width (W_idx) of each element need not coincide.
+        # * Similarly, each element in `image_sizes` describes the original image sizes prior to
+        #   padding for the corresponding element in `pixel_values`.
+
+        # The below creates a single `[sum(B_idx), 2]` tensor describing all image sizes, and then
+        # calculates the maximum height / width across all of them.
         batched_image_sizes = torch.cat(image_sizes)
         max_shape = batched_image_sizes.max(dim=0).values
+
+        # This next step then pads the pixel values potentially a second time by using the `max_shape`
+        # computed above. Note that as far as this function is concerned, the original sizes for
+        # batching purposes can be deduced from looking at the tensors in `pixel_values`, NOT in
+        # `image_sizes`.
         pixel_values = [
             torchvision.transforms.v2.functional.pad(
                 image,
                 # Per torchvision docs, this should be in LTRB order if it's a sequence of 4 numbers.
-                padding=[0, 0, max_shape[1] - size[1], max_shape[0] - size[0]],
+                padding=[
+                    0, 0, max_shape[1] - image.shape[-1],
+                    max_shape[0] - image.shape[-2]
+                ],
                 # Values extracted from HF implementation.
                 fill=0.0,
                 padding_mode="constant",
-            ) for image, size in zip(pixel_values, batched_image_sizes)
+            ) for image in pixel_values
         ]
         return torch.cat(pixel_values), batched_image_sizes
 
diff --git a/tests/unittest/_torch/modeling/test_modeling_mistral.py b/tests/unittest/_torch/modeling/test_modeling_mistral.py
@@ -438,3 +438,58 @@ def run_forward(input_ids, position_ids, attn_metadata):
             )
 
         torch.testing.assert_close(logits, ref.logits[:, -1].float(), atol=0.4, rtol=0.4)
+
+
+@pytest.mark.parametrize(
+    "in_shapes, image_sizes, expected_out_shape",
+    [
+        (
+            [(2, 3, 100, 150), (1, 3, 200, 100), (3, 3, 120, 180)],
+            [
+                [[92, 150], [100, 73]],
+                [[200, 100]],
+                [[37, 130], [120, 83], [73, 180]],
+            ],
+            [6, 3, 200, 180],
+        ),
+        # Single batch, single image.
+        (
+            [(1, 3, 64, 128)],
+            [[[64, 128]]],
+            [1, 3, 64, 128],
+        ),
+        # Same max size across batches.
+        (
+            [(2, 3, 59, 59), (1, 3, 59, 59), (5, 3, 59, 59)],
+            [
+                [[13, 59], [59, 17]],
+                [[59, 59]],
+                [[19, 29], [59, 31], [17, 54], [13, 59], [11, 37]],
+            ],
+            [8, 3, 59, 59],
+        ),
+    ],
+)
+def test_batch_pixel_values(in_shapes, image_sizes, expected_out_shape):
+    # Build random pixel tensors and size tensors for the current parametrized case.
+    pixel_values = [torch.randn(*shape) for shape in in_shapes]
+    image_sizes = [torch.tensor(size) for size in image_sizes]
+
+    batched_pixels, batched_sizes = modeling_mistral.Mistral3VLM.batch_pixel_values(
+        pixel_values, image_sizes
+    )
+
+    # Check output shapes
+    assert list(batched_pixels.shape) == expected_out_shape
+    assert list(batched_sizes.shape) == [expected_out_shape[0], 2]
+
+    # Check that the original image data is preserved (with padding).
+    start_idx = 0
+    for original_values in pixel_values:
+        batch_size = original_values.shape[0]
+        end_idx = start_idx + batch_size
+        orig_h, orig_w = original_values.shape[-2:]
+        padded_values = batched_pixels[start_idx:end_idx, :, :orig_h, :orig_w]
+        torch.testing.assert_close(padded_values, original_values)
+
+        start_idx += batch_size