
Commit 6a8a027

Remove pad_max_tiles in CLIP (#1836)
1 parent a6fd945 commit 6a8a027

File tree

8 files changed: +19 -43 lines changed


recipes/configs/llama3_2_vision/11B_full.yaml

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ tokenizer:
   _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
   path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model
   image_size: 560
+  max_seq_len: 8192
 
 # Checkpointer
 checkpointer:

recipes/configs/llama3_2_vision/11B_full_single_device.yaml

Lines changed: 1 addition & 0 deletions

@@ -30,6 +30,7 @@ tokenizer:
   _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
   path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model
   image_size: 560
+  max_seq_len: 8192
 
 # Checkpointer
 checkpointer:

recipes/configs/llama3_2_vision/11B_lora.yaml

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@ tokenizer:
   _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
   path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model
   image_size: 560
+  max_seq_len: 8192
 
 # Checkpointer
 checkpointer:

recipes/configs/llama3_2_vision/11B_lora_single_device.yaml

Lines changed: 1 addition & 0 deletions

@@ -32,6 +32,7 @@ tokenizer:
   _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
   path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model
   image_size: 560
+  max_seq_len: 8192
 
 # Checkpointer
 checkpointer:
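
All four configs pin the same field. As a rough illustration, the updated tokenizer block resolves to a Python call like the one below (a sketch only; the keyword names are taken from the YAML keys above and are assumed to match the builder's signature):

from torchtune.models.llama3_2_vision import llama3_2_vision_transform

# Mirrors the config block above: same tokenizer path and image size,
# with the newly added max_seq_len cap.
transform = llama3_2_vision_transform(
    path="/tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model",
    image_size=560,
    max_seq_len=8192,
)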

tests/torchtune/models/clip/test_clip_image_transform.py

Lines changed: 4 additions & 36 deletions

@@ -37,17 +37,6 @@ class TestCLIPImageTransform:
                 "expected_tile_max": [1.0, 1.0],
                 "expected_tile_min": [0.0, 0.0],
                 "expected_aspect_ratio": [1, 2],
-                "pad_max_tiles": False,
-            },
-            {
-                "image_size": (100, 400, 3),
-                "expected_shape": torch.Size([4, 3, 224, 224]),
-                "resize_to_max_canvas": False,
-                "expected_tile_means": [0.2230, 0.1763, 0.0, 0.0],
-                "expected_tile_max": [1.0, 1.0, 0.0, 0.0],
-                "expected_tile_min": [0.0, 0.0, 0.0, 0.0],
-                "expected_aspect_ratio": [1, 2],
-                "pad_max_tiles": True,
             },
             {
                 "image_size": (1000, 300, 3),
@@ -57,7 +46,6 @@ class TestCLIPImageTransform:
                 "expected_tile_max": [0.9705, 0.9694, 0.9521, 0.9314],
                 "expected_tile_min": [0.0353, 0.0435, 0.0528, 0.0],
                 "expected_aspect_ratio": [4, 1],
-                "pad_max_tiles": False,
             },
             {
                 "image_size": (200, 200, 3),
@@ -67,7 +55,6 @@ class TestCLIPImageTransform:
                 "expected_tile_max": [0.9922, 0.9926, 0.9970, 0.9908],
                 "expected_tile_min": [0.0056, 0.0069, 0.0059, 0.0033],
                 "expected_aspect_ratio": [2, 2],
-                "pad_max_tiles": False,
                 "pad_tiles": 1,
             },
             {
@@ -78,17 +65,6 @@ class TestCLIPImageTransform:
                 "expected_tile_max": [1.0, 1.0, 1.0],
                 "expected_tile_min": [0.0, 0.0, 0.0],
                 "expected_aspect_ratio": [3, 1],
-                "pad_max_tiles": False,
-            },
-            {
-                "image_size": (600, 200, 3),
-                "expected_shape": torch.Size([4, 3, 224, 224]),
-                "resize_to_max_canvas": False,
-                "expected_tile_means": [0.4473, 0.4469, 0.3032, 0.0],
-                "expected_tile_max": [1.0, 1.0, 1.0, 0.0],
-                "expected_tile_min": [0.0, 0.0, 0.0, 0.0],
-                "expected_aspect_ratio": [3, 1],
-                "pad_max_tiles": True,
             },
         ],
     )
@@ -103,7 +79,6 @@ def test_clip_image_transform(self, params):
             resample="bilinear",
             dtype=torch.float32,
             resize_to_max_canvas=params["resize_to_max_canvas"],
-            pad_max_tiles=params["pad_max_tiles"],
         )
 
         image_transform_inference = CLIPImageTransformInference(
@@ -115,7 +90,6 @@ def test_clip_image_transform(self, params):
             resample="bilinear",
             resize_to_max_canvas=params["resize_to_max_canvas"],
             antialias=True,
-            pad_max_tiles=params["pad_max_tiles"],
         )
 
         # Generate a deterministic image using np.arange for reproducibility
@@ -169,13 +143,7 @@ def test_clip_image_transform(self, params):
         ), f"Expected aspect ratio {params['expected_aspect_ratio']} but got {tuple(output_ar.numpy())}"
 
         # number of tiles matches the product of the aspect ratio
-        if params["pad_max_tiles"]:
-            # max_num_tiles=4.
-            assert (
-                4 == output_image.shape[0]
-            ), f"Expected 4 tiles but got {output_image.shape[0]}"
-        else:
-            expected_num_tiles = output_ar[0] * output_ar[1]
-            assert (
-                expected_num_tiles == output_image.shape[0]
-            ), f"Expected {expected_num_tiles} tiles but got {output_image.shape[0]}"
+        expected_num_tiles = output_ar[0] * output_ar[1]
+        assert (
+            expected_num_tiles == output_image.shape[0]
+        ), f"Expected {expected_num_tiles} tiles but got {output_image.shape[0]}"

torchtune/data/_collate.py

Lines changed: 11 additions & 0 deletions

@@ -222,6 +222,7 @@ def padded_collate_tiled_images_and_mask(
     padding_idx: int = 0,
     ignore_idx: int = CROSS_ENTROPY_IGNORE_IDX,
     pad_direction: str = "right",
+    pad_max_tiles: Optional[int] = None,
     pad_max_images: Optional[int] = None,
 ) -> Dict[str, torch.Tensor]:
     """Pad a batch of text sequences, tiled image tensors, aspect ratios,
@@ -259,6 +260,8 @@ def padded_collate_tiled_images_and_mask(
             :func:`torch.nn.utils.rnn.pad_sequence`, otherwise if ``pad_direction="left"``,
             we use :func:`torchtune.data.left_pad_sequence`. For training, we typically want to pad from the right.
             For inference, we typically want to pad from the left. Defaults to "right".
+        pad_max_tiles (Optional[int]): Maximum number of tiles to pad to. If None, will pad to the largest number of tiles
+            in the batch. Defaults to None.
         pad_max_images (Optional[int]): Maximum number of images to pad to. If None, will pad to the largest number of images
             in the batch. Defaults to None.
 
@@ -272,6 +275,7 @@ def padded_collate_tiled_images_and_mask(
 
     Raises:
         ValueError: if ``pad_direction`` is not one of "left" or "right".
+        ValueError: if pad_max_tiles is set to a value less than the largest number of tiles in an image.
 
     Example:
         >>> image_id = 1
@@ -355,6 +359,13 @@ def padded_collate_tiled_images_and_mask(
         for sample in batch
         for image in sample["encoder_input"]["images"]
     )
+    if pad_max_tiles is not None:
+        if pad_max_tiles < max_num_tiles:
+            raise ValueError(
+                f"More tiles in image {max_num_tiles}, than pad_max_tiles {pad_max_tiles}"
+            )
+        max_num_tiles = pad_max_tiles
+
     # Second loop: pad images and masks to max number of tiles, max text seq len in batch
     batch_images = []
     batch_masks = []
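
With this change, padding every image to a fixed tile count is requested at collation time instead of inside the image transform. A minimal sketch of wiring the new argument into a DataLoader (the dataset and batch size are placeholders; the function and argument names come from the diff above, and the import path is assumed from the module location):

from functools import partial

from torch.utils.data import DataLoader
from torchtune.data import padded_collate_tiled_images_and_mask

# Pad every image in the batch to 4 tiles. If any image already has more
# than 4 tiles, the check added above raises a ValueError.
collate_fn = partial(
    padded_collate_tiled_images_and_mask,
    pad_direction="right",
    pad_max_tiles=4,
)

# dataset is a placeholder for any dataset yielding the expected sample dicts.
# loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)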

torchtune/models/clip/_transform.py

Lines changed: 0 additions & 6 deletions

@@ -15,7 +15,6 @@
     find_supported_resolutions,
     get_canvas_best_fit,
 )
-from torchtune.modules.transforms.vision_utils.pad_dim_to_size import pad_dim_to_size
 from torchtune.modules.transforms.vision_utils.resize_with_pad import resize_with_pad
 from torchtune.modules.transforms.vision_utils.tile_crop import tile_crop
 
@@ -63,7 +62,6 @@ class CLIPImageTransform:
             This will be used to generate possible_resolutions,
             e.g. [(224, 224), (224, 448), (448, 224)] if max_num_tiles = 2 and tile_size = 224.
             Default 4.
-        pad_max_tiles (bool): If True, the image will be padded to have tiles == max_num_tiles. Default False.
         dtype (torch.dtype): Data type of the output image. Default torch.bfloat16.
         resample (str): Resampling method used when resizing images. Supports any enum of
             ``torchvision.transforms.InterpolationMode``, e.g. "nearest", "nearest_exact", "bilinear", "bicubic".
@@ -101,7 +99,6 @@ def __init__(
         possible_resolutions: Optional[List[Tuple[int, int]]] = None,
         tile_size: int = 224,
         max_num_tiles: Optional[int] = 4,
-        pad_max_tiles: bool = False,
         dtype: torch.dtype = torch.bfloat16,
         resample: str = "bilinear",
         resize_to_max_canvas: bool = False,
@@ -142,7 +139,6 @@ def __init__(
         # tile_crop
         self.tile_size = tile_size
         self.tile_crop = tile_crop
-        self.pad_tile_size = max_num_tiles if pad_max_tiles else None
 
     def __call__(
         self, sample: Mapping[str, Any], inference: bool = False
@@ -190,8 +186,6 @@ def __call__(
 
         # Divide the image into equally sized tiles
         image = self.tile_crop(image=image, tile_size=self.tile_size)
-        if self.pad_tile_size:
-            image = pad_dim_to_size(image, size=self.pad_tile_size, dim=0)
 
         aspect_ratio = torch.tensor(best_resolution).reshape(-1) // self.tile_size
 
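
After this change the transform always returns exactly as many tiles as the best-fit canvas implies (the product of the aspect ratio); callers who need a fixed tile count pad at collation time instead. A short sketch of constructing the transform with the arguments that remain (values are illustrative, not prescriptive):

import torch

from torchtune.models.clip._transform import CLIPImageTransform

# Only arguments still present after this commit; pad_max_tiles is gone.
transform = CLIPImageTransform(
    tile_size=224,
    max_num_tiles=4,
    dtype=torch.float32,
    resample="bilinear",
    resize_to_max_canvas=False,
)
# e.g. a best-fit canvas of 448x224 now yields 2 tiles (aspect ratio [2, 1])
# rather than being padded up to max_num_tiles.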

torchtune/models/llama3_2_vision/_transform.py

Lines changed: 0 additions & 1 deletion

@@ -86,7 +86,6 @@ def __init__(
             tile_size=tile_size,
             possible_resolutions=None,
             max_num_tiles=max_num_tiles,
-            pad_max_tiles=True,
             resample="bilinear",
             resize_to_max_canvas=False,
         )
