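Removed pad_max_tiles from CLIPImageTransform; padding to a fixed number of tiles is now done in padded_collate_tiled_images_and_mask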

pbontrager · pbontrager · commit 7fb6c3ec240c · 2024-10-15T10:25:05.000-07:00
diff --git a/torchtune/data/_collate.py b/torchtune/data/_collate.py
@@ -222,6 +222,7 @@ def padded_collate_tiled_images_and_mask(
     padding_idx: int = 0,
     ignore_idx: int = CROSS_ENTROPY_IGNORE_IDX,
     pad_direction: str = "right",
+    pad_max_tiles: Optional[int] = None,
     pad_max_images: Optional[int] = None,
 ) -> Dict[str, torch.Tensor]:
     """Pad a batch of text sequences, tiled image tensors, aspect ratios,
@@ -259,6 +260,8 @@ def padded_collate_tiled_images_and_mask(
             :func:`torch.nn.utils.rnn.pad_sequence`, otherwise if ``pad_direction="left"``,
             we use :func:`torchtune.data.left_pad_sequence`. For training, we typically want to pad from the right.
             For inference, we typically want to pad from the left. Defaults to "right".
+        pad_max_tiles (Optional[int]): Maximum number of tiles to pad to. If None, will pad to the largest number of tiles
+            in the batch. Defaults to None.
         pad_max_images (Optional[int]): Maximum number of images to pad to. If None, will pad to the largest number of images
             in the batch. Defaults to None.
 
@@ -272,6 +275,7 @@ def padded_collate_tiled_images_and_mask(
 
     Raises:
         ValueError: if ``pad_direction`` is not one of "left" or "right".
+        ValueError: if ``pad_max_tiles`` is set to a value less than the largest number of tiles in any image in the batch.
 
     Example:
         >>> image_id = 1
@@ -355,6 +359,13 @@ def padded_collate_tiled_images_and_mask(
         for sample in batch
         for image in sample["encoder_input"]["images"]
     )
+    if pad_max_tiles is not None:
+        if pad_max_tiles < max_num_tiles:
+            raise ValueError(
+                f"More tiles in image {max_num_tiles}, than pad_max_tiles {pad_max_tiles}"
+            )
+        max_num_tiles = pad_max_tiles
+
     # Second loop: pad images and masks to max number of tiles, max text seq len in batch
     batch_images = []
     batch_masks = []
diff --git a/torchtune/models/clip/_transform.py b/torchtune/models/clip/_transform.py
@@ -15,7 +15,6 @@
     find_supported_resolutions,
     get_canvas_best_fit,
 )
-from torchtune.modules.transforms.vision_utils.pad_dim_to_size import pad_dim_to_size
 from torchtune.modules.transforms.vision_utils.resize_with_pad import resize_with_pad
 from torchtune.modules.transforms.vision_utils.tile_crop import tile_crop
 
@@ -63,7 +62,6 @@ class CLIPImageTransform:
             This will be used to generate possible_resolutions,
             e.g. [(224, 224), (224, 448), (448, 224)] if max_num_tiles = 2 and tile_size = 224.
             Default 4.
-        pad_max_tiles (bool): If True, the image will be padded to have tiles == max_num_tiles. Default False.
         dtype (torch.dtype): Data type of the output image. Default torch.bfloat16.
         resample (str): Resampling method used when resizing images. Supports any enum of
             ``torchvision.transforms.InterpolationMode``, e.g. "nearest", "nearest_exact", "bilinear", "bicubic".
@@ -101,7 +99,6 @@ def __init__(
         possible_resolutions: Optional[List[Tuple[int, int]]] = None,
         tile_size: int = 224,
         max_num_tiles: Optional[int] = 4,
-        pad_max_tiles: bool = False,
         dtype: torch.dtype = torch.bfloat16,
         resample: str = "bilinear",
         resize_to_max_canvas: bool = False,
@@ -142,7 +139,6 @@ def __init__(
         # tile_crop
         self.tile_size = tile_size
         self.tile_crop = tile_crop
-        self.pad_tile_size = max_num_tiles if pad_max_tiles else None
 
     def __call__(
         self, sample: Mapping[str, Any], inference: bool = False
@@ -190,8 +186,6 @@ def __call__(
 
         # Divide the image into equally sized tiles
         image = self.tile_crop(image=image, tile_size=self.tile_size)
-        if self.pad_tile_size:
-            image = pad_dim_to_size(image, size=self.pad_tile_size, dim=0)
 
         aspect_ratio = torch.tensor(best_resolution).reshape(-1) // self.tile_size
 
diff --git a/torchtune/models/llama3_2_vision/_transform.py b/torchtune/models/llama3_2_vision/_transform.py
@@ -86,7 +86,6 @@ def __init__(
             tile_size=tile_size,
             possible_resolutions=None,
             max_num_tiles=max_num_tiles,
-            pad_max_tiles=True,
             resample="bilinear",
             resize_to_max_canvas=False,
         )

Original file line number	Diff line number	Diff line change
`@@ -86,7 +86,6 @@ def __init__(`
`86`	`86`	`tile_size=tile_size,`
`87`	`87`	`possible_resolutions=None,`
`88`	`88`	`max_num_tiles=max_num_tiles,`
`89`		`- pad_max_tiles=True,`
`90`	`89`	`resample="bilinear",`
`91`	`90`	`resize_to_max_canvas=False,`
`92`	`91`	`)`