@@ -31,6 +31,7 @@ class TestCLIPImageTransform:
         [
             {
                 "image_size": (100, 400, 3),
+                "image_type": "PIL.Image",
                 "expected_shape": torch.Size([2, 3, 224, 224]),
                 "resize_to_max_canvas": False,
                 "expected_tile_means": [0.2230, 0.1763],
@@ -40,6 +41,7 @@ class TestCLIPImageTransform:
             },
             {
                 "image_size": (1000, 300, 3),
+                "image_type": "PIL.Image",
                 "expected_shape": torch.Size([4, 3, 224, 224]),
                 "resize_to_max_canvas": True,
                 "expected_tile_means": [0.5007, 0.4995, 0.5003, 0.1651],
@@ -49,6 +51,7 @@ class TestCLIPImageTransform:
             },
             {
                 "image_size": (200, 200, 3),
+                "image_type": "PIL.Image",
                 "expected_shape": torch.Size([4, 3, 224, 224]),
                 "resize_to_max_canvas": True,
                 "expected_tile_means": [0.5012, 0.5020, 0.5011, 0.4991],
@@ -59,6 +62,48 @@ class TestCLIPImageTransform:
             },
             {
                 "image_size": (600, 200, 3),
+                "image_type": "torch.Tensor",
+                "expected_shape": torch.Size([3, 3, 224, 224]),
+                "resize_to_max_canvas": False,
+                "expected_tile_means": [0.4473, 0.4469, 0.3032],
+                "expected_tile_max": [1.0, 1.0, 1.0],
+                "expected_tile_min": [0.0, 0.0, 0.0],
+                "expected_aspect_ratio": [3, 1],
+            },
+            {
+                "image_size": (100, 400, 3),
+                "image_type": "torch.Tensor",
+                "expected_shape": torch.Size([2, 3, 224, 224]),
+                "resize_to_max_canvas": False,
+                "expected_tile_means": [0.2230, 0.1763],
+                "expected_tile_max": [1.0, 1.0],
+                "expected_tile_min": [0.0, 0.0],
+                "expected_aspect_ratio": [1, 2],
+            },
+            {
+                "image_size": (1000, 300, 3),
+                "image_type": "torch.Tensor",
+                "expected_shape": torch.Size([4, 3, 224, 224]),
+                "resize_to_max_canvas": True,
+                "expected_tile_means": [0.5007, 0.4995, 0.5003, 0.1651],
+                "expected_tile_max": [0.9705, 0.9694, 0.9521, 0.9314],
+                "expected_tile_min": [0.0353, 0.0435, 0.0528, 0.0],
+                "expected_aspect_ratio": [4, 1],
+            },
+            {
+                "image_size": (200, 200, 3),
+                "image_type": "torch.Tensor",
+                "expected_shape": torch.Size([4, 3, 224, 224]),
+                "resize_to_max_canvas": True,
+                "expected_tile_means": [0.5012, 0.5020, 0.5011, 0.4991],
+                "expected_tile_max": [0.9922, 0.9926, 0.9970, 0.9908],
+                "expected_tile_min": [0.0056, 0.0069, 0.0059, 0.0033],
+                "expected_aspect_ratio": [2, 2],
+                "pad_tiles": 1,
+            },
+            {
+                "image_size": (600, 200, 3),
+                "image_type": "torch.Tensor",
                 "expected_shape": torch.Size([3, 3, 224, 224]),
                 "resize_to_max_canvas": False,
                 "expected_tile_means": [0.4473, 0.4469, 0.3032],
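
For reference, the expected values in these cases follow a simple invariant: `expected_shape` is `(num_tiles, channels, tile_size, tile_size)` and `expected_aspect_ratio` appears to be the tile grid as `[rows, cols]`, so the leading dimension equals `rows * cols`. A small self-contained check over values copied from the new tensor-input cases (illustrative only, not part of the PR):

```python
import torch

# Expected outputs copied from the tensor-input cases above.
cases = [
    {"expected_shape": torch.Size([3, 3, 224, 224]), "expected_aspect_ratio": [3, 1]},
    {"expected_shape": torch.Size([2, 3, 224, 224]), "expected_aspect_ratio": [1, 2]},
    {"expected_shape": torch.Size([4, 3, 224, 224]), "expected_aspect_ratio": [4, 1]},
    {"expected_shape": torch.Size([4, 3, 224, 224]), "expected_aspect_ratio": [2, 2]},
]

for case in cases:
    n_tiles, channels, tile_h, tile_w = case["expected_shape"]
    rows, cols = case["expected_aspect_ratio"]
    assert (channels, tile_h, tile_w) == (3, 224, 224)  # fixed 224x224 RGB tiles
    assert n_tiles == rows * cols  # the tile grid accounts for the leading dimension
```
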
@@ -99,7 +144,10 @@ def test_clip_image_transform(self, params):
             .reshape(image_size)
             .astype(np.uint8)
         )
-        image = PIL.Image.fromarray(image)
+        if params["image_type"] == "PIL.Image":
+            image = PIL.Image.fromarray(image)
+        elif params["image_type"] == "torch.Tensor":
+            image = torch.from_numpy(image).permute(2, 0, 1)

         # Apply the transformation
         output = image_transform({"image": image})
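
The only behavioral change in the test body is how the synthetic input is materialized before it reaches the transform: the same HWC uint8 array is either wrapped as a PIL image or permuted into a CHW tensor. A minimal sketch of that branch in isolation (the `build_image` helper and the seeded generator are illustrative, not from the PR):

```python
import numpy as np
import PIL.Image
import torch


def build_image(image_size: tuple, image_type: str, seed: int = 0):
    """Build a deterministic HWC uint8 image and wrap it as the requested type."""
    rng = np.random.default_rng(seed)
    # Equivalent to randint(...).reshape(image_size).astype(np.uint8) in the test.
    array = rng.integers(0, 256, size=image_size, dtype=np.uint8)
    if image_type == "PIL.Image":
        return PIL.Image.fromarray(array)  # HWC PIL image
    elif image_type == "torch.Tensor":
        return torch.from_numpy(array).permute(2, 0, 1)  # HWC -> CHW tensor
    raise ValueError(f"Unsupported image_type: {image_type}")


pil_image = build_image((100, 400, 3), "PIL.Image")
tensor_image = build_image((100, 400, 3), "torch.Tensor")  # shape (3, 100, 400)
```

Either return value is then passed to the transform as `image_transform({"image": image})`, exactly as the test does.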