
Commit d0394b7

Merge branch 'main' into fix-hardcoded-255
2 parents 402b01f + d8cec34 commit d0394b7

File tree: 7 files changed (+163, -53 lines)

.github/workflows/build-wheels-m1.yml

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+name: Build M1 Wheels
+
+on:
+  pull_request:
+  push:
+    branches:
+      - nightly
+  workflow_dispatch:
+
+jobs:
+  generate-matrix:
+    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
+    with:
+      package-type: wheel
+      os: macos-arm64
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+  build:
+    needs: generate-matrix
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: pytorch/vision
+            pre-script: ""
+            post-script: ""
+            smoke-test-script: test/smoke_test.py
+            package-name: torchvision
+    name: ${{ matrix.repository }}
+    uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@main
+    with:
+      repository: ${{ matrix.repository }}
+      ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
+      build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
+      pre-script: ${{ matrix.pre-script }}
+      post-script: ${{ matrix.post-script }}
+      package-name: ${{ matrix.package-name }}
+      runner-type: macos-m1-12
+      smoke-test-script: ${{ matrix.smoke-test-script }}
+      # Using "development" as trigger event so these binaries are not uploaded
+      # to official channels yet
+      trigger-event: development
+    secrets:
+      AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
+      AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
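Note: the heavy lifting is delegated to reusable workflows in pytorch/test-infra; the only repository-specific pieces are the matrix entry and the smoke-test script. As a rough sketch of what a wheel smoke test such as the referenced test/smoke_test.py might check (hypothetical, not the actual script):

```python
# Hypothetical smoke-test sketch: import the freshly installed wheel and run
# a tiny forward pass to confirm the compiled ops load on the M1 runner.
import torch
import torchvision

print(torch.__version__, torchvision.__version__)

model = torchvision.models.resnet18(weights=None).eval()
with torch.inference_mode():
    out = model(torch.rand(1, 3, 224, 224))
assert out.shape == (1, 1000)
```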

test/prototype_transforms_kernel_infos.py

Lines changed: 23 additions & 0 deletions
@@ -862,6 +862,27 @@ def sample_inputs_crop_video():
     yield ArgsKwargs(video_loader, top=4, left=3, height=7, width=8)


+def reference_crop_bounding_box(bounding_box, *, format, top, left, height, width):
+
+    affine_matrix = np.array(
+        [
+            [1, 0, -left],
+            [0, 1, -top],
+        ],
+        dtype="float32",
+    )
+
+    expected_bboxes = reference_affine_bounding_box_helper(bounding_box, format=format, affine_matrix=affine_matrix)
+    return expected_bboxes, (height, width)
+
+
+def reference_inputs_crop_bounding_box():
+    for bounding_box_loader, params in itertools.product(
+        make_bounding_box_loaders(extra_dims=((), (4,))), [_CROP_PARAMS[0], _CROP_PARAMS[-1]]
+    ):
+        yield ArgsKwargs(bounding_box_loader, format=bounding_box_loader.format, **params)
+
+
 KERNEL_INFOS.extend(
     [
         KernelInfo(
@@ -875,6 +896,8 @@ def sample_inputs_crop_video():
         KernelInfo(
             F.crop_bounding_box,
             sample_inputs_fn=sample_inputs_crop_bounding_box,
+            reference_fn=reference_crop_bounding_box,
+            reference_inputs_fn=reference_inputs_crop_bounding_box,
         ),
         KernelInfo(
             F.crop_mask,
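The reference kernel expresses cropping as a pure translation by (-left, -top). A self-contained sketch of that mapping for a single XYXY box (reference_affine_bounding_box_helper in the test suite generalizes this across formats and batch dimensions; the helper below is illustrative only):

```python
import numpy as np

# Apply the same 2x3 translation matrix as above to both corners of one XYXY box.
def crop_xyxy_reference(box, top, left):
    affine_matrix = np.array([[1, 0, -left], [0, 1, -top]], dtype="float32")
    x1, y1, x2, y2 = box
    corners = np.array([[x1, y1, 1], [x2, y2, 1]], dtype="float32")  # homogeneous coords
    return (corners @ affine_matrix.T).reshape(-1)

print(crop_xyxy_reference([10, 20, 30, 40], top=4, left=3))  # [ 7. 16. 27. 36.]
```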

test/test_extended_models.py

Lines changed: 20 additions & 0 deletions
@@ -1,3 +1,4 @@
+import copy
 import os

 import pytest
@@ -59,6 +60,25 @@ def test_get_model_weights(name, weight):
     assert models.get_model_weights(name) == weight


+@pytest.mark.parametrize("copy_fn", [copy.copy, copy.deepcopy])
+@pytest.mark.parametrize(
+    "name",
+    [
+        "resnet50",
+        "retinanet_resnet50_fpn_v2",
+        "raft_large",
+        "quantized_resnet50",
+        "lraspp_mobilenet_v3_large",
+        "mvit_v1_b",
+    ],
+)
+def test_weights_copyable(copy_fn, name):
+    model_weights = models.get_model_weights(name)
+    for weights in list(model_weights):
+        copied_weights = copy_fn(weights)
+        assert copied_weights is weights
+
+
 @pytest.mark.parametrize(
     "module", [models, models.detection, models.quantization, models.segmentation, models.video, models.optical_flow]
 )
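The new test pins down singleton semantics: copying a weights entry must return the identical object, not an equal clone. Standard library enum members already behave this way, which is the behavior being asserted for WeightsEnum; a minimal sketch:

```python
import copy
import enum

class Color(enum.Enum):
    RED = 1

# Enum members are singletons: both copy flavors hand back the same object.
assert copy.copy(Color.RED) is Color.RED
assert copy.deepcopy(Color.RED) is Color.RED
```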

test/test_prototype_transforms_functional.py

Lines changed: 5 additions & 9 deletions
@@ -900,7 +900,8 @@ def test_correctness_center_crop_bounding_box(device, output_size):
     def _compute_expected_bbox(bbox, output_size_):
         format_ = bbox.format
         spatial_size_ = bbox.spatial_size
-        bbox = convert_format_bounding_box(bbox, format_, features.BoundingBoxFormat.XYWH)
+        dtype = bbox.dtype
+        bbox = convert_format_bounding_box(bbox.float(), format_, features.BoundingBoxFormat.XYWH)

         if len(output_size_) == 1:
             output_size_.append(output_size_[-1])
@@ -913,14 +914,9 @@ def test_correctness_center_crop_bounding_box(device, output_size):
             bbox[2].item(),
             bbox[3].item(),
         ]
-        out_bbox = features.BoundingBox(
-            out_bbox,
-            format=features.BoundingBoxFormat.XYWH,
-            spatial_size=output_size_,
-            dtype=bbox.dtype,
-            device=bbox.device,
-        )
-        return convert_format_bounding_box(out_bbox, features.BoundingBoxFormat.XYWH, format_)
+        out_bbox = torch.tensor(out_bbox)
+        out_bbox = convert_format_bounding_box(out_bbox, features.BoundingBoxFormat.XYWH, format_)
+        return out_bbox.to(dtype=dtype, device=bbox.device)

     for bboxes in make_bounding_boxes(extra_dims=((4,),)):
         bboxes = bboxes.to(device)
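The added .float() cast matters because format conversion on integer tensors can silently truncate half-pixel values, e.g. for center-based formats. A minimal sketch of the hazard (values illustrative):

```python
import torch

box = torch.tensor([2, 2, 5, 5])  # XYXY, int64
cx_int = (box[0] + box[2]) // 2                   # 3, truncated by integer math
cx_float = (box.float()[0] + box.float()[2]) / 2  # 3.5, exact
print(cx_int.item(), cx_float.item())
```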

torchvision/models/_api.py

Lines changed: 3 additions & 0 deletions
@@ -75,6 +75,9 @@ def __getattr__(self, name):
             return object.__getattribute__(self.value, name)
         return super().__getattr__(name)

+    def __deepcopy__(self, memodict=None):
+        return self
+

 def get_weight(name: str) -> WeightsEnum:
     """

torchvision/prototype/transforms/functional/_geometry.py

Lines changed: 32 additions & 17 deletions
@@ -4,6 +4,7 @@

 import PIL.Image
 import torch
+from torch.nn.functional import interpolate
 from torchvision.prototype import features
 from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT
 from torchvision.transforms.functional import (
@@ -115,20 +116,37 @@ def resize_image_tensor(
     max_size: Optional[int] = None,
     antialias: bool = False,
 ) -> torch.Tensor:
+    align_corners: Optional[bool] = None
+    if interpolation == InterpolationMode.BILINEAR or interpolation == InterpolationMode.BICUBIC:
+        align_corners = False
+    elif antialias:
+        raise ValueError("Antialias option is supported for bilinear and bicubic interpolation modes only")
+
     shape = image.shape
     num_channels, old_height, old_width = shape[-3:]
     new_height, new_width = _compute_resized_output_size((old_height, old_width), size=size, max_size=max_size)

     if image.numel() > 0:
         image = image.reshape(-1, num_channels, old_height, old_width)

-        image = _FT.resize(
+        dtype = image.dtype
+        need_cast = dtype not in (torch.float32, torch.float64)
+        if need_cast:
+            image = image.to(dtype=torch.float32)
+
+        image = interpolate(
             image,
             size=[new_height, new_width],
-            interpolation=interpolation.value,
+            mode=interpolation.value,
+            align_corners=align_corners,
             antialias=antialias,
         )

+        if need_cast:
+            if interpolation == InterpolationMode.BICUBIC and dtype == torch.uint8:
+                image = image.clamp_(min=0, max=255)
+            image = image.round_().to(dtype=dtype)
+
     return image.reshape(shape[:-3] + (num_channels, new_height, new_width))
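resize_image_tensor now calls torch.nn.functional.interpolate directly instead of going through _FT.resize: non-float inputs are cast to float32 up front and rounded back afterwards, with an extra clamp for bicubic on uint8 since that mode can overshoot [0, 255]. A stripped-down sketch of the cast/resize/round round trip, assuming a uint8 input:

```python
import torch
from torch.nn.functional import interpolate

image = torch.randint(0, 256, (1, 3, 32, 32), dtype=torch.uint8)
out = interpolate(image.to(torch.float32), size=[64, 64], mode="bicubic", align_corners=False)
out = out.clamp_(0, 255).round_().to(torch.uint8)  # bicubic can overshoot the uint8 range
print(out.shape)  # torch.Size([1, 3, 64, 64])
```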

@@ -802,22 +820,17 @@ def crop_bounding_box(
     height: int,
     width: int,
 ) -> Tuple[torch.Tensor, Tuple[int, int]]:
-    # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every
-    # BoundingBoxFormat instead of converting back and forth
-    bounding_box = convert_format_bounding_box(
-        bounding_box.clone(), old_format=format, new_format=features.BoundingBoxFormat.XYXY, inplace=True
-    )
+
+    bounding_box = bounding_box.clone()

     # Crop or implicit pad if left and/or top have negative values:
-    bounding_box[..., 0::2] -= left
-    bounding_box[..., 1::2] -= top
+    if format == features.BoundingBoxFormat.XYXY:
+        sub = torch.tensor([left, top, left, top], device=bounding_box.device)
+    else:
+        sub = torch.tensor([left, top, 0, 0], device=bounding_box.device)
+    bounding_box = bounding_box.sub_(sub)

-    return (
-        convert_format_bounding_box(
-            bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, inplace=True
-        ),
-        (height, width),
-    )
+    return bounding_box, (height, width)


 def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
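Rather than converting every box to XYXY and back, crop_bounding_box now subtracts a per-format offset in place: XYXY shifts all four coordinates, while XYWH/CXCYWH shift only the position fields and leave the sizes alone. A small sketch of the two cases (values illustrative):

```python
import torch

top, left = 4, 3
xyxy = torch.tensor([[10.0, 20.0, 30.0, 40.0]])
xywh = torch.tensor([[10.0, 20.0, 20.0, 20.0]])  # same box as x, y, w, h

print(xyxy - torch.tensor([left, top, left, top]))  # both corners move
print(xywh - torch.tensor([left, top, 0, 0]))       # only the origin moves
```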
@@ -1317,9 +1330,11 @@ def resized_crop(
 

 def _parse_five_crop_size(size: List[int]) -> List[int]:
     if isinstance(size, numbers.Number):
-        size = [int(size), int(size)]
+        s = int(size)
+        size = [s, s]
     elif isinstance(size, (tuple, list)) and len(size) == 1:
-        size = [size[0], size[0]]
+        s = size[0]
+        size = [s, s]

     if len(size) != 2:
         raise ValueError("Please provide only two dimensions (h, w) for size.")

torchvision/prototype/transforms/functional/_misc.py

Lines changed: 33 additions & 27 deletions
@@ -5,7 +5,6 @@
 import torch
 from torch.nn.functional import conv2d, pad as torch_pad
 from torchvision.prototype import features
-from torchvision.transforms import functional_tensor as _FT
 from torchvision.transforms.functional import pil_to_tensor, to_pil_image


@@ -68,9 +67,9 @@ def normalize(


 def _get_gaussian_kernel1d(kernel_size: int, sigma: float, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-    lim = (kernel_size - 1) / (2 * math.sqrt(2) * sigma)
+    lim = (kernel_size - 1) / (2.0 * math.sqrt(2.0) * sigma)
     x = torch.linspace(-lim, lim, steps=kernel_size, dtype=dtype, device=device)
-    kernel1d = torch.softmax(-x.pow_(2), dim=0)
+    kernel1d = torch.softmax(x.pow_(2).neg_(), dim=0)
     return kernel1d

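The softmax form works because softmax(-x^2) equals exp(-x^2) divided by its sum, which is exactly a normalized Gaussian once lim folds sigma into the grid spacing; rewriting -x.pow_(2) as x.pow_(2).neg_() keeps the negation in place instead of allocating a temporary. A quick sketch:

```python
import math
import torch

kernel_size, sigma = 5, 1.5
lim = (kernel_size - 1) / (2.0 * math.sqrt(2.0) * sigma)
x = torch.linspace(-lim, lim, steps=kernel_size)
kernel1d = torch.softmax(x.pow_(2).neg_(), dim=0)  # fully in place up to the softmax
print(kernel1d, kernel1d.sum())  # weights sum to 1
```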

@@ -89,54 +88,61 @@ def gaussian_blur_image_tensor(
     # TODO: consider deprecating integers from sigma on the future
     if isinstance(kernel_size, int):
         kernel_size = [kernel_size, kernel_size]
-    if len(kernel_size) != 2:
+    elif len(kernel_size) != 2:
         raise ValueError(f"If kernel_size is a sequence its length should be 2. Got {len(kernel_size)}")
     for ksize in kernel_size:
         if ksize % 2 == 0 or ksize < 0:
             raise ValueError(f"kernel_size should have odd and positive integers. Got {kernel_size}")

     if sigma is None:
         sigma = [ksize * 0.15 + 0.35 for ksize in kernel_size]
-
-    if sigma is not None and not isinstance(sigma, (int, float, list, tuple)):
-        raise TypeError(f"sigma should be either float or sequence of floats. Got {type(sigma)}")
-    if isinstance(sigma, (int, float)):
-        sigma = [float(sigma), float(sigma)]
-    if isinstance(sigma, (list, tuple)) and len(sigma) == 1:
-        sigma = [sigma[0], sigma[0]]
-    if len(sigma) != 2:
-        raise ValueError(f"If sigma is a sequence, its length should be 2. Got {len(sigma)}")
+    else:
+        if isinstance(sigma, (list, tuple)):
+            length = len(sigma)
+            if length == 1:
+                s = float(sigma[0])
+                sigma = [s, s]
+            elif length != 2:
+                raise ValueError(f"If sigma is a sequence, its length should be 2. Got {length}")
+        elif isinstance(sigma, (int, float)):
+            s = float(sigma)
+            sigma = [s, s]
+        else:
+            raise TypeError(f"sigma should be either float or sequence of floats. Got {type(sigma)}")
     for s in sigma:
         if s <= 0.0:
             raise ValueError(f"sigma should have positive values. Got {sigma}")

     if image.numel() == 0:
         return image

+    dtype = image.dtype
     shape = image.shape
-
-    if image.ndim > 4:
+    ndim = image.ndim
+    if ndim == 3:
+        image = image.unsqueeze(dim=0)
+    elif ndim > 4:
         image = image.reshape((-1,) + shape[-3:])
-        needs_unsquash = True
-    else:
-        needs_unsquash = False

-    dtype = image.dtype if torch.is_floating_point(image) else torch.float32
-    kernel = _get_gaussian_kernel2d(kernel_size, sigma, dtype=dtype, device=image.device)
-    kernel = kernel.expand(image.shape[-3], 1, kernel.shape[0], kernel.shape[1])
+    fp = torch.is_floating_point(image)
+    kernel = _get_gaussian_kernel2d(kernel_size, sigma, dtype=dtype if fp else torch.float32, device=image.device)
+    kernel = kernel.expand(shape[-3], 1, kernel.shape[0], kernel.shape[1])

-    image, need_cast, need_squeeze, out_dtype = _FT._cast_squeeze_in(image, [kernel.dtype])
+    output = image if fp else image.to(dtype=torch.float32)

     # padding = (left, right, top, bottom)
     padding = [kernel_size[0] // 2, kernel_size[0] // 2, kernel_size[1] // 2, kernel_size[1] // 2]
-    output = torch_pad(image, padding, mode="reflect")
-    output = conv2d(output, kernel, groups=output.shape[-3])
-
-    output = _FT._cast_squeeze_out(output, need_cast, need_squeeze, out_dtype)
+    output = torch_pad(output, padding, mode="reflect")
+    output = conv2d(output, kernel, groups=shape[-3])

-    if needs_unsquash:
+    if ndim == 3:
+        output = output.squeeze(dim=0)
+    elif ndim > 4:
         output = output.reshape(shape)

+    if not fp:
+        output = output.round_().to(dtype=dtype)
+
     return output
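The rewrite drops the private _FT._cast_squeeze_in/_cast_squeeze_out helpers in favor of explicit shape and dtype handling: 3-D inputs are unsqueezed to 4-D, higher-rank inputs flattened, non-float dtypes routed through float32, and results rounded back at the end. A condensed sketch of the pad-plus-depthwise-convolution core, assuming a float NCHW input and a precomputed kernel:

```python
import torch
from torch.nn.functional import conv2d, pad

image = torch.rand(1, 3, 16, 16)
kernel2d = torch.ones(5, 5) / 25                    # stand-in for the gaussian kernel
kernel = kernel2d.expand(image.shape[-3], 1, 5, 5)  # one filter per channel

padded = pad(image, [2, 2, 2, 2], mode="reflect")         # (left, right, top, bottom)
blurred = conv2d(padded, kernel, groups=image.shape[-3])  # depthwise: groups == C
print(blurred.shape)  # torch.Size([1, 3, 16, 16])
```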