From 53fd50920be623583eba9dc7945d68127823a563 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Thu, 6 Jan 2022 16:42:02 +0800 Subject: [PATCH 01/26] Add Flowers102 datasets --- torchvision/datasets/flowers102.py | 114 +++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 torchvision/datasets/flowers102.py diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py new file mode 100644 index 00000000000..617143daa6c --- /dev/null +++ b/torchvision/datasets/flowers102.py @@ -0,0 +1,114 @@ +from pathlib import Path +from typing import Any, Tuple, Callable, Optional + +import numpy as np +import PIL.Image + +from .utils import verify_str_arg, download_and_extract_archive, download_url +from .vision import VisionDataset + + +class Flowers102(VisionDataset): + """`Oxford 102 Flower `_ Dataset. + + .. warning:: + + This class needs `scipy `_ to load target files from `.mat` format. + + Oxford 102 Flower is an image classification dataset consisting of 102 flower categories. The + flowers chosen to be flower commonly occurring in the United Kingdom. Each class consists of + between 40 and 258 images. + + The images have large scale, pose and light variations. In addition, there are categories that + have large variations within the category and several very similar categories. + + Args: + root (string): Root directory of the dataset. + split (string, optional): The dataset split, supports ``"train"`` (default), ``"val"``, or ``"test"``. + transform (callable, optional): A function/transform that takes in an PIL image and returns a + transformed version. E.g, ``transforms.RandomCrop``. + target_transform (callable, optional): A function/transform that takes in the target and transforms it. 
+ """ + + def __init__( + self, + root: str, + split: str = "train", + download: bool = True, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + ) -> None: + super().__init__(root, transform=transform, target_transform=target_transform) + self._split = verify_str_arg(split, "split", ("train", "valid", "test")) + self._base_folder = Path(self.root) / "flowers-102" + self._images_folder = self._base_folder / "jpg" + + if download: + self._download() + + if not self._check_exists(): + raise RuntimeError("Dataset not found. You can use download=True to download it") + + self._labels = [] + self._image_files = [] + + from scipy.io import loadmat + + # Read the label ids + label_mat = loadmat(self._base_folder / "imagelabels.mat") + labels = label_mat["labels"] + + self.classes = np.unique(labels).tolist() + self.class_to_idx = dict(zip(self.classes, range(len(self.classes)))) + + # Read the image ids + set_ids = loadmat(self._base_folder / "setid.mat") + splits_map = {"train": "trnid", "valid": "valid", "test": "tstid"} + + image_ids = set_ids[splits_map[self._split]][0] + for image_id in image_ids: + self._labels += self.class_to_idx[labels[0, image_id - 1]] + self._image_files += self._images_folder / f"image_{image_id}.jpg" + + def __len__(self) -> int: + return len(self._image_files) + + def __getitem__(self, idx) -> Tuple[Any, Any]: + image_file, label = self._image_files[idx], self._labels[idx] + image = PIL.Image.open(image_file).convert("RGB") + + if self.transform: + image = self.transform(image) + + if self.target_transform: + label = self.target_transform(label) + + return image, label + + def extra_repr(self) -> str: + return f"split={self._split}" + + def _check_exists(self) -> bool: + return all(folder.exists() and folder.is_dir() for folder in self._images_folder) + + def _download(self) -> None: + if self._check_exists(): + return + + download_and_extract_archive( + 
"https://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz", + download_root=self.root, + md5="52808999861908f626f3c1f4e79d11fa", + ) + + download_url( + "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat", + self.root, + md5="a5357ecc9cb78c4bef273ce3793fc85c", + ) + + download_url( + "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat", + self.root, + md5="e0620be6f572b9609742df49c70aed4d", + ) From 79d0596a861369488777d5881112f999de111b3f Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Sat, 8 Jan 2022 22:37:52 +0800 Subject: [PATCH 02/26] Fix initialization of images and labels --- torchvision/datasets/flowers102.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 617143daa6c..6a68a476749 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -56,7 +56,7 @@ def __init__( # Read the label ids label_mat = loadmat(self._base_folder / "imagelabels.mat") - labels = label_mat["labels"] + labels = label_mat["labels"][0] self.classes = np.unique(labels).tolist() self.class_to_idx = dict(zip(self.classes, range(len(self.classes)))) @@ -66,9 +66,10 @@ def __init__( splits_map = {"train": "trnid", "valid": "valid", "test": "tstid"} image_ids = set_ids[splits_map[self._split]][0] + for image_id in image_ids: - self._labels += self.class_to_idx[labels[0, image_id - 1]] - self._image_files += self._images_folder / f"image_{image_id}.jpg" + self._labels.append(self.class_to_idx[labels[image_id - 1]]) + self._image_files.append(self._images_folder / f"image_{image_id:>05}.jpg") def __len__(self) -> int: return len(self._image_files) @@ -89,7 +90,7 @@ def extra_repr(self) -> str: return f"split={self._split}" def _check_exists(self) -> bool: - return all(folder.exists() and folder.is_dir() for folder in self._images_folder) + return self._images_folder.exists() and self._images_folder.is_dir() def 
_download(self) -> None: if self._check_exists(): @@ -97,18 +98,18 @@ def _download(self) -> None: download_and_extract_archive( "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz", - download_root=self.root, + download_root=self._base_folder, md5="52808999861908f626f3c1f4e79d11fa", ) download_url( "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat", - self.root, + self._base_folder, md5="a5357ecc9cb78c4bef273ce3793fc85c", ) download_url( "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat", - self.root, + self._base_folder, md5="e0620be6f572b9609742df49c70aed4d", ) From 3b7ce9048b9538d8fba023c124d89ac084f1bca2 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Sat, 8 Jan 2022 23:08:56 +0800 Subject: [PATCH 03/26] Fix _check_exists in Flowers102 --- torchvision/datasets/flowers102.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 6a68a476749..ed8fdd2a8dd 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -41,6 +41,7 @@ def __init__( super().__init__(root, transform=transform, target_transform=target_transform) self._split = verify_str_arg(split, "split", ("train", "valid", "test")) self._base_folder = Path(self.root) / "flowers-102" + self._meta_folder = self._base_folder / "labels" self._images_folder = self._base_folder / "jpg" if download: @@ -55,21 +56,21 @@ def __init__( from scipy.io import loadmat # Read the label ids - label_mat = loadmat(self._base_folder / "imagelabels.mat") + label_mat = loadmat(self._meta_folder / "imagelabels.mat") labels = label_mat["labels"][0] self.classes = np.unique(labels).tolist() self.class_to_idx = dict(zip(self.classes, range(len(self.classes)))) # Read the image ids - set_ids = loadmat(self._base_folder / "setid.mat") + set_ids = loadmat(self._meta_folder / "setid.mat") splits_map = {"train": "trnid", "valid": "valid", "test": "tstid"} 
image_ids = set_ids[splits_map[self._split]][0] for image_id in image_ids: self._labels.append(self.class_to_idx[labels[image_id - 1]]) - self._image_files.append(self._images_folder / f"image_{image_id:>05}.jpg") + self._image_files.append(self._images_folder / f"image_{image_id:05d}.jpg") def __len__(self) -> int: return len(self._image_files) @@ -90,7 +91,7 @@ def extra_repr(self) -> str: return f"split={self._split}" def _check_exists(self) -> bool: - return self._images_folder.exists() and self._images_folder.is_dir() + return all(folder.exists() and folder.is_dir() for folder in (self._meta_folder, self._images_folder)) def _download(self) -> None: if self._check_exists(): @@ -104,12 +105,12 @@ def _download(self) -> None: download_url( "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat", - self._base_folder, + self._meta_folder, md5="a5357ecc9cb78c4bef273ce3793fc85c", ) download_url( "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat", - self._base_folder, + self._meta_folder, md5="e0620be6f572b9609742df49c70aed4d", ) From 38df988a35a2cb0e343e6f7ad0240d4eb3b316db Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Sun, 9 Jan 2022 02:06:22 +0800 Subject: [PATCH 04/26] Add Flowers102 to datasets and docs --- docs/source/datasets.rst | 1 + torchvision/datasets/__init__.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index 5a494613217..4c57478a65a 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -45,6 +45,7 @@ You can also create your own datasets using the provided :ref:`base classes Date: Sun, 9 Jan 2022 02:09:36 +0800 Subject: [PATCH 05/26] Add Flowers102TestCase to unittest --- test/test_datasets.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/test/test_datasets.py b/test/test_datasets.py index d3b4fdc139a..e8d9f58b6d8 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2463,5 +2463,44 @@ def 
_meta_to_split_and_classification_ann(self, meta, idx): return (image_id, class_id, species, breed_id) +class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Flowers102 + FEATURE_TYPES = (PIL.Image.Image, int) + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test")) + REQUIRED_PACKAGES = ("scipy",) + + def inject_fake_data(self, tmpdir: str, config): + base_folder = pathlib.Path(tmpdir) / "flowers-102" + + num_classes = 3 + num_images_per_split = dict(train=3, valid=3, test=4) + num_images_total = sum(num_images_per_split.values()) + datasets_utils.create_image_folder( + base_folder, + "jpg", + file_name_fn=lambda idx: f"image_{idx + 1:05d}.jpg", + num_examples=num_images_total, + ) + + meta_folder = base_folder / "labels" + meta_folder.mkdir() + + label_dict = dict( + labels=np.random.randint(1, num_classes + 1, size=(1, num_images_total), dtype=np.uint8), + ) + datasets_utils.lazy_importer.scipy.io.savemat(str(meta_folder / "imagelabels.mat"), label_dict) + + setid_mat = np.arange(1, num_images_total + 1, dtype=np.uint16) + setid_dict = dict( + trnid=setid_mat[: num_images_per_split["train"]].reshape(1, -1), + valid=setid_mat[num_images_per_split["train"] : -num_images_per_split["test"]].reshape(1, -1), + tstid=setid_mat[-num_images_per_split["test"] :].reshape(1, -1), + ) + datasets_utils.lazy_importer.scipy.io.savemat(str(meta_folder / "setid.mat"), setid_dict) + + return num_images_per_split[config["split"]] + + if __name__ == "__main__": unittest.main() From bf3e8d54abc92c73d0078f55c5f9d940e8539f33 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Sun, 9 Jan 2022 02:22:31 +0800 Subject: [PATCH 06/26] Fixing Python type statically --- torchvision/datasets/flowers102.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index ed8fdd2a8dd..ff2798f9ddc 100644 --- a/torchvision/datasets/flowers102.py 
+++ b/torchvision/datasets/flowers102.py @@ -99,18 +99,18 @@ def _download(self) -> None: download_and_extract_archive( "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz", - download_root=self._base_folder, + download_root=str(self._base_folder), md5="52808999861908f626f3c1f4e79d11fa", ) download_url( "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat", - self._meta_folder, + str(self._meta_folder), md5="a5357ecc9cb78c4bef273ce3793fc85c", ) download_url( "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat", - self._meta_folder, + str(self._meta_folder), md5="e0620be6f572b9609742df49c70aed4d", ) From 8cf1bfd1698069abd14a8d7a171c53ed87493644 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Sun, 9 Jan 2022 12:59:02 +0800 Subject: [PATCH 07/26] Shuffle the fake labels --- test/test_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_datasets.py b/test/test_datasets.py index e8d9f58b6d8..e3ee523ba54 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2492,6 +2492,7 @@ def inject_fake_data(self, tmpdir: str, config): datasets_utils.lazy_importer.scipy.io.savemat(str(meta_folder / "imagelabels.mat"), label_dict) setid_mat = np.arange(1, num_images_total + 1, dtype=np.uint16) + np.random.shuffle(setid_mat) setid_dict = dict( trnid=setid_mat[: num_images_per_split["train"]].reshape(1, -1), valid=setid_mat[num_images_per_split["train"] : -num_images_per_split["test"]].reshape(1, -1), From 4792f9e4403a01547a77948c0a882d680a7cedd0 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Mon, 10 Jan 2022 19:11:23 +0800 Subject: [PATCH 08/26] Update test/test_datasets.py Co-authored-by: Philip Meier --- test/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index e3ee523ba54..57249fddbe6 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2474,7 +2474,7 @@ def inject_fake_data(self, tmpdir: str, config): base_folder 
= pathlib.Path(tmpdir) / "flowers-102" num_classes = 3 - num_images_per_split = dict(train=3, valid=3, test=4) + num_images_per_split = dict(train=5, valid=4, test=3) num_images_total = sum(num_images_per_split.values()) datasets_utils.create_image_folder( base_folder, From fb3ae0d34b02e1ce6a7ed10be86340e3416033e6 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Mon, 10 Jan 2022 19:22:50 +0800 Subject: [PATCH 09/26] Apply the suggestions by pmeier --- torchvision/datasets/flowers102.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index ff2798f9ddc..e10223f8632 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -25,6 +25,9 @@ class Flowers102(VisionDataset): Args: root (string): Root directory of the dataset. split (string, optional): The dataset split, supports ``"train"`` (default), ``"val"``, or ``"test"``. + download (bool, optional): If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. transform (callable, optional): A function/transform that takes in an PIL image and returns a transformed version. E.g, ``transforms.RandomCrop``. target_transform (callable, optional): A function/transform that takes in the target and transforms it. @@ -50,17 +53,14 @@ def __init__( if not self._check_exists(): raise RuntimeError("Dataset not found. 
You can use download=True to download it") - self._labels = [] - self._image_files = [] - from scipy.io import loadmat # Read the label ids label_mat = loadmat(self._meta_folder / "imagelabels.mat") labels = label_mat["labels"][0] - self.classes = np.unique(labels).tolist() - self.class_to_idx = dict(zip(self.classes, range(len(self.classes)))) + self.labels = np.unique(labels).tolist() + self.label_to_idx = dict(zip(self.labels, range(len(self.labels)))) # Read the image ids set_ids = loadmat(self._meta_folder / "setid.mat") @@ -68,8 +68,11 @@ def __init__( image_ids = set_ids[splits_map[self._split]][0] + self._labels = [] + self._image_files = [] + for image_id in image_ids: - self._labels.append(self.class_to_idx[labels[image_id - 1]]) + self._labels.append(self.label_to_idx[labels[image_id - 1]]) self._image_files.append(self._images_folder / f"image_{image_id:05d}.jpg") def __len__(self) -> int: From d4b00a37dcf9595ced7c58f249382eec9afffe80 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 16:33:25 +0800 Subject: [PATCH 10/26] Use check_integrity to check file existence --- torchvision/datasets/flowers102.py | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index e10223f8632..4fc21f61449 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -4,7 +4,7 @@ import numpy as np import PIL.Image -from .utils import verify_str_arg, download_and_extract_archive, download_url +from .utils import check_integrity, download_and_extract_archive, download_url, verify_str_arg from .vision import VisionDataset @@ -33,6 +33,14 @@ class Flowers102(VisionDataset): target_transform (callable, optional): A function/transform that takes in the target and transforms it. 
""" + download_url_prefix = "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/" + file_dict = { + "image": ("102flowers.tgz", "52808999861908f626f3c1f4e79d11fa"), + "label": ("imagelabels.mat", "e0620be6f572b9609742df49c70aed4d"), + "setid": ("setid.mat", "a5357ecc9cb78c4bef273ce3793fc85c"), + } + splits_map = {"train": "trnid", "valid": "valid", "test": "tstid"} + def __init__( self, root: str, @@ -44,29 +52,26 @@ def __init__( super().__init__(root, transform=transform, target_transform=target_transform) self._split = verify_str_arg(split, "split", ("train", "valid", "test")) self._base_folder = Path(self.root) / "flowers-102" - self._meta_folder = self._base_folder / "labels" self._images_folder = self._base_folder / "jpg" if download: - self._download() + self.download() - if not self._check_exists(): - raise RuntimeError("Dataset not found. You can use download=True to download it") + if not self._check_integrity(): + raise RuntimeError("Dataset not found or corrupted. You can use download=True to download it") from scipy.io import loadmat # Read the label ids - label_mat = loadmat(self._meta_folder / "imagelabels.mat") - labels = label_mat["labels"][0] + label_mat = loadmat(self._base_folder / self.file_dict["label"][0]) + labels = label_mat["label"][0] self.labels = np.unique(labels).tolist() self.label_to_idx = dict(zip(self.labels, range(len(self.labels)))) # Read the image ids - set_ids = loadmat(self._meta_folder / "setid.mat") - splits_map = {"train": "trnid", "valid": "valid", "test": "tstid"} - - image_ids = set_ids[splits_map[self._split]][0] + set_ids = loadmat(self._base_folder / self.file_dict["setid"][0]) + image_ids = set_ids[self.splits_map[self._split]][0] self._labels = [] self._image_files = [] @@ -93,27 +98,22 @@ def __getitem__(self, idx) -> Tuple[Any, Any]: def extra_repr(self) -> str: return f"split={self._split}" - def _check_exists(self) -> bool: - return all(folder.exists() and folder.is_dir() for folder in (self._meta_folder, 
self._images_folder)) - - def _download(self) -> None: - if self._check_exists(): + def _check_integrity(self): + st1 = check_integrity(str(self._base_folder / self.file_dict["image"][0]), self.file_dict["image"][1]) + st2 = check_integrity(str(self._base_folder / self.file_dict["label"][0]), self.file_dict["label"][1]) + st3 = check_integrity(str(self._base_folder / self.file_dict["setid"][0]), self.file_dict["setid"][1]) + if not (st1 and st2 and st3): + return False + return True + + def download(self): + if self._check_integrity(): + print("Files already downloaded and verified") return - download_and_extract_archive( - "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz", - download_root=str(self._base_folder), - md5="52808999861908f626f3c1f4e79d11fa", - ) - - download_url( - "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat", - str(self._meta_folder), - md5="a5357ecc9cb78c4bef273ce3793fc85c", - ) - - download_url( - "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat", - str(self._meta_folder), - md5="e0620be6f572b9609742df49c70aed4d", + f"{self.download_url_prefix}{self.file_dict['image'][0]}", + str(self._base_folder), + md5=self.file_dict["image"][1], ) + download_url(f"{self.download_url_prefix}{self.file_dict['label']}", str(self._base_folder)) + download_url(f"{self.download_url_prefix}{self.file_dict['setid']}", str(self._base_folder)) From b55568f69d42c59cce37336baacb32ff03a97ebc Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 16:35:20 +0800 Subject: [PATCH 11/26] Save the labels to base_folder --- test/test_datasets.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index 57249fddbe6..fed62728494 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2483,13 +2483,10 @@ def inject_fake_data(self, tmpdir: str, config): num_examples=num_images_total, ) - meta_folder = base_folder / "labels" - meta_folder.mkdir() 
- label_dict = dict( labels=np.random.randint(1, num_classes + 1, size=(1, num_images_total), dtype=np.uint8), ) - datasets_utils.lazy_importer.scipy.io.savemat(str(meta_folder / "imagelabels.mat"), label_dict) + datasets_utils.lazy_importer.scipy.io.savemat(str(base_folder / "imagelabels.mat"), label_dict) setid_mat = np.arange(1, num_images_total + 1, dtype=np.uint16) np.random.shuffle(setid_mat) @@ -2498,7 +2495,7 @@ def inject_fake_data(self, tmpdir: str, config): valid=setid_mat[num_images_per_split["train"] : -num_images_per_split["test"]].reshape(1, -1), tstid=setid_mat[-num_images_per_split["test"] :].reshape(1, -1), ) - datasets_utils.lazy_importer.scipy.io.savemat(str(meta_folder / "setid.mat"), setid_dict) + datasets_utils.lazy_importer.scipy.io.savemat(str(base_folder / "setid.mat"), setid_dict) return num_images_per_split[config["split"]] From 7fb98768b63bbde9aa7186d829b61d44c0bef240 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 17:39:39 +0800 Subject: [PATCH 12/26] Minor fixes --- torchvision/datasets/flowers102.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 4fc21f61449..8723024b0b4 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -64,7 +64,7 @@ def __init__( # Read the label ids label_mat = loadmat(self._base_folder / self.file_dict["label"][0]) - labels = label_mat["label"][0] + labels = label_mat["labels"][0] self.labels = np.unique(labels).tolist() self.label_to_idx = dict(zip(self.labels, range(len(self.labels)))) @@ -115,5 +115,5 @@ def download(self): str(self._base_folder), md5=self.file_dict["image"][1], ) - download_url(f"{self.download_url_prefix}{self.file_dict['label']}", str(self._base_folder)) - download_url(f"{self.download_url_prefix}{self.file_dict['setid']}", str(self._base_folder)) + download_url(f"{self.download_url_prefix}{self.file_dict['label'][0]}", 
str(self._base_folder)) + download_url(f"{self.download_url_prefix}{self.file_dict['setid'][0]}", str(self._base_folder)) From 87cc4f1a02332a8a29990ec7b5302874638948c4 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 18:18:37 +0800 Subject: [PATCH 13/26] Using a loop makes this more concise without reducing readability Co-authored-by: Philip Meier --- torchvision/datasets/flowers102.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 8723024b0b4..65cffd8e542 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -99,11 +99,10 @@ def extra_repr(self) -> str: return f"split={self._split}" def _check_integrity(self): - st1 = check_integrity(str(self._base_folder / self.file_dict["image"][0]), self.file_dict["image"][1]) - st2 = check_integrity(str(self._base_folder / self.file_dict["label"][0]), self.file_dict["label"][1]) - st3 = check_integrity(str(self._base_folder / self.file_dict["setid"][0]), self.file_dict["setid"][1]) - if not (st1 and st2 and st3): - return False + for id in ["label", "setid"]: + filename, md5 = self.file_dict[id] + if not check_integrity(str(self._base_folder / filename), md5): + return False return True def download(self): From d84399ecbab28064f250fa4c328c457a90b3ba34 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 18:18:45 +0800 Subject: [PATCH 14/26] Using a loop makes this more concise without reducing readability Co-authored-by: Philip Meier --- torchvision/datasets/flowers102.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 65cffd8e542..e87bb3e9c05 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -114,5 +114,6 @@ def download(self): str(self._base_folder), md5=self.file_dict["image"][1], ) - 
download_url(f"{self.download_url_prefix}{self.file_dict['label'][0]}", str(self._base_folder)) - download_url(f"{self.download_url_prefix}{self.file_dict['setid'][0]}", str(self._base_folder)) + for id in ["label", "setid"]: + filename, md5 = self.file_dict[id] + download_url(self.download_url_prefix + filename, str(self._base_folder), md5=md5) From 6adabad60cdb0d01b3f51273f63ac7586ef204cb Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 18:23:13 +0800 Subject: [PATCH 15/26] Remove self.labels and self.label_to_index attributes --- torchvision/datasets/flowers102.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index e87bb3e9c05..288eb2dc8e9 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -66,8 +66,8 @@ def __init__( label_mat = loadmat(self._base_folder / self.file_dict["label"][0]) labels = label_mat["labels"][0] - self.labels = np.unique(labels).tolist() - self.label_to_idx = dict(zip(self.labels, range(len(self.labels)))) + classes = np.unique(labels).tolist() + class_to_idx = dict(zip(classes, range(len(classes)))) # Read the image ids set_ids = loadmat(self._base_folder / self.file_dict["setid"][0]) @@ -77,7 +77,7 @@ def __init__( self._image_files = [] for image_id in image_ids: - self._labels.append(self.label_to_idx[labels[image_id - 1]]) + self._labels.append(class_to_idx[labels[image_id - 1]]) self._image_files.append(self._images_folder / f"image_{image_id:05d}.jpg") def __len__(self) -> int: From 861841585aab74273fa8967663a032da4a7e3070 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 12 Jan 2022 11:33:23 +0100 Subject: [PATCH 16/26] minor simplification --- torchvision/datasets/flowers102.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 288eb2dc8e9..78f89738300 100644 --- 
a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -1,7 +1,6 @@ from pathlib import Path from typing import Any, Tuple, Callable, Optional -import numpy as np import PIL.Image from .utils import check_integrity, download_and_extract_archive, download_url, verify_str_arg @@ -62,22 +61,16 @@ def __init__( from scipy.io import loadmat - # Read the label ids - label_mat = loadmat(self._base_folder / self.file_dict["label"][0]) - labels = label_mat["labels"][0] + set_ids = loadmat(self._base_folder / self.file_dict["setid"][0], squeeze_me=True) + image_ids = set_ids[self.splits_map[self._split]].tolist() - classes = np.unique(labels).tolist() - class_to_idx = dict(zip(classes, range(len(classes)))) - - # Read the image ids - set_ids = loadmat(self._base_folder / self.file_dict["setid"][0]) - image_ids = set_ids[self.splits_map[self._split]][0] + labels = loadmat(self._base_folder / self.file_dict["label"][0], squeeze_me=True) + image_id_to_label = dict(enumerate(labels["labels"].tolist(), 1)) self._labels = [] self._image_files = [] - for image_id in image_ids: - self._labels.append(class_to_idx[labels[image_id - 1]]) + self._labels.append(image_id_to_label[image_id]) self._image_files.append(self._images_folder / f"image_{image_id:05d}.jpg") def __len__(self) -> int: From 2bb1ee6efed066258c980c0437014c047f3860f1 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 18:40:17 +0800 Subject: [PATCH 17/26] Check the existence of image folder --- torchvision/datasets/flowers102.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 78f89738300..a54a1c1821f 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -92,6 +92,9 @@ def extra_repr(self) -> str: return f"split={self._split}" def _check_integrity(self): + if not (self._images_folder.exists() and self._images_folder.is_dir()): + return False + for id in 
["label", "setid"]: filename, md5 = self.file_dict[id] if not check_integrity(str(self._base_folder / filename), md5): From 9fef1696e65c31cdd4a523878795da59363fb158 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 18:46:45 +0800 Subject: [PATCH 18/26] Revert the check --- torchvision/datasets/flowers102.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index a54a1c1821f..78f89738300 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -92,9 +92,6 @@ def extra_repr(self) -> str: return f"split={self._split}" def _check_integrity(self): - if not (self._images_folder.exists() and self._images_folder.is_dir()): - return False - for id in ["label", "setid"]: filename, md5 = self.file_dict[id] if not check_integrity(str(self._base_folder / filename), md5): From d8a343a1b3ba7df1d6e5be242a076fbf65459991 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 18:53:13 +0800 Subject: [PATCH 19/26] Check the existence of image folder --- torchvision/datasets/flowers102.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 78f89738300..163c9acc49e 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -91,6 +91,9 @@ def __getitem__(self, idx) -> Tuple[Any, Any]: def extra_repr(self) -> str: return f"split={self._split}" + def _check_exists(self) -> bool: + return self._images_folder.exists() and self._images_folder.is_dir() + def _check_integrity(self): for id in ["label", "setid"]: filename, md5 = self.file_dict[id] @@ -99,14 +102,14 @@ def _check_integrity(self): return True def download(self): + if not self._check_exists(): + download_and_extract_archive( + f"{self.download_url_prefix}{self.file_dict['image'][0]}", + str(self._base_folder), + md5=self.file_dict["image"][1], + ) if 
self._check_integrity(): - print("Files already downloaded and verified") return - download_and_extract_archive( - f"{self.download_url_prefix}{self.file_dict['image'][0]}", - str(self._base_folder), - md5=self.file_dict["image"][1], - ) for id in ["label", "setid"]: filename, md5 = self.file_dict[id] download_url(self.download_url_prefix + filename, str(self._base_folder), md5=md5) From d3d0698ce5b72692f004fda672dd20683a133a2d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 12 Jan 2022 11:25:11 +0000 Subject: [PATCH 20/26] valid -> val --- test/test_datasets.py | 4 ++-- torchvision/datasets/flowers102.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index ba38636cedf..66baf68fc69 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2494,14 +2494,14 @@ class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Flowers102 FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test")) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) REQUIRED_PACKAGES = ("scipy",) def inject_fake_data(self, tmpdir: str, config): base_folder = pathlib.Path(tmpdir) / "flowers-102" num_classes = 3 - num_images_per_split = dict(train=5, valid=4, test=3) + num_images_per_split = dict(train=5, val=4, test=3) num_images_total = sum(num_images_per_split.values()) datasets_utils.create_image_folder( base_folder, diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 163c9acc49e..23115d2b199 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -38,7 +38,7 @@ class Flowers102(VisionDataset): "label": ("imagelabels.mat", "e0620be6f572b9609742df49c70aed4d"), "setid": ("setid.mat", "a5357ecc9cb78c4bef273ce3793fc85c"), } - splits_map = {"train": "trnid", "valid": "valid", "test": "tstid"} + splits_map = 
{"train": "trnid", "val": "valid", "test": "tstid"} def __init__( self, @@ -49,7 +49,7 @@ def __init__( target_transform: Optional[Callable] = None, ) -> None: super().__init__(root, transform=transform, target_transform=target_transform) - self._split = verify_str_arg(split, "split", ("train", "valid", "test")) + self._split = verify_str_arg(split, "split", ("train", "val", "test")) self._base_folder = Path(self.root) / "flowers-102" self._images_folder = self._base_folder / "jpg" From 7fa9c679eb34d33922c5d4319c111d01c2a1a528 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 12 Jan 2022 11:30:19 +0000 Subject: [PATCH 21/26] keep some stuff private --- torchvision/datasets/flowers102.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 23115d2b199..c393b62f838 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -32,13 +32,13 @@ class Flowers102(VisionDataset): target_transform (callable, optional): A function/transform that takes in the target and transforms it. 
""" - download_url_prefix = "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/" - file_dict = { + _download_url_prefix = "https://www.robots.ox.ac.uk/~vgg/data/flowers/102/" + _file_dict = { # filename, md5 "image": ("102flowers.tgz", "52808999861908f626f3c1f4e79d11fa"), "label": ("imagelabels.mat", "e0620be6f572b9609742df49c70aed4d"), "setid": ("setid.mat", "a5357ecc9cb78c4bef273ce3793fc85c"), } - splits_map = {"train": "trnid", "val": "valid", "test": "tstid"} + _splits_map = {"train": "trnid", "val": "valid", "test": "tstid"} def __init__( self, @@ -61,10 +61,10 @@ def __init__( from scipy.io import loadmat - set_ids = loadmat(self._base_folder / self.file_dict["setid"][0], squeeze_me=True) - image_ids = set_ids[self.splits_map[self._split]].tolist() + set_ids = loadmat(self._base_folder / self._file_dict["setid"][0], squeeze_me=True) + image_ids = set_ids[self._splits_map[self._split]].tolist() - labels = loadmat(self._base_folder / self.file_dict["label"][0], squeeze_me=True) + labels = loadmat(self._base_folder / self._file_dict["label"][0], squeeze_me=True) image_id_to_label = dict(enumerate(labels["labels"].tolist(), 1)) self._labels = [] @@ -96,7 +96,7 @@ def _check_exists(self) -> bool: def _check_integrity(self): for id in ["label", "setid"]: - filename, md5 = self.file_dict[id] + filename, md5 = self._file_dict[id] if not check_integrity(str(self._base_folder / filename), md5): return False return True @@ -104,12 +104,12 @@ def _check_integrity(self): def download(self): if not self._check_exists(): download_and_extract_archive( - f"{self.download_url_prefix}{self.file_dict['image'][0]}", + f"{self._download_url_prefix}{self._file_dict['image'][0]}", str(self._base_folder), - md5=self.file_dict["image"][1], + md5=self._file_dict["image"][1], ) if self._check_integrity(): return for id in ["label", "setid"]: - filename, md5 = self.file_dict[id] - download_url(self.download_url_prefix + filename, str(self._base_folder), md5=md5) + filename, md5 = 
self._file_dict[id] + download_url(self._download_url_prefix + filename, str(self._base_folder), md5=md5) From ce957c6cc509107dfe3ccc3c499f3f0f48bb2aa0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 12 Jan 2022 11:34:02 +0000 Subject: [PATCH 22/26] minor doc arrangements --- torchvision/datasets/flowers102.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index c393b62f838..3976212546d 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -15,11 +15,11 @@ class Flowers102(VisionDataset): This class needs `scipy `_ to load target files from `.mat` format. Oxford 102 Flower is an image classification dataset consisting of 102 flower categories. The - flowers chosen to be flower commonly occurring in the United Kingdom. Each class consists of + flowers were chosen to be flowers commonly occurring in the United Kingdom. Each class consists of between 40 and 258 images. The images have large scale, pose and light variations. In addition, there are categories that - have large variations within the category and several very similar categories. + have large variations within the category, and several very similar categories. Args: root (string): Root directory of the dataset. 
From a5b701e211471f9f46298f99d9443e9008f6047c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 12 Jan 2022 11:35:55 +0000 Subject: [PATCH 23/26] remove default FEATURE_TYPES --- test/test_datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index 66baf68fc69..943bfd4eb9b 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2492,7 +2492,6 @@ def inject_fake_data(self, tmpdir: str, config): class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Flowers102 - FEATURE_TYPES = (PIL.Image.Image, int) ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) REQUIRED_PACKAGES = ("scipy",) From 4b21a2f87d648d7bacf8d711eced840a3fd1dc8e Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 21:22:59 +0800 Subject: [PATCH 24/26] Simplify the datasets existence --- torchvision/datasets/flowers102.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index 3976212546d..a67177b526c 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -91,9 +91,6 @@ def __getitem__(self, idx) -> Tuple[Any, Any]: def extra_repr(self) -> str: return f"split={self._split}" - def _check_exists(self) -> bool: - return self._images_folder.exists() and self._images_folder.is_dir() - def _check_integrity(self): for id in ["label", "setid"]: filename, md5 = self._file_dict[id] @@ -102,14 +99,13 @@ def _check_integrity(self): return True def download(self): - if not self._check_exists(): - download_and_extract_archive( - f"{self._download_url_prefix}{self._file_dict['image'][0]}", - str(self._base_folder), - md5=self._file_dict["image"][1], - ) if self._check_integrity(): return + download_and_extract_archive( + f"{self._download_url_prefix}{self._file_dict['image'][0]}", + str(self._base_folder), + 
md5=self._file_dict["image"][1], + ) for id in ["label", "setid"]: filename, md5 = self._file_dict[id] download_url(self._download_url_prefix + filename, str(self._base_folder), md5=md5) From 53ad2c9f93d22883c10688caa700fa3da0bd6d77 Mon Sep 17 00:00:00 2001 From: Zhiqiang Wang Date: Wed, 12 Jan 2022 21:51:00 +0800 Subject: [PATCH 25/26] check if the image folder exists Co-authored-by: Philip Meier --- torchvision/datasets/flowers102.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index a67177b526c..d91c1c0f86b 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -92,6 +92,9 @@ def extra_repr(self) -> str: return f"split={self._split}" def _check_integrity(self): + if not (self._images_folder.exists() and self._images_folder.isdir()): + return False + for id in ["label", "setid"]: filename, md5 = self._file_dict[id] if not check_integrity(str(self._base_folder / filename), md5): From 0791dfcdb66650afaecf8ce506070cc0bfbb7c59 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 12 Jan 2022 14:20:00 +0000 Subject: [PATCH 26/26] isdir -> is_dir --- torchvision/datasets/flowers102.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/datasets/flowers102.py b/torchvision/datasets/flowers102.py index d91c1c0f86b..55347ffa550 100644 --- a/torchvision/datasets/flowers102.py +++ b/torchvision/datasets/flowers102.py @@ -92,7 +92,7 @@ def extra_repr(self) -> str: return f"split={self._split}" def _check_integrity(self): - if not (self._images_folder.exists() and self._images_folder.isdir()): + if not (self._images_folder.exists() and self._images_folder.is_dir()): return False for id in ["label", "setid"]: