Merge mock data preparation and dataset logic in prototype tests #6010

Merged: 7 commits on May 17, 2022
123 changes: 78 additions & 45 deletions test/builtin_dataset_mocks.py
@@ -10,19 +10,18 @@
import pathlib
import pickle
import random
import shutil
import unittest.mock
import warnings
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter

import numpy as np
import PIL.Image
import pytest
import torch
from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file, combinations_grid
from torch.nn.functional import one_hot
from torch.testing import make_tensor as _make_tensor
from torchvision._utils import sequence_to_str
from torchvision.prototype import datasets

make_tensor = functools.partial(_make_tensor, device="cpu")
@@ -62,27 +61,51 @@ def _parse_mock_info(self, mock_info):

return mock_info

def prepare(self, config):
def load(self, config):
# `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in
# test/test_prototype_builtin_datasets.py
root = pathlib.Path(datasets.home()) / self.name
root.mkdir(exist_ok=True)
# We cannot place the mock data upfront in `root`: loading a dataset calls `OnlineResource.load`, which only
# downloads **and** preprocesses a file if it is not already present. In other words, placing the file in
# `root` before the resource is loaded would effectively skip the preprocessing.
# To avoid that, we first place the mock data in a temporary directory and patch the download logic to move
# it to `root` only when it is requested.
tmp_mock_data_folder = root / "__mock__"
tmp_mock_data_folder.mkdir(parents=True)

mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config))

def patched_download(resource, root, **kwargs):
src = tmp_mock_data_folder / resource.file_name
if not src.exists():
raise pytest.UsageError(
f"Dataset '{self.name}' requires the file {resource.file_name} for {config}"
f"but it was not created by the mock data function."
)

mock_info = self._parse_mock_info(self.mock_data_fn(root, config))
dst = root / resource.file_name
shutil.move(str(src), str(root))

with unittest.mock.patch.object(datasets.utils.Dataset, "__init__"):
required_file_names = {
resource.file_name for resource in datasets.load(self.name, root=root, **config)._resources()
}
available_file_names = {path.name for path in root.glob("*")}
missing_file_names = required_file_names - available_file_names
if missing_file_names:
return dst

with unittest.mock.patch(
"torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download
):
dataset = datasets.load(self.name, **config)

extra_files = list(tmp_mock_data_folder.glob("**/*"))
if extra_files:
raise pytest.UsageError(
f"Dataset '{self.name}' requires the files {sequence_to_str(sorted(missing_file_names))} "
f"for {config}, but they were not created by the mock data function."
(
f"Dataset '{self.name}' created the following files for {config} in the mock data function, "
f"but they were not loaded:\n\n"
)
+ "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files)
)

return mock_info
tmp_mock_data_folder.rmdir()

return dataset, mock_info
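
A minimal sketch of the patching pattern above, runnable outside the test harness (the FakeResource name and staging layout are hypothetical): unittest.mock.patch.object swaps the method for a plain function, so the resource instance arrives as the first positional argument, and the staged file is only moved into root when the dataset actually requests it.

import pathlib
import shutil
import tempfile
import unittest.mock

class FakeResource:
    file_name = "data.tar"

    def download(self, root):
        raise RuntimeError("would hit the network")

def patched_download(resource, root, **kwargs):
    # Move the staged mock file into `root` only on demand, mirroring the
    # lazy behavior of the real download.
    src = staging / resource.file_name
    dst = pathlib.Path(root) / resource.file_name
    shutil.move(str(src), str(dst))
    return dst

root = pathlib.Path(tempfile.mkdtemp())
staging = root / "__mock__"
staging.mkdir()
(staging / FakeResource.file_name).touch()

with unittest.mock.patch.object(FakeResource, "download", new=patched_download):
    print(FakeResource().download(root))  # <tmp>/data.tar, no network access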


def config_id(name, config):
Expand Down Expand Up @@ -513,22 +536,6 @@ def imagenet(root, config):


class CocoMockData:
@classmethod
def _make_images_archive(cls, root, name, *, num_samples):
image_paths = create_image_folder(
root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples
)

images_meta = []
for path in image_paths:
with PIL.Image.open(path) as image:
width, height = image.size
images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height))

make_zip(root, f"{name}.zip")

return images_meta

@classmethod
def _make_annotations_json(
cls,
@@ -596,16 +603,38 @@ def generate(
cls,
root,
*,
split,
year,
num_samples,
):
annotations_dir = root / "annotations"
annotations_dir.mkdir()

for split in ("train", "val"):
config_name = f"{split}{year}"
for split_ in ("train", "val"):
config_name = f"{split_}{year}"

images_meta = [
dict(
file_name=f"{idx:012d}.jpg",
id=idx,
width=width,
height=height,
)
for idx, (height, width) in enumerate(
torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
)
]

if split_ == split:
create_image_folder(
root,
config_name,
file_name_fn=lambda idx: images_meta[idx]["file_name"],
num_examples=num_samples,
size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]),
)
make_zip(root, f"{config_name}.zip")

images_meta = cls._make_images_archive(root, config_name, num_samples=num_samples)
cls._make_annotations(
annotations_dir,
config_name,
@@ -625,7 +654,7 @@ def generate(
)
)
def coco(root, config):
return CocoMockData.generate(root, year=config["year"], num_samples=5)
return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5)
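
A standalone sketch of the random-size metadata generation introduced above: heights and widths are drawn per sample, and the COCO-style image records are built without re-opening the files (which matters because only the requested split's images are materialized, while the annotation files still cover both splits).

import torch

num_samples = 5
# One (height, width) pair in [3, 10] per sample keeps the mock images tiny.
sizes = torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
images_meta = [
    dict(file_name=f"{idx:012d}.jpg", id=idx, width=width, height=height)
    for idx, (height, width) in enumerate(sizes)
]
print(images_meta[0])  # e.g. {'file_name': '000000000000.jpg', 'id': 0, 'width': 7, 'height': 4}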


class SBDMockData:
Expand Down Expand Up @@ -799,8 +828,11 @@ def add_bndbox(obj):
def generate(cls, root, *, year, trainval):
archive_folder = root
if year == "2011":
archive_folder /= "TrainVal"
data_folder = archive_folder / "VOCdevkit" / f"VOC{year}"
archive_folder = root / "TrainVal"
data_folder = archive_folder / "VOCdevkit"
else:
archive_folder = data_folder = root / "VOCdevkit"
data_folder = data_folder / f"VOC{year}"
data_folder.mkdir(parents=True, exist_ok=True)

ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)
@@ -810,7 +842,7 @@ def generate(cls, root, *, year, trainval):
(cls._make_detection_anns_folder, "Annotations", ".xml"),
]:
make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids))
make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], data_folder)
make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder)

return num_samples_map
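
For reference, a sketch of the two VOC archive layouts the branch above distinguishes (paths only, no files created):

import pathlib

root = pathlib.Path("root")
for year in ("2011", "2012"):
    if year == "2011":
        # The 2011 trainval archive nests VOCdevkit under an extra TrainVal/ directory.
        archive_folder = root / "TrainVal"
        data_folder = archive_folder / "VOCdevkit"
    else:
        archive_folder = data_folder = root / "VOCdevkit"
    print(archive_folder, data_folder / f"VOC{year}")
# prints:
# root/TrainVal root/TrainVal/VOCdevkit/VOC2011
# root/VOCdevkit root/VOCdevkit/VOC2012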

Expand Down Expand Up @@ -1091,8 +1123,10 @@ def _make_ann_file(path, num_examples, class_idx):
}
)

archive_folder = root / "GTSRB"

if config["split"] == "train":
train_folder = root / "GTSRB" / "Training"
train_folder = archive_folder / "Training"
train_folder.mkdir(parents=True)

for class_idx in classes:
@@ -1107,9 +1141,9 @@ def _make_ann_file(path, num_examples, class_idx):
num_examples=num_examples_per_class,
class_idx=int(class_idx),
)
make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
make_zip(root, "GTSRB-Training_fixed.zip", archive_folder)
else:
test_folder = root / "GTSRB" / "Final_Test"
test_folder = archive_folder / "Final_Test"
test_folder.mkdir(parents=True)

create_image_folder(
@@ -1119,7 +1153,7 @@ def _make_ann_file(path, num_examples, class_idx):
num_examples=num_examples,
)

make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)
make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder)

_make_ann_file(
path=root / "GT-final_test.csv",
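
The substantive change in this hunk is the make_zip root: zipping the shared GTSRB parent instead of the split folder gives both archives a common GTSRB/ prefix. A sketch with shutil.make_archive standing in for the test helper make_zip (an assumption about its behavior):

import pathlib
import shutil
import tempfile

root = pathlib.Path(tempfile.mkdtemp())
(root / "GTSRB" / "Training").mkdir(parents=True)

# base_dir="GTSRB" keeps the GTSRB/ prefix inside the archive, so members
# extract to GTSRB/Training/... rather than to Training/... directly.
shutil.make_archive(str(root / "GTSRB-Training_fixed"), "zip", root_dir=root, base_dir="GTSRB")
print(sorted(p.name for p in root.iterdir()))  # ['GTSRB', 'GTSRB-Training_fixed.zip']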
Expand Down Expand Up @@ -1484,11 +1518,10 @@ def stanford_cars(root, config):
num_samples = {"train": 5, "test": 7}[split]
num_categories = 3

devkit = root / "devkit"
devkit.mkdir(parents=True)

if split == "train":
images_folder_name = "cars_train"
devkit = root / "devkit"
devkit.mkdir()
annotations_mat_path = devkit / "cars_train_annos.mat"
else:
images_folder_name = "cars_test"
55 changes: 16 additions & 39 deletions test/test_prototype_builtin_datasets.py
@@ -56,18 +56,14 @@ def test_info(self, name):

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_smoke(self, dataset_mock, config):
dataset_mock.prepare(config)

dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

if not isinstance(dataset, datasets.utils.Dataset):
raise AssertionError(f"Loading the dataset should return a Dataset, but got {type(dataset)} instead.")

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_sample(self, dataset_mock, config):
dataset_mock.prepare(config)

dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

try:
sample = next(iter(dataset))
@@ -84,17 +80,13 @@ def test_sample(self, dataset_mock, config):

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_num_samples(self, dataset_mock, config):
mock_info = dataset_mock.prepare(config)

dataset = datasets.load(dataset_mock.name, **config)
dataset, mock_info = dataset_mock.load(config)

assert len(list(dataset)) == mock_info["num_samples"]

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_no_vanilla_tensors(self, dataset_mock, config):
dataset_mock.prepare(config)

dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor}
if vanilla_tensors:
@@ -105,24 +97,20 @@ def test_no_vanilla_tensors(self, dataset_mock, config):

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_transformable(self, dataset_mock, config):
dataset_mock.prepare(config)

dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

next(iter(dataset.map(transforms.Identity())))

@pytest.mark.parametrize("only_datapipe", [False, True])
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_traversable(self, dataset_mock, config, only_datapipe):
dataset_mock.prepare(config)
dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

traverse(dataset, only_datapipe=only_datapipe)
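
A standalone illustration of what traverse checks here, using a trivial datapipe (relying only on the traverse(dataset, only_datapipe=...) signature this test already uses):

from torch.utils.data.datapipes.iter import IterableWrapper
from torch.utils.data.graph import traverse

# `traverse` walks the datapipe graph and returns it as a nested mapping;
# `only_datapipe=True` restricts the walk to datapipe attributes.
dp = IterableWrapper(range(3)).shuffle()
print(traverse(dp, only_datapipe=True))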

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_serializable(self, dataset_mock, config):
dataset_mock.prepare(config)
dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

pickle.dumps(dataset)
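
Serializability matters because DataLoader workers receive the datapipe via pickle. A minimal round-trip sketch; note the named function, since a lambda would make the pipe unpicklable:

import pickle

from torch.utils.data.datapipes.iter import IterableWrapper

def add_one(x):
    return x + 1

dp = IterableWrapper(range(3)).map(add_one)
assert list(pickle.loads(pickle.dumps(dp))) == [1, 2, 3]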

@@ -135,8 +123,7 @@ def _collate_fn(self, batch):
@pytest.mark.parametrize("num_workers", [0, 1])
@parametrize_dataset_mocks(DATASET_MOCKS)
def test_data_loader(self, dataset_mock, config, num_workers):
dataset_mock.prepare(config)
dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

dl = DataLoader(
dataset,
@@ -153,17 +140,15 @@ def test_data_loader(self, dataset_mock, config, num_workers):
@parametrize_dataset_mocks(DATASET_MOCKS)
@pytest.mark.parametrize("annotation_dp_type", (Shuffler, ShardingFilter))
def test_has_annotations(self, dataset_mock, config, annotation_dp_type):

dataset_mock.prepare(config)
dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

if not any(isinstance(dp, annotation_dp_type) for dp in extract_datapipes(dataset)):
raise AssertionError(f"The dataset doesn't contain a {annotation_dp_type.__name__}() datapipe.")

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_save_load(self, dataset_mock, config):
dataset_mock.prepare(config)
dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

sample = next(iter(dataset))

with io.BytesIO() as buffer:
@@ -173,8 +158,7 @@

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_infinite_buffer_size(self, dataset_mock, config):
dataset_mock.prepare(config)
dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

for dp in extract_datapipes(dataset):
if hasattr(dp, "buffer_size"):
@@ -184,18 +168,15 @@

@parametrize_dataset_mocks(DATASET_MOCKS)
def test_has_length(self, dataset_mock, config):
dataset_mock.prepare(config)
dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

assert len(dataset) > 0


@parametrize_dataset_mocks(DATASET_MOCKS["qmnist"])
class TestQMNIST:
def test_extra_label(self, dataset_mock, config):
dataset_mock.prepare(config)

dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

sample = next(iter(dataset))
for key, type in (
@@ -218,9 +199,7 @@ def test_label_matches_path(self, dataset_mock, config):
if config["split"] != "train":
return

dataset_mock.prepare(config)

dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

for sample in dataset:
label_from_path = int(Path(sample["path"]).parent.name)
@@ -230,9 +209,7 @@
@parametrize_dataset_mocks(DATASET_MOCKS["usps"])
class TestUSPS:
def test_sample_content(self, dataset_mock, config):
dataset_mock.prepare(config)

dataset = datasets.load(dataset_mock.name, **config)
dataset, _ = dataset_mock.load(config)

for sample in dataset:
assert "image" in sample