Closing streams to avoid testing issues #6128
Changes from all commits
b6ae4df
05caddc
f87e97d
a830b5d
ab5e918
7b0613c
d3cfb2a
8fc5551
6a72184
f63d6e2
---
```diff
@@ -11,6 +11,7 @@
 from torch.utils.data.graph import traverse
 from torch.utils.data.graph_settings import get_all_graph_pipes
 from torchdata.datapipes.iter import ShardingFilter, Shuffler
+from torchdata.datapipes.utils import StreamWrapper
 from torchvision._utils import sequence_to_str
 from torchvision.prototype import datasets, transforms
 from torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE
```
@@ -64,9 +65,9 @@ def test_smoke(self, dataset_mock, config): | |
@parametrize_dataset_mocks(DATASET_MOCKS) | ||
def test_sample(self, dataset_mock, config): | ||
dataset, _ = dataset_mock.load(config) | ||
|
||
try: | ||
sample = next(iter(dataset)) | ||
iterator = iter(dataset) | ||
sample = next(iterator) | ||
except StopIteration: | ||
raise AssertionError("Unable to draw any sample.") from None | ||
except Exception as error: | ||
|
```diff
@@ -78,23 +79,33 @@ def test_sample(self, dataset_mock, config):
         if not sample:
             raise AssertionError("Sample dictionary is empty.")

+        list(iterator)  # Cleanups and closing streams in buffers
+
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_num_samples(self, dataset_mock, config):
         dataset, mock_info = dataset_mock.load(config)

         assert len(list(dataset)) == mock_info["num_samples"]

     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_no_vanilla_tensors(self, dataset_mock, config):
+        StreamWrapper.session_streams = {}
         dataset, _ = dataset_mock.load(config)

-        vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor}
+        iterator = iter(dataset)
+        one_element = next(iterator)
```
**Comment on lines +94 to +95:** Same as above.
```diff
+        vanilla_tensors = {key for key, value in one_element.items() if type(value) is torch.Tensor}
         if vanilla_tensors:
             raise AssertionError(
                 f"The values of key(s) "
                 f"{sequence_to_str(sorted(vanilla_tensors), separate_last='and ')} contained vanilla tensors."
             )

+        list(iterator)  # Cleanups and closing streams in buffers
+
+        if len(StreamWrapper.session_streams) > 0:
+            raise Exception(StreamWrapper.session_streams)
```
**Comment on lines +106 to +107:** Could you explain what this does? Is this something we could wrap into a decorator like

```python
def check_unclosed_streams(test_fn):
    @functools.wraps(test_fn)
    def wrapper(*args, **kwargs):
        if len(StreamWrapper.session_streams) > 0:
            raise pytest.UsageError("Some previous test didn't clean up")
        test_fn(*args, **kwargs)
        if len(StreamWrapper.session_streams) > 0:
            raise AssertionError("This test didn't clean up")

    return wrapper
```
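(If adopted, usage would presumably look like the sketch below — hypothetical, reusing the reviewer's `check_unclosed_streams` from above rather than any existing helper in this test suite:)

```python
# Hypothetical usage of the decorator suggested above; `check_unclosed_streams`
# is the reviewer's sketch, not a helper that exists in this PR.
@parametrize_dataset_mocks(DATASET_MOCKS)
@check_unclosed_streams
def test_no_vanilla_tensors(self, dataset_mock, config):
    ...  # test body as in the diff, minus the manual session_streams bookkeeping
```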
```diff
+
     @parametrize_dataset_mocks(DATASET_MOCKS)
     def test_transformable(self, dataset_mock, config):
         dataset, _ = dataset_mock.load(config)
```
---
```diff
@@ -102,7 +102,9 @@ def _prepare_sample(
         ann_path, ann_buffer = ann_data

         image = EncodedImage.from_file(image_buffer)
+        image_buffer.close()
```
**Comment:** The errors we have seen in our test suite have never been with these files, but only with archives.

**Reply:** Tests complain that the archive stream is not closed. This is because the child stream (the unpacked file) also remains open and keeps a reference to its parent. In pytorch/pytorch#78952 and pytorch/data#560 we introduced a mechanism that closes the parent stream once every child is closed.
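(Not part of the diff — a minimal sketch of the parent/child bookkeeping described in the reply above. `TrackingStream` is an illustrative stand-in, not torchdata's actual `StreamWrapper` implementation:)

```python
# Illustrative sketch only: close a parent (archive) stream automatically
# once every child stream unpacked from it has been closed.
class TrackingStream:
    def __init__(self, raw, parent=None):
        self.raw = raw
        self.parent = parent
        self.open_children = 0
        if parent is not None:
            parent.open_children += 1

    def close(self):
        self.raw.close()
        if self.parent is not None:
            self.parent.open_children -= 1
            if self.parent.open_children == 0:
                # the last child was closed -> the archive can be released
                self.parent.close()
```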
```diff
         ann = read_mat(ann_buffer)
+        ann_buffer.close()
```
**Comment on lines +105 to +107:** Instead of doing that in every dataset individually, can't we just do it in `EncodedImage.from_file` and `read_mat`? I think so far we don't have a case where we need to read from the same file handle twice. Plus, that would only work if the stream is seekable, which I don't know if we can guarantee.
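(The coco diff further down does import a `close_buffer` helper from `torchvision.prototype.datasets.utils._internal`, so something along these lines presumably exists in the PR; a plausible shape for such a close-after-read helper, as an assumption rather than the PR's actual code:)

```python
from typing import BinaryIO, Callable, TypeVar

D = TypeVar("D")


def close_buffer(fn: Callable[[BinaryIO], D], buffer: BinaryIO) -> D:
    # Apply `fn` to the buffer, then close the buffer even if `fn` raises.
    try:
        return fn(buffer)
    finally:
        buffer.close()
```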
```diff

         return dict(
             label=Label.from_category(category, categories=self._categories),
```
```diff
@@ -181,10 +183,11 @@ def _is_not_rogue_file(self, data: Tuple[str, Any]) -> bool:

     def _prepare_sample(self, data: Tuple[str, BinaryIO]) -> Dict[str, Any]:
         path, buffer = data
-
+        image = EncodedImage.from_file(buffer)
+        buffer.close()
         return dict(
             path=path,
-            image=EncodedImage.from_file(buffer),
+            image=image,
             label=Label(int(pathlib.Path(path).parent.name.split(".", 1)[0]) - 1, categories=self._categories),
         )
```
---
```diff
@@ -29,8 +29,8 @@ def __init__(
         self.fieldnames = fieldnames

     def __iter__(self) -> Iterator[Tuple[str, Dict[str, str]]]:
-        for _, file in self.datapipe:
-            file = (line.decode() for line in file)
+        for _, fh in self.datapipe:
```
**Comment:** I'm ok with the closing here, but why the rename? Can you revert that?
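(A guess at why the rename is there, not stated in the thread: the old code rebound `file` to the decoded-line generator, so the raw handle would no longer be reachable for the `fh.close()` added below. Keeping the original name would need a second binding, e.g.:)

```python
# Hypothetical alternative without the rename: keep the raw handle as `file`
# and give the decoded-line generator its own name instead.
for _, file in self.datapipe:
    lines = (line.decode() for line in file)
    ...  # pass `lines` (instead of the rebound `file`) to csv.DictReader
    file.close()
```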
```diff
+            file = (line.decode() for line in fh)

             if self.fieldnames:
                 fieldnames = self.fieldnames
```

```diff
@@ -48,6 +48,8 @@ def __iter__(self) -> Iterator[Tuple[str, Dict[str, str]]]:
             for line in csv.DictReader(file, fieldnames=fieldnames, dialect="celeba"):
                 yield line.pop("image_id"), line

+            fh.close()
+

 NAME = "celeba"
```

```diff
@@ -132,6 +134,7 @@ def _prepare_sample(
         path, buffer = image_data

         image = EncodedImage.from_file(buffer)
+        buffer.close()
         (_, identity), (_, attributes), (_, bounding_box), (_, landmarks) = ann_data

         return dict(
```
---
```diff
@@ -1,6 +1,7 @@
 import pathlib
 from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union

+from torchdata import janitor
 from torchdata.datapipes.iter import Demultiplexer, Filter, IterDataPipe, IterKeyZipper, JsonParser, Mapper, UnBatcher
 from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource
 from torchvision.prototype.datasets.utils._internal import (
```

```diff
@@ -62,10 +63,12 @@ def _prepare_sample(self, data: Tuple[Tuple[str, BinaryIO], Optional[Dict[str, Any]]]) -> Dict[str, Any]:
     def _prepare_sample(self, data: Tuple[Tuple[str, BinaryIO], Optional[Dict[str, Any]]]) -> Dict[str, Any]:
         image_data, scenes_data = data
         path, buffer = image_data
+        image = EncodedImage.from_file(buffer)
+        buffer.close()

         return dict(
             path=path,
-            image=EncodedImage.from_file(buffer),
+            image=image,
             label=Label(len(scenes_data["objects"])) if scenes_data else None,
         )
```

```diff
@@ -97,6 +100,8 @@ def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str,
                 buffer_size=INFINITE_BUFFER_SIZE,
             )
         else:
+            for i in scenes_dp:
+                janitor(i)
```
**Comment on lines +103 to +104:** Plus, do we even need to use `janitor` here?
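(An assumption about what the loop accomplishes — `janitor` is imported from `torchdata` in this diff and presumably closes any streams contained in the item it is given; its exact contract is not shown here. Roughly:)

```python
# Assumed rough equivalent of the cleanup loop above: scenes_dp yields
# (path, stream) pairs that would otherwise be left open and unconsumed.
for path, stream in scenes_dp:
    stream.close()  # janitor(item) is expected to close streams like this
```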
```diff
             dp = Mapper(images_dp, self._add_empty_anns)

         return Mapper(dp, self._prepare_sample)
```
---
```diff
@@ -16,6 +16,7 @@
 )
 from torchvision.prototype.datasets.utils import Dataset, HttpResource, OnlineResource
 from torchvision.prototype.datasets.utils._internal import (
+    close_buffer,
     getitem,
     hint_sharding,
     hint_shuffling,
```

```diff
@@ -169,9 +170,10 @@ def _classify_meta(self, data: Tuple[str, Any]) -> Optional[int]:

     def _prepare_image(self, data: Tuple[str, BinaryIO]) -> Dict[str, Any]:
         path, buffer = data
+        image = close_buffer(EncodedImage.from_file, buffer)
```
**Comment:** If …
```diff
         return dict(
             path=path,
-            image=EncodedImage.from_file(buffer),
+            image=image,
         )

     def _prepare_sample(
```

```diff
@@ -182,9 +184,11 @@ def _prepare_sample(
         anns, image_meta = ann_data

         sample = self._prepare_image(image_data)
+
```
**Comment:** Could you revert the formatting changes?
```diff
         # this method is only called if we have annotations
         annotations = cast(str, self._annotations)
         sample.update(self._ANN_DECODERS[annotations](self, anns, image_meta))
+
         return sample

     def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
```
---
```diff
@@ -128,10 +128,11 @@ def _prepare_test_data(self, data: Tuple[str, BinaryIO]) -> Tuple[None, Tuple[st
         return None, data

     def _classifiy_devkit(self, data: Tuple[str, BinaryIO]) -> Optional[int]:
+        name, binary_io = data
```
**Comment:** Could you revert this, since …
```diff
         return {
             "meta.mat": ImageNetDemux.META,
             "ILSVRC2012_validation_ground_truth.txt": ImageNetDemux.LABEL,
-        }.get(pathlib.Path(data[0]).name)
+        }.get(pathlib.Path(name).name)

     _VAL_TEST_IMAGE_NAME_PATTERN = re.compile(r"ILSVRC2012_(val|test)_(?P<id>\d{8})[.]JPEG")
```

```diff
@@ -151,11 +152,13 @@ def _prepare_sample(
         data: Tuple[Optional[Tuple[Label, str]], Tuple[str, BinaryIO]],
     ) -> Dict[str, Any]:
         label_data, (path, buffer) = data
+        image = EncodedImage.from_file(buffer)
+        buffer.close()

         return dict(
             dict(zip(("label", "wnid"), label_data if label_data else (None, None))),
             path=path,
-            image=EncodedImage.from_file(buffer),
+            image=image,
         )

     def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
```
---
```diff
@@ -21,6 +21,8 @@

 NAME = "sbd"

+from torchdata import janitor
+

 @register_info(NAME)
 def _info() -> Dict[str, Any]:
```

```diff
@@ -82,10 +84,12 @@ def _prepare_sample(self, data: Tuple[Tuple[Any, Tuple[str, BinaryIO]], Tuple[st
         ann_path, ann_buffer = ann_data

         anns = read_mat(ann_buffer, squeeze_me=True)["GTcls"]
-
+        ann_buffer.close()
+        image = EncodedImage.from_file(image_buffer)
+        image_buffer.close()
         return dict(
             image_path=image_path,
-            image=EncodedImage.from_file(image_buffer),
+            image=image,
             ann_path=ann_path,
             # the boundaries are stored in sparse CSC format, which is not supported by PyTorch
             boundaries=_Feature(np.stack([raw_boundary.toarray() for raw_boundary in anns["Boundaries"].item()])),
```

```diff
@@ -104,6 +108,8 @@ def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str,
             drop_none=True,
         )
         if self._split == "train_noval":
+            for i in split_dp:
+                janitor(i)
```
**Comment on lines +111 to +112:** Same as above.

**Comment:** Plus, don't we need to do the same on …
```diff
             split_dp = extra_split_dp

         split_dp = Filter(split_dp, path_comparator("name", f"{self._split}.txt"))
```
**Comment:** Instead of sticking with the iterator pattern here, can't we just simply do …