Description
Motivation
I've realized that the way torchvision is coded it's not possible to store a transformation to be applied several times. Video requires the same transformation to be applied to the whole sequence.
Proposed changes
I propose to restructure the code with minor changes such that:
A base transformation class (template) were created, providing get_params and reset_params method:
class BaseTransformation(object):
def get_params(self):
pass
def reset_params(self):
pass
get_params would provide needed parameters if necessary meanwhile reset_params would act as param initilizer + reseter.
To modify compose class to deal with list/tuples of frames such that when the list were exhausted, paramters would be reset:
class Compose(object):
"""Composes several transforms together.
Args:
transforms (list of ``Transform`` objects): list of transforms to compose.
Example:
>>> transforms.Compose([
>>> transforms.CenterCrop(10),
>>> transforms.ToTensor(),
>>> ])
"""
def __init__(self, transforms):
self.transforms = transforms
def __call__(self, inpt):
if isinstance(inpt,(list,tuple)):
return self.apply_sequence(inpt)
else:
return self.apply_img(inpt)
def apply_img(self,img):
for t in self.transforms:
img = t(img)
return img
def apply_sequence(self,seq):
output = list(map(self.apply_img,seq))
for t in self.transforms:
t.reset_params()
return output
def __repr__(self):
format_string = self.__class__.__name__ + '('
for t in self.transforms:
format_string += '\n'
format_string += ' {0}'.format(t)
format_string += '\n)'
return format_string
To set random parameters and image parameters as object attributes. As some parameters requires image features to be computed, parameters would be initialized as None and computed/stored with the 1st frame:
Example 1:
class RandomHorizontalFlip(object):
"""Horizontally flip the given PIL Image randomly with a given probability.
Args:
p (float): probability of the image being flipped. Default value is 0.5
"""
def __init__(self, p=0.5):
self.p = p
def __call__(self, img):
"""
Args:
img (PIL Image): Image to be flipped.
Returns:
PIL Image: Randomly flipped image.
"""
if self.flag is None: #This was initially if random.random() < self.p: so it was not possible
#to apply the same transformation to another frame
self.get_paramters()
if self.flag:
return F.hflip(img)
return img
def __repr__(self):
return self.__class__.__name__ + '(p={})'.format(self.p)
def get_paramters(self):
self.flag = random.random() < self.p
def reset_params(self):
self.flag = None
Example 2:
class RandomResizedCrop(BaseTransformation):
"""Crop the given PIL Image to random size and aspect ratio.
A crop of random size (default: of 0.08 to 1.0) of the original size and a random
aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
is finally resized to given size.
This is popularly used to train the Inception networks.
Args:
size: expected output size of each edge
scale: range of size of the origin size cropped
ratio: range of aspect ratio of the origin aspect ratio cropped
interpolation: Default: PIL.Image.BILINEAR
"""
def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), interpolation=Image.BILINEAR):
if isinstance(size, tuple):
self.size = size
else:
self.size = (size, size)
if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
warnings.warn("range should be of kind (min, max)")
self.interpolation = interpolation
self.scale = scale
self.ratio = ratio
self.reset_params()
def get_params(self,img, scale, ratio):
"""Get parameters for ``crop`` for a random sized crop.
Args:
img (PIL Image): Image to be cropped.
scale (tuple): range of size of the origin size cropped
ratio (tuple): range of aspect ratio of the origin aspect ratio cropped
Returns:
tuple: params (i, j, h, w) to be passed to ``crop`` for a random
sized crop.
"""
area = img.size[0] * img.size[1]
for attempt in range(10):
target_area = random.uniform(*scale) * area
log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
aspect_ratio = math.exp(random.uniform(*log_ratio))
w = int(round(math.sqrt(target_area * aspect_ratio)))
h = int(round(math.sqrt(target_area / aspect_ratio)))
if w <= img.size[0] and h <= img.size[1]:
i = random.randint(0, img.size[1] - h)
j = random.randint(0, img.size[0] - w)
return i, j, h, w
# Fallback to central crop
in_ratio = img.size[0] / img.size[1]
if (in_ratio < min(ratio)):
w = img.size[0]
h = w / min(ratio)
elif (in_ratio > max(ratio)):
h = img.size[1]
w = h * max(ratio)
else: # whole image
w = img.size[0]
h = img.size[1]
self.i = (img.size[1] - h) // 2
self.j = (img.size[0] - w) // 2
self.h = h
self.w = w
def reset_params(self):
self.i = None
self.j = None
self.h = None
self.w = None
def __call__(self, img):
"""
Args:
img (PIL Image): Image to be cropped and resized.
Returns:
PIL Image: Randomly cropped and resized image.
"""
if self.i is None:
assert self.i == self.h == self.j == self.w
self.get_params(img, self.size)
return F.resized_crop(img, self.i, self.j, self.h,
self.w, self.size, self.interpolation)
def __repr__(self):
interpolate_str = _pil_interpolation_to_str[self.interpolation]
format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale))
format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio))
format_string += ', interpolation={0})'.format(interpolate_str)
return format_string