diff --git a/test/common_utils.py b/test/common_utils.py
index a1d188efdae..73eb6b7d0b1 100644
--- a/test/common_utils.py
+++ b/test/common_utils.py
@@ -406,6 +406,7 @@ def make_bounding_boxes(
     canvas_size=DEFAULT_SIZE,
     *,
     format=tv_tensors.BoundingBoxFormat.XYXY,
+    num_objects=1,
     dtype=None,
     device="cpu",
 ):
@@ -419,8 +420,7 @@ def sample_position(values, max_value):
 
     dtype = dtype or torch.float32
 
-    num_objects = 1
-    h, w = [torch.randint(1, c, (num_objects,)) for c in canvas_size]
+    h, w = [torch.randint(1, s, (num_objects,)) for s in canvas_size]
     y = sample_position(h, canvas_size[0])
     x = sample_position(w, canvas_size[1])
 
diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
index 3e8456e1ef1..2014b9f6515 100644
--- a/test/test_transforms_v2.py
+++ b/test/test_transforms_v2.py
@@ -17,7 +17,7 @@
 from torchvision.ops.boxes import box_iou
 from torchvision.transforms.functional import to_pil_image
 from torchvision.transforms.v2 import functional as F
-from torchvision.transforms.v2._utils import check_type, is_pure_tensor, query_chw
+from torchvision.transforms.v2._utils import is_pure_tensor, query_chw
 from transforms_v2_legacy_utils import (
     make_bounding_boxes,
     make_detection_mask,
@@ -62,22 +62,6 @@ def parametrize(transforms_with_inputs):
     )
 
 
-def auto_augment_adapter(transform, input, device):
-    adapted_input = {}
-    image_or_video_found = False
-    for key, value in input.items():
-        if isinstance(value, (tv_tensors.BoundingBoxes, tv_tensors.Mask)):
-            # AA transforms don't support bounding boxes or masks
-            continue
-        elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor, PIL.Image.Image)):
-            if image_or_video_found:
-                # AA transforms only support a single image or video
-                continue
-            image_or_video_found = True
-        adapted_input[key] = value
-    return adapted_input
-
-
 def linear_transformation_adapter(transform, input, device):
     flat_inputs = list(input.values())
     c, h, w = query_chw(
@@ -93,58 +77,19 @@ def linear_transformation_adapter(transform, input, device):
     return {key: value for key, value in input.items() if not isinstance(value, PIL.Image.Image)}
 
 
-def normalize_adapter(transform, input, device):
-    adapted_input = {}
-    for key, value in input.items():
-        if isinstance(value, PIL.Image.Image):
-            # normalize doesn't support PIL images
-            continue
-        elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor)):
-            # normalize doesn't support integer images
-            value = F.to_dtype(value, torch.float32, scale=True)
-        adapted_input[key] = value
-    return adapted_input
-
-
 class TestSmoke:
     @pytest.mark.parametrize(
         ("transform", "adapter"),
         [
-            (transforms.RandomErasing(p=1.0), None),
-            (transforms.AugMix(), auto_augment_adapter),
-            (transforms.AutoAugment(), auto_augment_adapter),
-            (transforms.RandAugment(), auto_augment_adapter),
-            (transforms.TrivialAugmentWide(), auto_augment_adapter),
             (transforms.ColorJitter(brightness=0.1, contrast=0.2, saturation=0.3, hue=0.15), None),
             (transforms.Grayscale(), None),
-            (transforms.RandomAdjustSharpness(sharpness_factor=0.5, p=1.0), None),
-            (transforms.RandomAutocontrast(p=1.0), None),
-            (transforms.RandomEqualize(p=1.0), None),
             (transforms.RandomGrayscale(p=1.0), None),
-            (transforms.RandomInvert(p=1.0), None),
             (transforms.RandomChannelPermutation(), None),
             (transforms.RandomPhotometricDistort(p=1.0), None),
-            (transforms.RandomPosterize(bits=4, p=1.0), None),
-            (transforms.RandomSolarize(threshold=0.5, p=1.0), None),
-            (transforms.CenterCrop([16, 16]), None),
-            (transforms.ElasticTransform(sigma=1.0), None),
-            (transforms.Pad(4), None),
-            (transforms.RandomAffine(degrees=30.0), None),
-            (transforms.RandomCrop([16, 16], pad_if_needed=True), None),
-            (transforms.RandomHorizontalFlip(p=1.0), None),
-            (transforms.RandomPerspective(p=1.0), None),
-            (transforms.RandomResize(min_size=10, max_size=20, antialias=True), None),
-            (transforms.RandomResizedCrop([16, 16], antialias=True), None),
-            (transforms.RandomRotation(degrees=30), None),
             (transforms.RandomShortestSize(min_size=10, antialias=True), None),
-            (transforms.RandomVerticalFlip(p=1.0), None),
             (transforms.RandomZoomOut(p=1.0), None),
-            (transforms.Resize([16, 16], antialias=True), None),
             (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None),
-            (transforms.ClampBoundingBoxes(), None),
-            (transforms.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.CXCYWH), None),
             (transforms.ConvertImageDtype(), None),
-            (transforms.GaussianBlur(kernel_size=3), None),
             (
                 transforms.LinearTransformation(
                     # These are just dummy values that will be filled by the adapter. We can't define them upfront,
@@ -154,9 +99,6 @@ class TestSmoke:
                 ),
                 linear_transformation_adapter,
             ),
-            (transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), normalize_adapter),
-            (transforms.ToDtype(torch.float64), None),
-            (transforms.UniformTemporalSubsample(num_samples=2), None),
         ],
         ids=lambda transform: type(transform).__name__,
     )
diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py
index cfad5fe064e..015ad721b1c 100644
--- a/test/test_transforms_v2_refactored.py
+++ b/test/test_transforms_v2_refactored.py
@@ -6,6 +6,7 @@
 import math
 import pickle
 import re
+from copy import deepcopy
 from pathlib import Path
 from unittest import mock
 
@@ -38,13 +39,14 @@
 
 from torch import nn
 from torch.testing import assert_close
-from torch.utils._pytree import tree_map
+from torch.utils._pytree import tree_flatten, tree_map
 from torch.utils.data import DataLoader, default_collate
 from torchvision import tv_tensors
 
 from torchvision.transforms._functional_tensor import _max_value as get_max_value
 from torchvision.transforms.functional import pil_modes_mapping
 from torchvision.transforms.v2 import functional as F
+from torchvision.transforms.v2._utils import check_type, is_pure_tensor
 from torchvision.transforms.v2.functional._geometry import _get_perspective_coeffs
 from torchvision.transforms.v2.functional._utils import _get_kernel, _register_kernel_internal
 
@@ -276,7 +278,120 @@ def _check_transform_v1_compatibility(transform, input, *, rtol, atol):
         _script(v1_transform)(input)
 
 
-def check_transform(transform, input, check_v1_compatibility=True):
+def _make_transform_sample(transform, *, image_or_video, adapter):
+    device = image_or_video.device if isinstance(image_or_video, torch.Tensor) else "cpu"
+    size = F.get_size(image_or_video)
+    input = dict(
+        image_or_video=image_or_video,
+        image_tv_tensor=make_image(size, device=device),
+        video_tv_tensor=make_video(size, device=device),
+        image_pil=make_image_pil(size),
+        bounding_boxes_xyxy=make_bounding_boxes(
+            size, format=tv_tensors.BoundingBoxFormat.XYXY, num_objects=3, device=device
+        ),
+        bounding_boxes_xywh=make_bounding_boxes(
+            size,
+            format=tv_tensors.BoundingBoxFormat.XYWH,
+            num_objects=4,
+            device=device,
+        ),
+        bounding_boxes_cxcywh=make_bounding_boxes(
+            size,
+            format=tv_tensors.BoundingBoxFormat.CXCYWH,
+            num_objects=5,
+            device=device,
+        ),
+        bounding_boxes_degenerate_xyxy=tv_tensors.BoundingBoxes(
+            [
+                [0, 0, 0, 0],  # no height or width
+                [0, 0, 0, 1],  # no height
+                [0, 0, 1, 0],  # no width
+                [2, 0, 1, 1],  # x1 > x2, y1 < y2
+                [0, 2, 1, 1],  # x1 < x2, y1 > y2
+                [2, 2, 1, 1],  # x1 > x2, y1 > y2
+            ],
+            format=tv_tensors.BoundingBoxFormat.XYXY,
+            canvas_size=size,
+            device=device,
+        ),
+        bounding_boxes_degenerate_xywh=tv_tensors.BoundingBoxes(
+            [
+                [0, 0, 0, 0],  # no height or width
+                [0, 0, 0, 1],  # no height
+                [0, 0, 1, 0],  # no width
+                [0, 0, 1, -1],  # negative height
+                [0, 0, -1, 1],  # negative width
+                [0, 0, -1, -1],  # negative height and width
+            ],
+            format=tv_tensors.BoundingBoxFormat.XYWH,
+            canvas_size=size,
+            device=device,
+        ),
+        bounding_boxes_degenerate_cxcywh=tv_tensors.BoundingBoxes(
+            [
+                [0, 0, 0, 0],  # no height or width
+                [0, 0, 0, 1],  # no height
+                [0, 0, 1, 0],  # no width
+                [0, 0, 1, -1],  # negative height
+                [0, 0, -1, 1],  # negative width
+                [0, 0, -1, -1],  # negative height and width
+            ],
+            format=tv_tensors.BoundingBoxFormat.CXCYWH,
+            canvas_size=size,
+            device=device,
+        ),
+        detection_mask=make_detection_mask(size, device=device),
+        segmentation_mask=make_segmentation_mask(size, device=device),
+        int=0,
+        float=0.0,
+        bool=True,
+        none=None,
+        str="str",
+        path=Path.cwd(),
+        object=object(),
+        tensor=torch.empty(5),
+        array=np.empty(5),
+    )
+    if adapter is not None:
+        input = adapter(transform, input, device)
+    return input
+
+
+def _check_transform_sample_input_smoke(transform, input, *, adapter):
+    if not check_type(input, (is_pure_tensor, PIL.Image.Image, tv_tensors.Image, tv_tensors.Video)):
+        return
+    image_or_video = input
+
+    for container_type in [dict, list, tuple]:
+        input = _make_transform_sample(
+            # adapter might change transform inplace
+            transform=transform if adapter is None else deepcopy(transform),
+            image_or_video=image_or_video,
+            adapter=adapter,
+        )
+
+        if container_type in {tuple, list}:
+            input = container_type(input.values())
+
+        input_flat, input_spec = tree_flatten(input)
+
+        with freeze_rng_state():
+            torch.manual_seed(0)
+            output = transform(input)
+        output_flat, output_spec = tree_flatten(output)
+
+        assert output_spec == input_spec
+
+        for output_item, input_item, should_be_transformed in zip(
+            output_flat, input_flat, transforms.Transform()._needs_transform_list(input_flat)
+        ):
+            if should_be_transformed:
+                assert type(output_item) is type(input_item)
+            else:
+                assert output_item is input_item
+
+
+def check_transform(transform, input, check_v1_compatibility=True, check_sample_input=True):
     pickle.loads(pickle.dumps(transform))
 
     output = transform(input)
@@ -289,6 +404,11 @@ def check_transform(transform, input, check_v1_compatibility=True):
     if isinstance(input, tv_tensors.BoundingBoxes) and not isinstance(transform, transforms.ConvertBoundingBoxFormat):
         assert output.format == input.format
 
+    if check_sample_input:
+        _check_transform_sample_input_smoke(
+            transform, input, adapter=check_sample_input if callable(check_sample_input) else None
+        )
+
     if check_v1_compatibility:
         _check_transform_v1_compatibility(transform, input, **_to_tolerances(check_v1_compatibility))
 
@@ -1800,7 +1920,7 @@ def test_transform(self, make_input, input_dtype, output_dtype, device, scale, a
         input = make_input(dtype=input_dtype, device=device)
         if as_dict:
             output_dtype = {type(input): output_dtype}
-        check_transform(transforms.ToDtype(dtype=output_dtype, scale=scale), input)
+        check_transform(transforms.ToDtype(dtype=output_dtype, scale=scale), input, check_sample_input=not as_dict)
 
     def reference_convert_dtype_image_tensor(self, image, dtype=torch.float, scale=False):
         input_dtype = image.dtype
@@ -2601,9 +2721,13 @@ def test_functional_image_correctness(self, kwargs):
     def test_transform(self, param, value, make_input):
         input = make_input(self.INPUT_SIZE)
 
+        check_sample_input = True
         if param == "fill":
-            if isinstance(input, tv_tensors.Mask) and isinstance(value, (tuple, list)):
-                pytest.skip("F.pad_mask doesn't support non-scalar fill.")
+            if isinstance(value, (tuple, list)):
+                if isinstance(input, tv_tensors.Mask):
+                    pytest.skip("F.pad_mask doesn't support non-scalar fill.")
+                else:
+                    check_sample_input = False
 
         kwargs = dict(
             # 1. size is required
@@ -2618,6 +2742,7 @@ def test_transform(self, param, value, make_input):
             transforms.RandomCrop(**kwargs, pad_if_needed=True),
             input,
             check_v1_compatibility=param != "fill" or isinstance(value, (int, float)),
+            check_sample_input=check_sample_input,
         )
 
     @pytest.mark.parametrize("padding", [1, (1, 1), (1, 1, 1, 1)])
@@ -2803,9 +2928,13 @@ def test_functional_signature(self, kernel, input_type):
     @pytest.mark.parametrize("device", cpu_and_cuda())
     def test_transform(self, make_input, device):
         input = make_input(device=device)
-        check_transform(
-            transforms.RandomErasing(p=1), input, check_v1_compatibility=not isinstance(input, PIL.Image.Image)
-        )
+
+        with pytest.warns(UserWarning, match="currently passing through inputs of type"):
+            check_transform(
+                transforms.RandomErasing(p=1),
+                input,
+                check_v1_compatibility=not isinstance(input, PIL.Image.Image),
+            )
 
     def _reference_erase_image(self, image, *, i, j, h, w, v):
         mask = torch.zeros_like(image, dtype=torch.bool)
@@ -2877,18 +3006,6 @@ def test_transform_errors(self):
         with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"):
             transform._get_params([make_image()])
 
-    @pytest.mark.parametrize("make_input", [make_bounding_boxes, make_detection_mask])
-    def test_transform_passthrough(self, make_input):
-        transform = transforms.RandomErasing(p=1)
-
-        input = make_input(self.INPUT_SIZE)
-
-        with pytest.warns(UserWarning, match="currently passing through inputs of type"):
-            # RandomErasing requires an image or video to be present
-            _, output = transform(make_image(self.INPUT_SIZE), input)
-
-        assert output is input
-
 
 class TestGaussianBlur:
     @pytest.mark.parametrize("kernel_size", [1, 3, (3, 1), [3, 5]])
@@ -3105,6 +3222,21 @@ def test_correctness_shear_translate(self, transform_id, magnitude, interpolatio
         else:
             assert_close(actual, expected, rtol=0, atol=1)
 
+    def _sample_input_adapter(self, transform, input, device):
+        adapted_input = {}
+        image_or_video_found = False
+        for key, value in input.items():
+            if isinstance(value, (tv_tensors.BoundingBoxes, tv_tensors.Mask)):
+                # AA transforms don't support bounding boxes or masks
+                continue
+            elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor, PIL.Image.Image)):
+                if image_or_video_found:
+                    # AA transforms only support a single image or video
+                    continue
+                image_or_video_found = True
+            adapted_input[key] = value
+        return adapted_input
+
     @pytest.mark.parametrize(
         "transform",
         [transforms.AutoAugment(), transforms.RandAugment(), transforms.TrivialAugmentWide(), transforms.AugMix()],
@@ -3129,7 +3261,9 @@ def test_transform_smoke(self, transform, make_input, dtype, device):
         # For v2, we changed the random sampling of the AA transforms. This makes it impossible to compare the v1
         # and v2 outputs without complicated mocking and monkeypatching. Thus, we skip the v1 compatibility checks
        # here and only check if we can script the v2 transform and subsequently call the result.
-        check_transform(transform, input, check_v1_compatibility=False)
+        check_transform(
+            transform, input, check_v1_compatibility=False, check_sample_input=self._sample_input_adapter
+        )
 
         if type(input) is torch.Tensor and dtype is torch.uint8:
             _script(transform)(input)
@@ -4103,9 +4237,25 @@ def test_functional_error(self):
         with pytest.raises(ValueError, match="std evaluated to zero, leading to division by zero"):
             F.normalize_image(make_image(dtype=torch.float32), mean=self.MEAN, std=std)
 
+    def _sample_input_adapter(self, transform, input, device):
+        adapted_input = {}
+        for key, value in input.items():
+            if isinstance(value, PIL.Image.Image):
+                # normalize doesn't support PIL images
+                continue
+            elif check_type(value, (is_pure_tensor, tv_tensors.Image, tv_tensors.Video)):
+                # normalize doesn't support integer images
+                value = F.to_dtype(value, torch.float32, scale=True)
+            adapted_input[key] = value
+        return adapted_input
+
     @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video])
     def test_transform(self, make_input):
-        check_transform(transforms.Normalize(mean=self.MEAN, std=self.STD), make_input(dtype=torch.float32))
+        check_transform(
+            transforms.Normalize(mean=self.MEAN, std=self.STD),
+            make_input(dtype=torch.float32),
+            check_sample_input=self._sample_input_adapter,
+        )
 
     def _assert_is_standard_normal_distributed(self, tensor):
         result = scipy.stats.kstest(tensor.flatten().cpu(), cdf="norm", args=(0, 1))
@@ -4600,7 +4750,7 @@ def test_functional_signature(self, functional, kernel, input_type):
     )
     @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop])
     def test_transform(self, make_input, transform_cls):
-        check_transform(transform_cls(size=self.OUTPUT_SIZE), make_input(self.INPUT_SIZE))
+        check_transform(transform_cls(size=self.OUTPUT_SIZE), make_input(self.INPUT_SIZE), check_sample_input=False)
 
     @pytest.mark.parametrize("make_input", [make_bounding_boxes, make_detection_mask])
     @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop])
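Below is a small standalone sketch, not part of the patch, of the invariant that the new _check_transform_sample_input_smoke helper asserts: applying a v2 transform to a heterogeneous container returns a container with the same pytree structure, entries that cannot be transformed are passed through untouched, and transformed entries keep their type. The sample keys here are made up for illustration, and the snippet assumes a torchvision build with the transforms.v2 API (0.16+).

# Standalone illustration (assumption: torchvision >= 0.16 with transforms.v2); not part of the diff above.
import torch
from torch.utils._pytree import tree_flatten
from torchvision import tv_tensors
from torchvision.transforms import v2 as transforms

transform = transforms.RandomHorizontalFlip(p=1.0)
sample = {
    "image": tv_tensors.Image(torch.randint(0, 256, (3, 16, 16), dtype=torch.uint8)),
    "label": 3,  # plain Python values are passed through by v2 transforms
    "name": "sample.jpg",
}

output = transform(sample)
_, input_spec = tree_flatten(sample)
_, output_spec = tree_flatten(output)

assert output_spec == input_spec  # same container structure (same keys, same nesting)
assert output["label"] is sample["label"]  # pass-through entries are returned as-is
assert output["name"] is sample["name"]
assert type(output["image"]) is type(sample["image"])  # transformed entries keep their type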