diff --git a/test/common_utils.py b/test/common_utils.py
index a1d188efdae..73eb6b7d0b1 100644
--- a/test/common_utils.py
+++ b/test/common_utils.py
@@ -406,6 +406,7 @@ def make_bounding_boxes(
     canvas_size=DEFAULT_SIZE,
     *,
     format=tv_tensors.BoundingBoxFormat.XYXY,
+    num_objects=1,
     dtype=None,
     device="cpu",
 ):
@@ -419,8 +420,7 @@ def sample_position(values, max_value):
 
     dtype = dtype or torch.float32
 
-    num_objects = 1
-    h, w = [torch.randint(1, c, (num_objects,)) for c in canvas_size]
+    h, w = [torch.randint(1, s, (num_objects,)) for s in canvas_size]
     y = sample_position(h, canvas_size[0])
     x = sample_position(w, canvas_size[1])
 
diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
index 3e8456e1ef1..2014b9f6515 100644
--- a/test/test_transforms_v2.py
+++ b/test/test_transforms_v2.py
@@ -17,7 +17,7 @@
 from torchvision.ops.boxes import box_iou
 from torchvision.transforms.functional import to_pil_image
 from torchvision.transforms.v2 import functional as F
-from torchvision.transforms.v2._utils import check_type, is_pure_tensor, query_chw
+from torchvision.transforms.v2._utils import is_pure_tensor, query_chw
 from transforms_v2_legacy_utils import (
     make_bounding_boxes,
     make_detection_mask,
@@ -62,22 +62,6 @@ def parametrize(transforms_with_inputs):
     )
 
 
-def auto_augment_adapter(transform, input, device):
-    adapted_input = {}
-    image_or_video_found = False
-    for key, value in input.items():
-        if isinstance(value, (tv_tensors.BoundingBoxes, tv_tensors.Mask)):
-            # AA transforms don't support bounding boxes or masks
-            continue
-        elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor, PIL.Image.Image)):
-            if image_or_video_found:
-                # AA transforms only support a single image or video
-                continue
-            image_or_video_found = True
-        adapted_input[key] = value
-    return adapted_input
-
-
 def linear_transformation_adapter(transform, input, device):
     flat_inputs = list(input.values())
     c, h, w = query_chw(
@@ -93,58 +77,19 @@ def linear_transformation_adapter(transform, input, device):
     return {key: value for key, value in input.items() if not isinstance(value, PIL.Image.Image)}
 
 
-def normalize_adapter(transform, input, device):
-    adapted_input = {}
-    for key, value in input.items():
-        if isinstance(value, PIL.Image.Image):
-            # normalize doesn't support PIL images
-            continue
-        elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor)):
-            # normalize doesn't support integer images
-            value = F.to_dtype(value, torch.float32, scale=True)
-        adapted_input[key] = value
-    return adapted_input
-
-
 class TestSmoke:
     @pytest.mark.parametrize(
         ("transform", "adapter"),
         [
-            (transforms.RandomErasing(p=1.0), None),
-            (transforms.AugMix(), auto_augment_adapter),
-            (transforms.AutoAugment(), auto_augment_adapter),
-            (transforms.RandAugment(), auto_augment_adapter),
-            (transforms.TrivialAugmentWide(), auto_augment_adapter),
             (transforms.ColorJitter(brightness=0.1, contrast=0.2, saturation=0.3, hue=0.15), None),
             (transforms.Grayscale(), None),
-            (transforms.RandomAdjustSharpness(sharpness_factor=0.5, p=1.0), None),
-            (transforms.RandomAutocontrast(p=1.0), None),
-            (transforms.RandomEqualize(p=1.0), None),
             (transforms.RandomGrayscale(p=1.0), None),
-            (transforms.RandomInvert(p=1.0), None),
             (transforms.RandomChannelPermutation(), None),
             (transforms.RandomPhotometricDistort(p=1.0), None),
-            (transforms.RandomPosterize(bits=4, p=1.0), None),
-            (transforms.RandomSolarize(threshold=0.5, p=1.0), None),
-            (transforms.CenterCrop([16, 16]), None),
-            (transforms.ElasticTransform(sigma=1.0), None),
-            (transforms.Pad(4), None),
-            (transforms.RandomAffine(degrees=30.0), None),
-            (transforms.RandomCrop([16, 16], pad_if_needed=True), None),
-            (transforms.RandomHorizontalFlip(p=1.0), None),
-            (transforms.RandomPerspective(p=1.0), None),
-            (transforms.RandomResize(min_size=10, max_size=20, antialias=True), None),
-            (transforms.RandomResizedCrop([16, 16], antialias=True), None),
-            (transforms.RandomRotation(degrees=30), None),
             (transforms.RandomShortestSize(min_size=10, antialias=True), None),
-            (transforms.RandomVerticalFlip(p=1.0), None),
             (transforms.RandomZoomOut(p=1.0), None),
-            (transforms.Resize([16, 16], antialias=True), None),
             (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None),
-            (transforms.ClampBoundingBoxes(), None),
-            (transforms.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.CXCYWH), None),
             (transforms.ConvertImageDtype(), None),
-            (transforms.GaussianBlur(kernel_size=3), None),
             (
                 transforms.LinearTransformation(
                     # These are just dummy values that will be filled by the adapter. We can't define them upfront,
@@ -154,9 +99,6 @@ class TestSmoke:
                 ),
                 linear_transformation_adapter,
             ),
-            (transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), normalize_adapter),
-            (transforms.ToDtype(torch.float64), None),
-            (transforms.UniformTemporalSubsample(num_samples=2), None),
         ],
         ids=lambda transform: type(transform).__name__,
     )
diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py
index cfad5fe064e..015ad721b1c 100644
--- a/test/test_transforms_v2_refactored.py
+++ b/test/test_transforms_v2_refactored.py
@@ -6,6 +6,7 @@
 import math
 import pickle
 import re
+from copy import deepcopy
 from pathlib import Path
 from unittest import mock
 
@@ -38,13 +39,14 @@
 
 from torch import nn
 from torch.testing import assert_close
-from torch.utils._pytree import tree_map
+from torch.utils._pytree import tree_flatten, tree_map
 from torch.utils.data import DataLoader, default_collate
 from torchvision import tv_tensors
 
 from torchvision.transforms._functional_tensor import _max_value as get_max_value
 from torchvision.transforms.functional import pil_modes_mapping
 from torchvision.transforms.v2 import functional as F
+from torchvision.transforms.v2._utils import check_type, is_pure_tensor
 from torchvision.transforms.v2.functional._geometry import _get_perspective_coeffs
 from torchvision.transforms.v2.functional._utils import _get_kernel, _register_kernel_internal
 
@@ -276,7 +278,120 @@ def _check_transform_v1_compatibility(transform, input, *, rtol, atol):
         _script(v1_transform)(input)
 
 
-def check_transform(transform, input, check_v1_compatibility=True):
+def _make_transform_sample(transform, *, image_or_video, adapter):
+    device = image_or_video.device if isinstance(image_or_video, torch.Tensor) else "cpu"
+    size = F.get_size(image_or_video)
+    input = dict(
+        image_or_video=image_or_video,
+        image_tv_tensor=make_image(size, device=device),
+        video_tv_tensor=make_video(size, device=device),
+        image_pil=make_image_pil(size),
+        bounding_boxes_xyxy=make_bounding_boxes(
+            size, format=tv_tensors.BoundingBoxFormat.XYXY, num_objects=3, device=device
+        ),
+        bounding_boxes_xywh=make_bounding_boxes(
+            size,
+            format=tv_tensors.BoundingBoxFormat.XYWH,
+            num_objects=4,
+            device=device,
+        ),
+        bounding_boxes_cxcywh=make_bounding_boxes(
+            size,
+            format=tv_tensors.BoundingBoxFormat.CXCYWH,
+            num_objects=5,
+            device=device,
+        ),
+        bounding_boxes_degenerate_xyxy=tv_tensors.BoundingBoxes(
+            [
+                [0, 0, 0, 0],  # no height or width
+                [0, 0, 0, 1],  # no height
+                [0, 0, 1, 0],  # no width
+                [2, 0, 1, 1],  # x1 > x2, y1 < y2
+                [0, 2, 1, 1],  # x1 < x2, y1 > y2
+                [2, 2, 1, 1],  # x1 > x2, y1 > y2
+            ],
+            format=tv_tensors.BoundingBoxFormat.XYXY,
+            canvas_size=size,
+            device=device,
+        ),
+        bounding_boxes_degenerate_xywh=tv_tensors.BoundingBoxes(
+            [
+                [0, 0, 0, 0],  # no height or width
+                [0, 0, 0, 1],  # no height
+                [0, 0, 1, 0],  # no width
+                [0, 0, 1, -1],  # negative height
+                [0, 0, -1, 1],  # negative width
+                [0, 0, -1, -1],  # negative height and width
+            ],
+            format=tv_tensors.BoundingBoxFormat.XYWH,
+            canvas_size=size,
+            device=device,
+        ),
+        bounding_boxes_degenerate_cxcywh=tv_tensors.BoundingBoxes(
+            [
+                [0, 0, 0, 0],  # no height or width
+                [0, 0, 0, 1],  # no height
+                [0, 0, 1, 0],  # no width
+                [0, 0, 1, -1],  # negative height
+                [0, 0, -1, 1],  # negative width
+                [0, 0, -1, -1],  # negative height and width
+            ],
+            format=tv_tensors.BoundingBoxFormat.CXCYWH,
+            canvas_size=size,
+            device=device,
+        ),
+        detection_mask=make_detection_mask(size, device=device),
+        segmentation_mask=make_segmentation_mask(size, device=device),
+        int=0,
+        float=0.0,
+        bool=True,
+        none=None,
+        str="str",
+        path=Path.cwd(),
+        object=object(),
+        tensor=torch.empty(5),
+        array=np.empty(5),
+    )
+    if adapter is not None:
+        input = adapter(transform, input, device)
+    return input
+
+
+def _check_transform_sample_input_smoke(transform, input, *, adapter):
+    if not check_type(input, (is_pure_tensor, PIL.Image.Image, tv_tensors.Image, tv_tensors.Video)):
+        return
+    image_or_video = input
+
+    for container_type in [dict, list, tuple]:
+        input = _make_transform_sample(
+            # adapter might change transform inplace
+            transform=transform if adapter is None else deepcopy(transform),
+            image_or_video=image_or_video,
+            adapter=adapter,
+        )
+
+        if container_type in {tuple, list}:
+            input = container_type(input.values())
+
+        input_flat, input_spec = tree_flatten(input)
+
+        with freeze_rng_state():
+            torch.manual_seed(0)
+            output = transform(input)
+        output_flat, output_spec = tree_flatten(output)
+
+        assert output_spec == input_spec
+
+        for output_item, input_item, should_be_transformed in zip(
+            output_flat, input_flat, transforms.Transform()._needs_transform_list(input_flat)
+        ):
+            if should_be_transformed:
+                assert type(output_item) is type(input_item)
+            else:
+                assert output_item is input_item
+
+
+def check_transform(transform, input, check_v1_compatibility=True, check_sample_input=True):
     pickle.loads(pickle.dumps(transform))
 
     output = transform(input)
@@ -289,6 +404,11 @@ def check_transform(transform, input, check_v1_compatibility=True):
     if isinstance(input, tv_tensors.BoundingBoxes) and not isinstance(transform, transforms.ConvertBoundingBoxFormat):
         assert output.format == input.format
 
+    if check_sample_input:
+        _check_transform_sample_input_smoke(
+            transform, input, adapter=check_sample_input if callable(check_sample_input) else None
+        )
+
     if check_v1_compatibility:
         _check_transform_v1_compatibility(transform, input, **_to_tolerances(check_v1_compatibility))
 
@@ -1800,7 +1920,7 @@ def test_transform(self, make_input, input_dtype, output_dtype, device, scale, a
         input = make_input(dtype=input_dtype, device=device)
         if as_dict:
             output_dtype = {type(input): output_dtype}
-        check_transform(transforms.ToDtype(dtype=output_dtype, scale=scale), input)
+        check_transform(transforms.ToDtype(dtype=output_dtype, scale=scale), input, check_sample_input=not as_dict)
 
     def reference_convert_dtype_image_tensor(self, image, dtype=torch.float, scale=False):
         input_dtype = image.dtype
@@ -2601,9 +2721,13 @@ def test_functional_image_correctness(self, kwargs):
     def test_transform(self, param, value, make_input):
         input = make_input(self.INPUT_SIZE)
 
+        check_sample_input = True
         if param == "fill":
-            if isinstance(input, tv_tensors.Mask) and isinstance(value, (tuple, list)):
-                pytest.skip("F.pad_mask doesn't support non-scalar fill.")
+            if isinstance(value, (tuple, list)):
+                if isinstance(input, tv_tensors.Mask):
+                    pytest.skip("F.pad_mask doesn't support non-scalar fill.")
+                else:
+                    check_sample_input = False
 
         kwargs = dict(
             # 1. size is required
@@ -2618,6 +2742,7 @@ def test_transform(self, param, value, make_input):
             transforms.RandomCrop(**kwargs, pad_if_needed=True),
             input,
             check_v1_compatibility=param != "fill" or isinstance(value, (int, float)),
+            check_sample_input=check_sample_input,
         )
 
     @pytest.mark.parametrize("padding", [1, (1, 1), (1, 1, 1, 1)])
@@ -2803,9 +2928,13 @@ def test_functional_signature(self, kernel, input_type):
     @pytest.mark.parametrize("device", cpu_and_cuda())
     def test_transform(self, make_input, device):
         input = make_input(device=device)
-        check_transform(
-            transforms.RandomErasing(p=1), input, check_v1_compatibility=not isinstance(input, PIL.Image.Image)
-        )
+
+        with pytest.warns(UserWarning, match="currently passing through inputs of type"):
+            check_transform(
+                transforms.RandomErasing(p=1),
+                input,
+                check_v1_compatibility=not isinstance(input, PIL.Image.Image),
+            )
 
     def _reference_erase_image(self, image, *, i, j, h, w, v):
         mask = torch.zeros_like(image, dtype=torch.bool)
@@ -2877,18 +3006,6 @@ def test_transform_errors(self):
         with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"):
             transform._get_params([make_image()])
 
-    @pytest.mark.parametrize("make_input", [make_bounding_boxes, make_detection_mask])
-    def test_transform_passthrough(self, make_input):
-        transform = transforms.RandomErasing(p=1)
-
-        input = make_input(self.INPUT_SIZE)
-
-        with pytest.warns(UserWarning, match="currently passing through inputs of type"):
-            # RandomErasing requires an image or video to be present
-            _, output = transform(make_image(self.INPUT_SIZE), input)
-
-        assert output is input
-
 
 class TestGaussianBlur:
     @pytest.mark.parametrize("kernel_size", [1, 3, (3, 1), [3, 5]])
@@ -3105,6 +3222,21 @@ def test_correctness_shear_translate(self, transform_id, magnitude, interpolatio
         else:
             assert_close(actual, expected, rtol=0, atol=1)
 
+    def _sample_input_adapter(self, transform, input, device):
+        adapted_input = {}
+        image_or_video_found = False
+        for key, value in input.items():
+            if isinstance(value, (tv_tensors.BoundingBoxes, tv_tensors.Mask)):
+                # AA transforms don't support bounding boxes or masks
+                continue
+            elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor, PIL.Image.Image)):
+                if image_or_video_found:
+                    # AA transforms only support a single image or video
+                    continue
+                image_or_video_found = True
+            adapted_input[key] = value
+        return adapted_input
+
     @pytest.mark.parametrize(
         "transform",
         [transforms.AutoAugment(), transforms.RandAugment(), transforms.TrivialAugmentWide(), transforms.AugMix()],
@@ -3129,7 +3261,9 @@ def test_transform_smoke(self, transform, make_input, dtype, device):
         # For v2, we changed the random sampling of the AA transforms. This makes it impossible to compare the v1
         # and v2 outputs without complicated mocking and monkeypatching. Thus, we skip the v1 compatibility checks
        # here and only check if we can script the v2 transform and subsequently call the result.
-        check_transform(transform, input, check_v1_compatibility=False)
+        check_transform(
+            transform, input, check_v1_compatibility=False, check_sample_input=self._sample_input_adapter
+        )
 
         if type(input) is torch.Tensor and dtype is torch.uint8:
             _script(transform)(input)
@@ -4103,9 +4237,25 @@ def test_functional_error(self):
         with pytest.raises(ValueError, match="std evaluated to zero, leading to division by zero"):
             F.normalize_image(make_image(dtype=torch.float32), mean=self.MEAN, std=std)
 
+    def _sample_input_adapter(self, transform, input, device):
+        adapted_input = {}
+        for key, value in input.items():
+            if isinstance(value, PIL.Image.Image):
+                # normalize doesn't support PIL images
+                continue
+            elif check_type(value, (is_pure_tensor, tv_tensors.Image, tv_tensors.Video)):
+                # normalize doesn't support integer images
+                value = F.to_dtype(value, torch.float32, scale=True)
+            adapted_input[key] = value
+        return adapted_input
+
     @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video])
     def test_transform(self, make_input):
-        check_transform(transforms.Normalize(mean=self.MEAN, std=self.STD), make_input(dtype=torch.float32))
+        check_transform(
+            transforms.Normalize(mean=self.MEAN, std=self.STD),
+            make_input(dtype=torch.float32),
+            check_sample_input=self._sample_input_adapter,
+        )
 
     def _assert_is_standard_normal_distributed(self, tensor):
         result = scipy.stats.kstest(tensor.flatten().cpu(), cdf="norm", args=(0, 1))
@@ -4600,7 +4750,7 @@ def test_functional_signature(self, functional, kernel, input_type):
     )
     @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop])
     def test_transform(self, make_input, transform_cls):
-        check_transform(transform_cls(size=self.OUTPUT_SIZE), make_input(self.INPUT_SIZE))
+        check_transform(transform_cls(size=self.OUTPUT_SIZE), make_input(self.INPUT_SIZE), check_sample_input=False)
 
     @pytest.mark.parametrize("make_input", [make_bounding_boxes, make_detection_mask])
     @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop])
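Below is a small standalone sketch, not part of the patch, of the invariant that the new _check_transform_sample_input_smoke helper asserts: applying a v2 transform to a heterogeneous container returns a container with the same pytree structure, entries that cannot be transformed are passed through untouched, and transformed entries keep their type. The sample keys here are made up for illustration, and the snippet assumes a torchvision build with the transforms.v2 API (0.16+).

# Standalone illustration (assumption: torchvision >= 0.16 with transforms.v2); not part of the diff above.
import torch
from torch.utils._pytree import tree_flatten
from torchvision import tv_tensors
from torchvision.transforms import v2 as transforms

transform = transforms.RandomHorizontalFlip(p=1.0)
sample = {
    "image": tv_tensors.Image(torch.randint(0, 256, (3, 16, 16), dtype=torch.uint8)),
    "label": 3,  # plain Python values are passed through by v2 transforms
    "name": "sample.jpg",
}

output = transform(sample)
_, input_spec = tree_flatten(sample)
_, output_spec = tree_flatten(output)

assert output_spec == input_spec  # same container structure (same keys, same nesting)
assert output["label"] is sample["label"]  # pass-through entries are returned as-is
assert output["name"] is sample["name"]
assert type(output["image"]) is type(sample["image"])  # transformed entries keep their type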