From 31bebdea147c96f8a00a0d55931858bf727ae370 Mon Sep 17 00:00:00 2001
From: Jack Zhang <jackzhxng@meta.com>
Date: Mon, 5 Aug 2024 13:07:02 -0700
Subject: [PATCH 1/3] Fix torch export issue

Summary:
X-link: https://github.com/fairinternal/detectron2/pull/604

Pull Request resolved: https://github.com/facebookresearch/detectron2/pull/5334

Add a check to deal with dynamic shapes so that the model can be exported with `torch.export`. This check prevents the graph break caused by the SymInt by delaying the assertion to runtime.

Reviewed By: wat3rBro

Differential Revision: D60126415

fbshipit-source-id: a2a75530db523bfdde984b890595e02360d8e07f
---
 detectron2/export/c10.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/detectron2/export/c10.py b/detectron2/export/c10.py
index adbc62bea7..35380af01a 100644
--- a/detectron2/export/c10.py
+++ b/detectron2/export/c10.py
@@ -84,6 +84,11 @@ def set(self, name, value):
         else:
             data_len = len(value)
         if len(self.batch_extra_fields):
+            # If we are tracing with Dynamo, the check here is needed since len(self)
+            # represents the number of bounding boxes detected in the image and thus is
+            # an unbounded SymInt.
+            if torch._utils.is_compiling():
+                torch._check(len(self) == data_len)
             assert (
                 len(self) == data_len
             ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self))

From bcfd464d0c810f0442d91a349c0f6df945467143 Mon Sep 17 00:00:00 2001
From: generatedunixname89002005307016
 <generatedunixname89002005307016@meta.com>
Date: Fri, 9 Aug 2024 03:00:43 -0700
Subject: [PATCH 2/3] upgrade pyre version in `fbcode/vision` - batch 1

Differential Revision: D60992191

fbshipit-source-id: f826042c9d5b4f9b72b142fcef13f5772e3b9a8d
---
 projects/DensePose/densepose/modeling/losses/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/projects/DensePose/densepose/modeling/losses/utils.py b/projects/DensePose/densepose/modeling/losses/utils.py
index f865798760..f4475820c5 100644
--- a/projects/DensePose/densepose/modeling/losses/utils.py
+++ b/projects/DensePose/densepose/modeling/losses/utils.py
@@ -225,6 +225,7 @@ def resample_data(
     grid_h = torch.arange(hout, device=z.device, dtype=torch.float) / hout
     grid_w_expanded = grid_w[None, None, :].expand(n, hout, wout)
     grid_h_expanded = grid_h[None, :, None].expand(n, hout, wout)
+    # pyre-fixme[16]: `float` has no attribute `__getitem__`.
     dx_expanded = (x1dst_norm - x0dst_norm)[:, None, None].expand(n, hout, wout)
     dy_expanded = (y1dst_norm - y0dst_norm)[:, None, None].expand(n, hout, wout)
     x0_expanded = x0dst_norm[:, None, None].expand(n, hout, wout)

From 5b72c27ae39f99db75d43f18fd1312e1ea934e60 Mon Sep 17 00:00:00 2001
From: Yanghan Wang <yanghan@meta.com>
Date: Thu, 22 Aug 2024 10:00:16 -0700
Subject: [PATCH 3/3] fix inference accuracy test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/facebookresearch/detectron2/pull/5348

Some accuracy tests started to fail between Jun 11 and Jun 17:
- ❌ mask_rcnn_R_50_FPN_inference_acc_test
- ✅ keypoint_rcnn_R_50_FPN_inference_acc_test
- ✅ fast_rcnn_R_50_FPN_inference_acc_test
- ❌ panoptic_fpn_R_50_inference_acc_test
- ✅ retinanet_R_50_FPN_inference_acc_test
- ❌ rpn_R_50_FPN_inference_acc_test
- ✅ semantic_R_50_FPN_inference_acc_test
- ❌ cascade_mask_rcnn_R_50_FPN_inference_acc_test

V1: update the yaml to reflect the new scores.
V5: it turns out that we can match the old scores by disabling tf32.

Reviewed By: balakv504

Differential Revision: D61301698

fbshipit-source-id: 60f17b03574fbde62c7a84f47bedff4fd040aaa8
---
 ...mask_rcnn_R_50_FPN_inference_acc_test.yaml |  1 +
 ...fast_rcnn_R_50_FPN_inference_acc_test.yaml |  1 +
 ...oint_rcnn_R_50_FPN_inference_acc_test.yaml |  1 +
 .../mask_rcnn_R_50_C4_inference_acc_test.yaml |  1 +
 ...mask_rcnn_R_50_DC5_inference_acc_test.yaml |  1 +
 ...mask_rcnn_R_50_FPN_inference_acc_test.yaml |  1 +
 .../panoptic_fpn_R_50_inference_acc_test.yaml |  1 +
 .../rpn_R_50_FPN_inference_acc_test.yaml      |  1 +
 detectron2/config/defaults.py                 |  4 +++
 detectron2/engine/defaults.py                 | 32 +++++++++++++++++++
 10 files changed, 44 insertions(+)

diff --git a/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml
index fc5a4116cb..b76788b6b4 100644
--- a/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml
+++ b/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -5,3 +5,4 @@ DATASETS:
   TEST: ("coco_2017_val_100",)
 TEST:
   EXPECTED_RESULTS: [["bbox", "AP", 50.18, 0.02], ["segm", "AP",  43.87, 0.02]]
+FLOAT32_PRECISION: "highest"
diff --git a/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml
index a2f37e5e2c..1be53eb7d7 100644
--- a/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml
+++ b/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -5,3 +5,4 @@ DATASETS:
   TEST: ("coco_2017_val_100",)
 TEST:
   EXPECTED_RESULTS: [["bbox", "AP", 45.70, 0.02]]
+FLOAT32_PRECISION: "highest"
diff --git a/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml
index 14cf2aa82a..df496c1f27 100644
--- a/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml
+++ b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -5,3 +5,4 @@ DATASETS:
   TEST: ("keypoints_coco_2017_val_100",)
 TEST:
   EXPECTED_RESULTS: [["bbox", "AP", 52.47, 0.02], ["keypoints", "AP", 67.36, 0.02]]
+FLOAT32_PRECISION: "highest"
diff --git a/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml
index b2d5b7ff87..5f18275274 100644
--- a/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml
+++ b/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml
@@ -5,3 +5,4 @@ DATASETS:
   TEST: ("coco_2017_val_100",)
 TEST:
   EXPECTED_RESULTS: [["bbox", "AP", 47.37, 0.02], ["segm", "AP", 40.99, 0.02]]
+FLOAT32_PRECISION: "highest"
diff --git a/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml
index e3ce6cf922..b72ffc9fe4 100644
--- a/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml
+++ b/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml
@@ -5,3 +5,4 @@ DATASETS:
   TEST: ("coco_2017_val_100",)
 TEST:
   EXPECTED_RESULTS: [["bbox", "AP", 47.44, 0.02], ["segm", "AP", 42.94, 0.02]]
+FLOAT32_PRECISION: "highest"
diff --git a/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml
index e5454bfd95..8cad72eb23 100644
--- a/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml
+++ b/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -8,3 +8,4 @@ TEST:
   AUG:
     ENABLED: True
     MIN_SIZES: (700, 800)  # to save some time
+FLOAT32_PRECISION: "highest"
diff --git a/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml b/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml
index 70874e3a92..f5429b6330 100644
--- a/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml
+++ b/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml
@@ -5,3 +5,4 @@ DATASETS:
   TEST: ("coco_2017_val_100_panoptic_separated",)
 TEST:
   EXPECTED_RESULTS: [["bbox", "AP", 46.47, 0.02], ["segm", "AP", 43.39, 0.02], ["sem_seg", "mIoU", 42.55, 0.02], ["panoptic_seg", "PQ", 38.99, 0.02]]
+FLOAT32_PRECISION: "highest"
diff --git a/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml
index c7c3f908a9..aa17e742d7 100644
--- a/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml
+++ b/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml
@@ -5,3 +5,4 @@ DATASETS:
   TEST: ("coco_2017_val_100",)
 TEST:
   EXPECTED_RESULTS: [["box_proposals", "AR@1000", 58.16, 0.02]]
+FLOAT32_PRECISION: "highest"
diff --git a/detectron2/config/defaults.py b/detectron2/config/defaults.py
index 5d97ec92d2..506651730e 100644
--- a/detectron2/config/defaults.py
+++ b/detectron2/config/defaults.py
@@ -636,6 +636,10 @@
 # for about 10k iterations. It usually hurts total time, but can benefit for certain models.
 # If input images have the same or similar sizes, benchmark is often helpful.
 _C.CUDNN_BENCHMARK = False
+# Option to set PyTorch matmul and cuDNN's float32 precision. When set to a non-empty string,
+# the corresponding precision ("highest", "high" or "medium") will be used. The highest
+# precision will effectively disable tf32.
+_C.FLOAT32_PRECISION = ""
 # The period (in terms of steps) for minibatch visualization at train time.
 # Set to 0 to disable.
 _C.VIS_PERIOD = 0
diff --git a/detectron2/engine/defaults.py b/detectron2/engine/defaults.py
index c649bf8ff7..3dbcd86b75 100644
--- a/detectron2/engine/defaults.py
+++ b/detectron2/engine/defaults.py
@@ -171,6 +171,30 @@ def _highlight(code, filename):
     return code
 
 
+# adapted from:
+# https://github.com/pytorch/tnt/blob/ebda066f8f55af6a906807d35bc829686618074d/torchtnt/utils/device.py#L328-L346
+def _set_float32_precision(precision: str = "high") -> None:
+    """Sets the precision of float32 matrix multiplications and convolution operations.
+
+    For more information, see the PyTorch docs:
+    - https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html
+    - https://pytorch.org/docs/stable/backends.html#torch.backends.cudnn.allow_tf32
+
+    Args:
+        precision: One of "highest", "high" or "medium"; determines which datatypes are
+            used for float32 matrix multiplication and convolution operations.
+    """
+    if not (torch.cuda.is_available()):  # Not relevant for non-CUDA devices
+        return
+    # set precision for matrix multiplications
+    torch.set_float32_matmul_precision(precision)
+    # set precision for convolution operations
+    if precision == "highest":
+        torch.backends.cudnn.allow_tf32 = False
+    else:
+        torch.backends.cudnn.allow_tf32 = True
+
+
 def default_setup(cfg, args):
     """
     Perform some basic common setups at the beginning of a job, including:
@@ -226,6 +250,14 @@ def default_setup(cfg, args):
             cfg, "CUDNN_BENCHMARK", "train.cudnn_benchmark", default=False
         )
 
+    fp32_precision = _try_get_key(cfg, "FLOAT32_PRECISION", "train.float32_precision", default="")
+    if fp32_precision != "":
+        logger.info(f"Set fp32 precision to {fp32_precision}")
+        _set_float32_precision(fp32_precision)
+        logger.info(f"{torch.get_float32_matmul_precision()=}")
+        logger.info(f"{torch.backends.cuda.matmul.allow_tf32=}")
+        logger.info(f"{torch.backends.cudnn.allow_tf32=}")
+
 
 def default_writers(output_dir: str, max_iter: Optional[int] = None):
     """