Conformance: nncf.quantize_pt2e and OpenVINOQuantize support

openvinotoolkit · Dec 2, 2024 · 9e90443 · 9e90443
1 parent 500e549
commit 9e90443
Show file tree

Hide file tree

Showing 7 changed files with 190 additions and 19 deletions.
diff --git a/tests/post_training/conftest.py b/tests/post_training/conftest.py
@@ -19,6 +19,11 @@ def pytest_addoption(parser):
     parser.addoption("--fp32", action="store_true", help="Test original model")
     parser.addoption("--cuda", action="store_true", help="Enable CUDA_TORCH backend")
     parser.addoption("--benchmark", action="store_true", help="Run benchmark_app")
+    parser.addoption(
+        "--validate-in-backend",
+        action="store_true",
+        help="Validate quantized model in native backend, not in openvino.",
+    )
     parser.addoption(
         "--extra-columns",
         action="store_true",

diff --git a/tests/post_training/data/ptq_reference_data.yaml b/tests/post_training/data/ptq_reference_data.yaml
@@ -38,6 +38,14 @@ torchvision/resnet18_backend_CUDA_TORCH:
   metric_value: 0.69152
 torchvision/resnet18_backend_FX_TORCH:
   metric_value: 0.6946
+torchvision/resnet18_backend_OV_QUANTIZER_NNCF:
+  metric_value: 0.6946
+torchvision/resnet18_backend_OV_QUANTIZER_AO:
+  metric_value: 0.6946
+torchvision/resnet18_backend_X86_QUANTIZER_NNCF:
+  metric_value: 0.6946
+torchvision/resnet18_backend_X86_QUANTIZER_AO:
+  metric_value: 0.6946
 torchvision/mobilenet_v3_small_BC_backend_FP32:
   metric_value: 0.6766
 torchvision/mobilenet_v3_small_BC_backend_OV:
@@ -46,18 +54,42 @@ torchvision/mobilenet_v3_small_BC_backend_ONNX:
   metric_value: 0.6679
 torchvision/mobilenet_v3_small_BC_backend_FX_TORCH:
   metric_value: 0.6679
+torchvision/mobilenet_v3_small_BC_backend_OV_QUANTIZER_NNCF:
+  metric_value: 0.6679
+torchvision/mobilenet_v3_small_BC_backend_OV_QUANTIZER_AO:
+  metric_value: 0.6679
+torchvision/mobilenet_v3_small_BC_backend_X86_QUANTIZER_NNCF:
+  metric_value: 0.6679
+torchvision/mobilenet_v3_small_BC_backend_X86_QUANTIZER_AO:
+  metric_value: 0.6679
 torchvision/vit_b_16_backend_FP32:
   metric_value: 0.8107
 torchvision/vit_b_16_backend_OV:
   metric_value: 0.80948
 torchvision/vit_b_16_backend_FX_TORCH:
   metric_value: 0.80922
+torchvision/vit_b_16_backend_OV_QUANTIZER_NNCF:
+  metric_value: 0.80922
+torchvision/vit_b_16_backend_OV_QUANTIZER_AO:
+  metric_value: 0.80922
+torchvision/vit_b_16_backend_X86_QUANTIZER_NNCF:
+  metric_value: 0.80922
+torchvision/vit_b_16_backend_X86_QUANTIZER_AO:
+  metric_value: 0.80922
 torchvision/swin_v2_s_backend_FP32:
   metric_value: 0.83712
 torchvision/swin_v2_s_backend_OV:
   metric_value: 0.83638
 torchvision/swin_v2_s_backend_FX_TORCH:
   metric_value: 0.8360
+torchvision/swin_v2_s_backend_OV_QUANTIZER_NNCF:
+  metric_value: 0.8360
+torchvision/swin_v2_s_backend_OV_QUANTIZER_AO:
+  metric_value: 0.8360
+torchvision/swin_v2_s_backend_X86_QUANTIZER_NNCF:
+  metric_value: 0.8360
+torchvision/swin_v2_s_backend_X86_QUANTIZER_AO:
+  metric_value: 0.8360
 timm/crossvit_9_240_backend_CUDA_TORCH:
   metric_value: 0.7275
 timm/crossvit_9_240_backend_FP32:

diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py
@@ -24,6 +24,7 @@
 from nncf.quantization.advanced_parameters import AdvancedScaleEstimationParameters
 from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters
 from tests.post_training.pipelines.base import ALL_PTQ_BACKENDS
+from tests.post_training.pipelines.base import FX_BACKENDS
 from tests.post_training.pipelines.base import NNCF_PTQ_BACKENDS
 from tests.post_training.pipelines.base import BackendType
 from tests.post_training.pipelines.causal_language_model import CausalLMHF
@@ -87,7 +88,7 @@
         "model_id": "resnet18",
         "pipeline_cls": ImageClassificationTorchvision,
         "compression_params": {},
-        "backends": [BackendType.FX_TORCH, BackendType.TORCH, BackendType.CUDA_TORCH, BackendType.OV, BackendType.ONNX],
+        "backends": FX_BACKENDS + [BackendType.TORCH, BackendType.CUDA_TORCH, BackendType.OV, BackendType.ONNX],
         "batch_size": 128,
     },
     {
@@ -98,7 +99,7 @@
             "fast_bias_correction": False,
             "preset": QuantizationPreset.MIXED,
         },
-        "backends": [BackendType.FX_TORCH, BackendType.OV, BackendType.ONNX],
+        "backends": FX_BACKENDS + [BackendType.OV, BackendType.ONNX],
         "batch_size": 128,
     },
     {
@@ -109,7 +110,7 @@
             "model_type": ModelType.TRANSFORMER,
             "advanced_parameters": AdvancedQuantizationParameters(smooth_quant_alpha=0.15),
         },
-        "backends": [BackendType.FX_TORCH, BackendType.OV],
+        "backends": FX_BACKENDS + [BackendType.OV],
         "batch_size": 1,
     },
     {
@@ -120,7 +121,7 @@
             "model_type": ModelType.TRANSFORMER,
             "advanced_parameters": AdvancedQuantizationParameters(smooth_quant_alpha=0.5),
         },
-        "backends": [BackendType.FX_TORCH, BackendType.OV],
+        "backends": FX_BACKENDS + [BackendType.OV],
         "batch_size": 1,
     },
     # Timm models

diff --git a/tests/post_training/pipelines/base.py b/tests/post_training/pipelines/base.py
@@ -44,6 +44,10 @@ class BackendType(Enum):
     TORCH = "TORCH"
     CUDA_TORCH = "CUDA_TORCH"
     FX_TORCH = "FX_TORCH"
+    OV_QUANTIZER_NNCF = "OV_QUANTIZER_NNCF"
+    OV_QUANTIZER_AO = "OV_QUANTIZER_AO"
+    X86_QUANTIZER_NNCF = "X86_QUANTIZER_NNCF"
+    X86_QUANTIZER_AO = "X86_QUANTIZER_AO"
     ONNX = "ONNX"
     OV = "OV"
     OPTIMUM = "OPTIMUM"
@@ -52,6 +56,13 @@ class BackendType(Enum):
 NNCF_PTQ_BACKENDS = [BackendType.TORCH, BackendType.CUDA_TORCH, BackendType.ONNX, BackendType.OV]
 ALL_PTQ_BACKENDS = NNCF_PTQ_BACKENDS
 PT_BACKENDS = [BackendType.TORCH, BackendType.CUDA_TORCH]
+FX_BACKENDS = [
+    BackendType.FX_TORCH,
+    BackendType.OV_QUANTIZER_NNCF,
+    BackendType.OV_QUANTIZER_AO,
+    BackendType.X86_QUANTIZER_NNCF,
+    BackendType.X86_QUANTIZER_AO,
+]
 OV_BACKENDS = [BackendType.OV, BackendType.OPTIMUM]
 
 LIMIT_LENGTH_OF_STATUS = 120
@@ -211,6 +222,7 @@ def __init__(
         reference_data: dict,
         no_eval: bool,
         run_benchmark_app: bool,
+        validate_in_backend: bool = False,
         params: dict = None,
         batch_size: int = 1,
         memory_monitor: bool = False,
@@ -227,6 +239,7 @@ def __init__(
         self.memory_monitor = memory_monitor
         self.no_eval = no_eval
         self.run_benchmark_app = run_benchmark_app
+        self.validate_in_backend = validate_in_backend
         self.output_model_dir: Path = self.output_dir / self.reported_name / self.backend.value
         self.output_model_dir.mkdir(parents=True, exist_ok=True)
         self.model_name = f"{self.reported_name}_{self.backend.value}"
@@ -405,8 +418,8 @@ def save_compressed_model(self) -> None:
             )
             self.path_compressed_ir = self.output_model_dir / "model.xml"
             ov.serialize(ov_model, self.path_compressed_ir)
-        elif self.backend == BackendType.FX_TORCH:
-            exported_model = torch.export.export(self.compressed_model, (self.dummy_tensor,))
+        elif self.backend in FX_BACKENDS:
+            exported_model = torch.export.export(self.model, (self.dummy_tensor,))
             ov_model = ov.convert_model(exported_model, example_input=self.dummy_tensor.cpu(), input=self.input_size)
             self.path_compressed_ir = self.output_model_dir / "model.xml"
             ov.serialize(ov_model, self.path_compressed_ir)

diff --git a/tests/post_training/pipelines/image_classification_base.py b/tests/post_training/pipelines/image_classification_base.py
@@ -12,15 +12,27 @@
 import copy
 import os
 
+os.environ["TORCHINDUCTOR_FREEZING"] = "1"
+
+from itertools import islice
+
 import numpy as np
 import openvino as ov
 import torch
 from sklearn.metrics import accuracy_score
+from torch.ao.quantization.quantize_pt2e import convert_pt2e
+from torch.ao.quantization.quantize_pt2e import prepare_pt2e
+from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
+from torch.ao.quantization.quantizer.x86_inductor_quantizer import get_default_x86_inductor_quantization_config
 from torchvision import datasets
 
 import nncf
 from nncf.common.logging.track_progress import track
+from nncf.experimental.common.quantization.algorithms.quantizer.openvino_quantizer import OpenVINOQuantizer
+from nncf.experimental.torch.fx.quantization.quantize_pt2e import quantize_pt2e
 from tests.post_training.pipelines.base import DEFAULT_VAL_THREADS
+from tests.post_training.pipelines.base import FX_BACKENDS
+from tests.post_training.pipelines.base import BackendType
 from tests.post_training.pipelines.base import PTQTestPipeline
 
 
@@ -33,18 +45,15 @@ def prepare_calibration_dataset(self):
 
         self.calibration_dataset = nncf.Dataset(loader, self.get_transform_calibration_fn())
 
-    def _validate(self):
-        val_dataset = datasets.ImageFolder(root=self.data_dir / "imagenet" / "val", transform=self.transform)
-        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, num_workers=2, shuffle=False)
-
-        dataset_size = len(val_loader)
-
-        # Initialize result tensors for async inference support.
-        predictions = np.zeros((dataset_size))
-        references = -1 * np.ones((dataset_size))
+    def _validate_ov(
+        self,
+        val_loader: torch.utils.data.DataLoader,
+        predictions: np.ndarray,
+        references: np.ndarray,
+        dataset_size: int,
+    ):
 
         core = ov.Core()
-
         if os.environ.get("INFERENCE_NUM_THREADS"):
             # Set CPU_THREADS_NUM for OpenVINO inference
             inference_num_threads = os.environ.get("INFERENCE_NUM_THREADS")
@@ -73,8 +82,111 @@ def process_result(request, userdata):
                 references[i] = target
 
             infer_queue.wait_all()
+        return predictions, references
+
+    def _validate_torch_compile(
+        self, val_loader: torch.utils.data.DataLoader, predictions: np.ndarray, references: np.ndarray
+    ):
+        # compiled_model = torch.compile(self.compressed_model, backend="openvino")
+        q_num = 0
+        for node in self.compressed_model.graph.nodes:
+            if ".quantize_per" in str(node.target):
+                q_num += 1
+
+        print(f"Qunatize ops num: {q_num}")
+
+        if self.backend in [BackendType.X86_QUANTIZER_AO, BackendType.X86_QUANTIZER_NNCF]:
+            compiled_model = torch.compile(self.compressed_model)
+        else:
+            compiled_model = torch.compile(self.compressed_model, backend="openvino")
+
+        for i, (images, target) in enumerate(val_loader):
+            # W/A for memory leaks when using torch DataLoader and OpenVINO
+            pred = compiled_model(images)
+            pred = torch.argmax(pred, dim=1)
+            predictions[i] = pred.numpy()
+            references[i] = target.numpy()
+        return predictions, references
+
+    def _validate(self):
+        val_dataset = datasets.ImageFolder(root=self.data_dir / "imagenet" / "val", transform=self.transform)
+        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, num_workers=2, shuffle=False)
+
+        dataset_size = len(val_loader)
+
+        # Initialize result tensors for async inference support.
+        predictions = np.zeros((dataset_size))
+        references = -1 * np.ones((dataset_size))
+
+        if self.backend in FX_BACKENDS:
+            predictions, references = self._validate_torch_compile(val_loader, predictions, references)
+        else:
+            predictions, references = self._validate_ov(val_loader, predictions, references, dataset_size)
 
         acc_top1 = accuracy_score(predictions, references)
 
         self.run_info.metric_name = "Acc@1"
         self.run_info.metric_value = acc_top1
+
+    def _compress_torch_ao(self, quantizer):
+
+        prepared_model = prepare_pt2e(self.model, quantizer)
+        subset_size = self.compression_params.get("subset_size", 300)
+        for data in islice(self.calibration_dataset.get_inference_data(), subset_size):
+            prepared_model(data)
+        self.compressed_model = convert_pt2e(prepared_model)
+
+    def _compress_nncf_pt2e(self, quantizer):
+        pt2e_kwargs = {}
+        for key in (
+            "subset_size",
+            "fast_bias_correction",
+            "smooth_quant",
+            "bias_correction_params",
+            "smooth_quant_params",
+            "activations_range_estimator_params",
+            "weights_range_estimator_params",
+        ):
+            if key in self.compression_params:
+                pt2e_kwargs[key] = self.compression_params[key]
+        smooth_quant = False
+        if self.compression_params.get("model_type", False):
+            smooth_quant = self.compression_params["model_type"] == nncf.ModelType.TRANSFORMER
+        self.compressed_model = quantize_pt2e(
+            self.model, quantizer, self.calibration_dataset, smooth_quant=smooth_quant, fold_quantize=False
+        )
+
+    def _compress(self):
+        """
+        Quantize self.model
+        """
+        if self.backend not in FX_BACKENDS or self.backend == BackendType.FX_TORCH:
+            super()._compress()
+            return
+
+        if self.backend in [BackendType.OV_QUANTIZER_AO, BackendType.OV_QUANTIZER_NNCF]:
+            quantizer_kwargs = {}
+            for key in (
+                "mode",
+                "preset",
+                "target_device",
+                "model_type",
+                "ignored_scope",
+                "overflow_fix",
+                "quantize_outputs",
+                "activations_quantization_params",
+                "weights_quantization_params",
+                "quantizer_propagation_rule",
+            ):
+                if key in self.compression_params:
+                    quantizer_kwargs[key] = self.compression_params[key]
+            quantizer = OpenVINOQuantizer(**quantizer_kwargs)
+        else:
+
+            quantizer = X86InductorQuantizer()
+            quantizer.set_global(get_default_x86_inductor_quantization_config())
+
+        if self.backend in [BackendType.OV_QUANTIZER_NNCF, BackendType.X86_QUANTIZER_NNCF]:
+            self._compress_nncf_pt2e(quantizer)
+        else:
+            self._compress_torch_ao(quantizer)
diff --git a/tests/post_training/pipelines/image_classification_torchvision.py b/tests/post_training/pipelines/image_classification_torchvision.py
@@ -20,6 +20,7 @@
 from torchvision import models
 
 from nncf.torch import disable_patching
+from tests.post_training.pipelines.base import FX_BACKENDS
 from tests.post_training.pipelines.base import PT_BACKENDS
 from tests.post_training.pipelines.base import BackendType
 from tests.post_training.pipelines.image_classification_base import ImageClassificationBase
@@ -75,7 +76,7 @@ def prepare_model(self) -> None:
         if self.batch_size > 1:  # Dynamic batch_size shape export
             self.input_size[0] = -1
 
-        if self.backend == BackendType.FX_TORCH:
+        if self.backend in FX_BACKENDS:
             with torch.no_grad():
                 with disable_patching():
                     self.model = self.model_params.export_fn(model, (self.dummy_tensor,))
@@ -121,7 +122,7 @@ def _dump_model_fp32(self) -> None:
                 )
             ov.serialize(ov_model, self.fp32_model_dir / "model_fp32.xml")
 
-        if self.backend == BackendType.FX_TORCH:
+        if self.backend in FX_BACKENDS:
             exported_model = torch.export.export(self.model, (self.dummy_tensor,))
             ov_model = ov.convert_model(exported_model, example_input=self.dummy_tensor, input=self.input_size)
             ov.serialize(ov_model, self.fp32_model_dir / "fx_model_fp32.xml")
@@ -133,7 +134,7 @@ def prepare_preprocessor(self) -> None:
         self.transform = self.model_params.weights.transforms()
 
     def get_transform_calibration_fn(self):
-        if self.backend in [BackendType.FX_TORCH] + PT_BACKENDS:
+        if self.backend in FX_BACKENDS + PT_BACKENDS:
             device = torch.device("cuda" if self.backend == BackendType.CUDA_TORCH else "cpu")
 
             def transform_fn(data_item):

diff --git a/tests/post_training/test_quantize_conformance.py b/tests/post_training/test_quantize_conformance.py
@@ -75,6 +75,11 @@ def fixture_run_benchmark_app(pytestconfig):
     return pytestconfig.getoption("benchmark")
 
 
+@pytest.fixture(scope="session", name="validate_in_backend")
+def fixture_validate_in_backend(pytestconfig):
+    return pytestconfig.getoption("validate_in_backend")
+
+
 @pytest.fixture(scope="session", name="extra_columns")
 def fixture_extra_columns(pytestconfig):
     return pytestconfig.getoption("extra_columns")
@@ -266,6 +271,7 @@ def test_ptq_quantization(
     run_torch_cuda_backend: bool,
     subset_size: Optional[int],
     run_benchmark_app: bool,
+    validate_in_backend: bool,
     capsys: pytest.CaptureFixture,
     extra_columns: bool,
     memory_monitor: bool,
@@ -293,6 +299,7 @@ def test_ptq_quantization(
                 "data_dir": data_dir,
                 "no_eval": no_eval,
                 "run_benchmark_app": run_benchmark_app,
+                "validate_in_backend": validate_in_backend,
                 "batch_size": batch_size,
                 "memory_monitor": memory_monitor,
             }