2024-08-06 nightly release (de300e0)
pytorchbot committed Aug 6, 2024
1 parent eccb62b commit 01d4038
Showing 128 changed files with 3,617 additions and 1,577 deletions.
68 changes: 48 additions & 20 deletions .github/workflows/android-perf.yml
@@ -38,26 +38,52 @@ concurrency:
permissions: read-all

jobs:
set-models:
set-parameters:
runs-on: linux.2xlarge
outputs:
models: ${{ steps.set-models.outputs.models }}
models: ${{ steps.set-parameters.outputs.models }}
devices: ${{ steps.set-parameters.outputs.devices }}
delegates: ${{ steps.set-parameters.outputs.delegates }}
steps:
- name: Set models
id: set-models
- name: Set parameters
id: set-parameters
shell: bash
run: |
set -ex
MODELS="${{ inputs.models }}"
DEVICES="${{ inputs.devices }}"
DELEGATES="${{ inputs.delegates }}"
# Mapping devices to their corresponding device-pool-arn
declare -A DEVICE_POOL_ARNS
DEVICE_POOL_ARNS[samsung_galaxy_s2x]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
# Resolve device names with their corresponding ARNs
if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")')
fi
declare -a MAPPED_ARNS=()
for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do
if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then
echo "Error: No ARN found for device '$DEVICE'. Abort." >&2
exit 1
fi
MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}")
done
echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .)
echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT
echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
export-models:
name: export-models
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: set-models
needs: set-parameters
strategy:
matrix:
model: ${{ fromJson(needs.set-models.outputs.models) }}
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
fail-fast: false
with:
runner: linux.2xlarge
@@ -72,32 +98,33 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
echo "Exporting model: ${{ matrix.model }}"
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
# TODO(T197546696): Note that the following scripts/steps only work for llama. It's expected to fail for other models+delegates.
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}.pt" "cmake" "fp32" "xnnpack+custom+qe" "${ARTIFACTS_DIR_NAME}"\
# Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat
# Upload models to S3. The artifacts are needed not only by the device farm but also TorchChat
upload-models:
needs: export-models
runs-on: linux.2xlarge
steps:
- name: Download the artifacts from GitHub
- name: Download the models from GitHub
uses: actions/download-artifact@v3
with:
# The name here needs to match the name of the upload-artifact parameter
name: android-models
path: ${{ runner.temp }}/artifacts/

- name: Verify the artifacts
- name: Verify the models
shell: bash
working-directory: ${{ runner.temp }}/artifacts/
run: |
ls -lah ./
- name: Upload the artifacts to S3
- name: Upload the models to S3
uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: gha-artifacts
@@ -110,7 +137,7 @@ jobs:
build-llm-demo:
name: build-llm-demo
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: set-models
needs: set-parameters
strategy:
matrix:
tokenizer: [bpe]
@@ -139,20 +166,20 @@ jobs:
needs: build-llm-demo
runs-on: linux.2xlarge
steps:
- name: Download the artifacts from GitHub
- name: Download the apps from GitHub
uses: actions/download-artifact@v3
with:
# The name here needs to match the name of the upload-artifact parameter
name: android-apps
path: ${{ runner.temp }}/artifacts/

- name: Verify the artifacts
- name: Verify the apps
shell: bash
working-directory: ${{ runner.temp }}/artifacts/
run: |
ls -lah ./
- name: Upload the artifacts to S3
- name: Upload the apps to S3
uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: gha-artifacts
@@ -169,20 +196,21 @@ jobs:
contents: read
uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
needs:
- set-models
- set-parameters
- upload-models
- upload-android-apps
strategy:
matrix:
model: ${{ fromJson(needs.set-models.outputs.models) }}
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
device: ${{ fromJson(needs.set-parameters.outputs.devices) }}
with:
device-type: android
runner: linux.2xlarge
test-infra-ref: ''
# This is the ARN of ExecuTorch project on AWS
project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
# This is the custom Android device pool that only includes Samsung Galaxy S2x
device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa
device-pool-arn: ${{ matrix.device }}
# Uploaded to S3 from the previous job, the name of the app comes from the project itself.
# Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer.
# It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only
@@ -193,4 +221,4 @@ jobs:
# The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml
test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/abd86868-fa63-467e-a5c7-218194665a77
# Uploaded to S3 from the previous job
extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}/model.zip
extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip
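The new set-parameters job accepts the requested devices either as a comma-separated string or as a JSON array, resolves each name to its AWS Device Farm device-pool ARN, and aborts on unknown names. A rough Python equivalent of that bash/jq logic, for illustration only (the device name and ARN are copied from the workflow above; everything else is a sketch, not part of this commit):

```python
import json

# Samsung Galaxy S2x device pool, as declared in android-perf.yml above.
DEVICE_POOL_ARNS = {
    "samsung_galaxy_s2x": (
        "arn:aws:devicefarm:us-west-2:308535385114:devicepool:"
        "02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
    ),
}

def resolve_devices(devices: str) -> list[str]:
    """Map device names (comma-separated or JSON array) to device-pool ARNs."""
    try:
        names = json.loads(devices)        # already a JSON array, e.g. '["samsung_galaxy_s2x"]'
    except json.JSONDecodeError:
        names = devices.split(",")         # plain comma-separated input
    arns = []
    for name in names:
        if name not in DEVICE_POOL_ARNS:
            raise ValueError(f"No ARN found for device '{name}'. Abort.")
        arns.append(DEVICE_POOL_ARNS[name])
    return arns

print(json.dumps(resolve_devices("samsung_galaxy_s2x")))  # -> ["arn:aws:devicefarm:..."]
```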
65 changes: 58 additions & 7 deletions backends/cadence/aot/compiler.py
@@ -7,6 +7,7 @@
# pyre-strict

import logging
from typing import Optional

import torch

@@ -36,16 +37,24 @@
from torch.export.exported_program import ExportedProgram


def quantize_pt2(
# Note: this is not meant as a primary API since it can create inconsistencies
# if the quantizer here is different from the quantizer used to convert. It is
# however useful for unit tests to separate the converted model from the fused
# model, to be able to get reference numerics.
# If this does not apply, please use quantize_and_fuse_pt2 instead.
def convert_pt2(
model: torch.nn.Module,
inputs: tuple[object, ...],
quantizer: CadenceQuantizer,
) -> torch.fx.GraphModule:
"""
Instantiate the CadenceQuantizer (PTQ), prepare, convert and fuse the model.
Returns a GraphModule with the quantized model.
Prepare and convert a model using the given quantizer.
The quantizer must be supplied and be the same as the one used to
fuse the model later, if applicable. If you do not expect that behavior,
please use quantize_and_fuse_pt2 instead, which will instantiate a
default quantizer for you if needed.
Returns a GraphModule with the converted model.
"""
# Quantizer
quantizer = CadenceQuantizer()

# Export with dynamo
model_exp = capture_pre_autograd_graph(model, inputs)
@@ -62,12 +71,54 @@ def quantize_pt2(
# Convert
converted_model = convert_pt2e(prepared_model)

return converted_model


# Note: this is not meant as a primary API since it can create inconsistencies
# if the quantizer here is different from the quantizer used to convert. It is
# however useful for unit tests to separate the converted model from the fused
# model, to be able to get reference numerics.
# If this does not apply, please use quantize_and_fuse_pt2 instead.
def fuse_pt2(
converted_graph_module: torch.fx.GraphModule,
quantizer: CadenceQuantizer,
) -> torch.fx.GraphModule:
"""
Fuse a converted graph module using the given quantizer.
The quantizer must be the same as the one used to convert the model.
If you do not expect that behavior, please use quantize_and_fuse_pt2 instead,
which will instantiate a default quantizer for you if needed.
Returns a GraphModule with the fused model.
"""
# Get patterns and apply fusion of dq -> op -> q to qop
# pyre-ignore[16]: no attribute
patterns = [q.pattern for q in quantizer.quantizers]
QuantFusion(patterns)(converted_model)
QuantFusion(patterns)(converted_graph_module)

return converted_model
return converted_graph_module


# Note: this is the one-liner API to quantize and fuse a model.
def quantize_pt2(
model: torch.nn.Module,
inputs: tuple[object, ...],
quantizer: Optional[CadenceQuantizer] = None,
) -> torch.fx.GraphModule:
"""
Prepare, convert and fuse the model using the given quantizer.
Returns a GraphModule with the quantized model.
"""
# Quantizer
if not quantizer:
quantizer = CadenceQuantizer()

# Get converted graph module
converted_gm = convert_pt2(model, inputs, quantizer)

# Get fused model
fused_gm = fuse_pt2(converted_gm, quantizer)

return fused_gm


# Export the model and lower it to an ExportedProgram (in aten IR)
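The compiler refactor splits the old quantize_pt2 into convert_pt2 (export, prepare, convert) and fuse_pt2 (dq -> op -> q fusion), keeping quantize_pt2 as the one-liner that chains both and falls back to a default CadenceQuantizer when none is supplied. A hedged usage sketch; the toy model, example inputs, and import paths are assumptions, not part of this commit:

```python
import torch
# Assumed import paths; adjust to the actual executorch package layout.
from executorch.backends.cadence.aot.compiler import convert_pt2, fuse_pt2, quantize_pt2
from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer

model = torch.nn.Linear(16, 8).eval()   # placeholder model
inputs = (torch.randn(1, 16),)          # placeholder example inputs

# One-liner: prepare, convert, and fuse with a default quantizer.
fused_gm = quantize_pt2(model, inputs)

# Two-step path (mainly for tests): keep the converted model around for reference
# numerics, then fuse with the *same* quantizer instance, as the docstrings require.
quantizer = CadenceQuantizer()
converted_gm = convert_pt2(model, inputs, quantizer)
fused_gm = fuse_pt2(converted_gm, quantizer)
```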
2 changes: 1 addition & 1 deletion backends/cadence/aot/functions.yaml
@@ -145,7 +145,7 @@
- arg_meta: null
kernel_name: impl::reference::quantized_linear_out

- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor(a!)
- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: impl::reference::quantized_relu_out
9 changes: 7 additions & 2 deletions backends/cadence/aot/ops_registrations.py
@@ -43,9 +43,11 @@
"quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
)

lib.define("quantized_relu(Tensor X, Tensor X_zero_point) -> (Tensor Y)")
lib.define(
"quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
"quantized_relu(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Y)"
)
lib.define(
"quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor (a!)"
)

lib.define(
@@ -168,6 +170,9 @@ def quantized_layer_norm_meta(
def quantized_relu_meta(
X: torch.Tensor,
X_zero_point: torch.Tensor,
out_zero_point: int,
out_multiplier: torch.Tensor,
out_shift: torch.Tensor,
):
return X.new_empty(X.size(), dtype=torch.uint8)

22 changes: 22 additions & 0 deletions backends/cadence/aot/quantizer/fusion_pass.py
@@ -287,7 +287,15 @@ def get_args_and_kwargs_relu(
graph_module: GraphModule,
inputs_inputs: List[fx.Node],
dequants_inputs: List[fx.Node],
quant_node: fx.Node,
) -> Tuple[Tuple[ArgsType], Dict[str, ArgsType]]:
input_scale = dequants_inputs[0].args[1]
# pyre-fixme[58]: Unsupported operand types
requantize_scale = input_scale / quant_node.args[1]
requantize_scale_t = torch.tensor([requantize_scale])

(out_multiplier, out_shift) = quantize_tensor_multiplier(requantize_scale_t)

# Make the args and kwargs for the replacement op
args = tuple(inputs_inputs)

@@ -296,9 +304,22 @@ def get_args_and_kwargs_relu(
([1], dequants_inputs[0].args[2]),
{"dtype": torch.int32},
)
out_multiplier_ = graph_module.graph.call_function(
torch.ops.aten.full.default,
([1], out_multiplier[0].item()),
{"dtype": torch.int32},
)
out_shift_ = graph_module.graph.call_function(
torch.ops.aten.full.default,
([1], out_shift[0].item()),
{"dtype": torch.int32},
)

kwargs = {
"X_zero_point": X_zero_point,
"out_zero_point": quant_node.args[2],
"out_multiplier": out_multiplier_,
"out_shift": out_shift_,
}
return args, kwargs

@@ -420,6 +441,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
graph_module,
inputs_inputs,
dequants_inputs,
quant_node,
)
fused = graph_module.graph.call_function(
pattern.replacement_op(),
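The fused quantized_relu now carries explicit requantization parameters: the pass computes requantize_scale = input_scale / output_scale from the surrounding dequantize/quantize nodes and converts it into an integer (out_multiplier, out_shift) pair via quantize_tensor_multiplier. A small sketch of the conventional q31 fixed-point decomposition that such helpers typically implement (the rounding details of the actual helper may differ):

```python
import math

def decompose_scale(scale: float) -> tuple[int, int]:
    """Return (multiplier, shift) with scale ~= multiplier * 2**-31 * 2**shift,
    where multiplier is a q31 value in [2**30, 2**31)."""
    if scale == 0.0:
        return 0, 0
    mantissa, shift = math.frexp(scale)        # scale = mantissa * 2**shift, mantissa in [0.5, 1)
    multiplier = round(mantissa * (1 << 31))   # q31 fixed-point mantissa
    if multiplier == (1 << 31):                # rounding pushed the mantissa up to 1.0
        multiplier //= 2
        shift += 1
    return multiplier, shift

# Example: input_scale = 0.05, output (quant) scale = 0.1 -> requantize_scale = 0.5
out_multiplier, out_shift = decompose_scale(0.05 / 0.1)
print(out_multiplier, out_shift)               # 1073741824 0  (i.e. 2**30 * 2**-31 = 0.5)
```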
4 changes: 1 addition & 3 deletions backends/cadence/aot/quantizer/patterns.py
@@ -303,9 +303,7 @@ def get_anchors(
inputs=[(relu_node, 0)],
weights=[],
biases=[],
output=[
(relu_node, SharedQuantizationSpec((relu_node.args[0], relu_node)))
],
output=[(relu_node,)],
)

def replacement_op(self) -> OpOverload:
