2024-08-06 nightly release (de300e0)
pytorchbot committed Aug 6, 2024
1 parent eccb62b commit 01d4038
Showing 128 changed files with 3,617 additions and 1,577 deletions.
68 changes: 48 additions & 20 deletions .github/workflows/android-perf.yml
@@ -38,26 +38,52 @@ concurrency:
permissions: read-all

jobs:
set-models:
set-parameters:
runs-on: linux.2xlarge
outputs:
models: ${{ steps.set-models.outputs.models }}
models: ${{ steps.set-parameters.outputs.models }}
devices: ${{ steps.set-parameters.outputs.devices }}
delegates: ${{ steps.set-parameters.outputs.delegates }}
steps:
- name: Set models
id: set-models
- name: Set parameters
id: set-parameters
shell: bash
run: |
set -ex
MODELS="${{ inputs.models }}"
DEVICES="${{ inputs.devices }}"
DELEGATES="${{ inputs.delegates }}"
# Mapping devices to their corresponding device-pool-arn
declare -A DEVICE_POOL_ARNS
DEVICE_POOL_ARNS[samsung_galaxy_s2x]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
# Resolve device names with their corresponding ARNs
if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")')
fi
declare -a MAPPED_ARNS=()
for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do
if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then
echo "Error: No ARN found for device '$DEVICE'. Abort." >&2
exit 1
fi
MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}")
done
echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .)
echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT
echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
export-models:
name: export-models
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: set-models
needs: set-parameters
strategy:
matrix:
model: ${{ fromJson(needs.set-models.outputs.models) }}
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
fail-fast: false
with:
runner: linux.2xlarge
@@ -72,32 +98,33 @@ jobs:
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
echo "Exporting model: ${{ matrix.model }}"
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
# TODO(T197546696): Note that the following scripts/steps only work for llama. It's expected to fail for other models+delegates.
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}.pt" "cmake" "fp32" "xnnpack+custom+qe" "${ARTIFACTS_DIR_NAME}"\
# Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat
# Upload models to S3. The artifacts are needed not only by the device farm but also TorchChat
upload-models:
needs: export-models
runs-on: linux.2xlarge
steps:
- name: Download the artifacts from GitHub
- name: Download the models from GitHub
uses: actions/download-artifact@v3
with:
# The name here needs to match the name of the upload-artifact parameter
name: android-models
path: ${{ runner.temp }}/artifacts/

- name: Verify the artifacts
- name: Verify the models
shell: bash
working-directory: ${{ runner.temp }}/artifacts/
run: |
ls -lah ./
- name: Upload the artifacts to S3
- name: Upload the models to S3
uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: gha-artifacts
@@ -110,7 +137,7 @@ jobs:
build-llm-demo:
name: build-llm-demo
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: set-models
needs: set-parameters
strategy:
matrix:
tokenizer: [bpe]
@@ -139,20 +166,20 @@ jobs:
needs: build-llm-demo
runs-on: linux.2xlarge
steps:
- name: Download the artifacts from GitHub
- name: Download the apps from GitHub
uses: actions/download-artifact@v3
with:
# The name here needs to match the name of the upload-artifact parameter
name: android-apps
path: ${{ runner.temp }}/artifacts/

- name: Verify the artifacts
- name: Verify the apps
shell: bash
working-directory: ${{ runner.temp }}/artifacts/
run: |
ls -lah ./
- name: Upload the artifacts to S3
- name: Upload the apps to S3
uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: gha-artifacts
@@ -169,20 +196,21 @@ jobs:
contents: read
uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
needs:
- set-models
- set-parameters
- upload-models
- upload-android-apps
strategy:
matrix:
model: ${{ fromJson(needs.set-models.outputs.models) }}
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
device: ${{ fromJson(needs.set-parameters.outputs.devices) }}
with:
device-type: android
runner: linux.2xlarge
test-infra-ref: ''
# This is the ARN of ExecuTorch project on AWS
project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
# This is the custom Android device pool that only includes Samsung Galaxy S2x
device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa
device-pool-arn: ${{ matrix.device }}
# Uploaded to S3 from the previous job, the name of the app comes from the project itself.
# Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer.
# It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only
@@ -193,4 +221,4 @@ jobs:
# The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml
test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/abd86868-fa63-467e-a5c7-218194665a77
# Uploaded to S3 from the previous job
extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}/model.zip
extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip
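The new set-parameters job accepts the requested devices either as a comma-separated string or as a JSON array, resolves each name to its AWS Device Farm device-pool ARN, and aborts on unknown names. A rough Python equivalent of that bash/jq logic, for illustration only (the device name and ARN are copied from the workflow above; everything else is a sketch, not part of this commit):

```python
import json

# Samsung Galaxy S2x device pool, as declared in android-perf.yml above.
DEVICE_POOL_ARNS = {
    "samsung_galaxy_s2x": (
        "arn:aws:devicefarm:us-west-2:308535385114:devicepool:"
        "02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
    ),
}

def resolve_devices(devices: str) -> list[str]:
    """Map device names (comma-separated or JSON array) to device-pool ARNs."""
    try:
        names = json.loads(devices)        # already a JSON array, e.g. '["samsung_galaxy_s2x"]'
    except json.JSONDecodeError:
        names = devices.split(",")         # plain comma-separated input
    arns = []
    for name in names:
        if name not in DEVICE_POOL_ARNS:
            raise ValueError(f"No ARN found for device '{name}'. Abort.")
        arns.append(DEVICE_POOL_ARNS[name])
    return arns

print(json.dumps(resolve_devices("samsung_galaxy_s2x")))  # -> ["arn:aws:devicefarm:..."]
```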
65 changes: 58 additions & 7 deletions backends/cadence/aot/compiler.py
@@ -7,6 +7,7 @@
# pyre-strict

import logging
from typing import Optional

import torch

@@ -36,16 +37,24 @@
from torch.export.exported_program import ExportedProgram


def quantize_pt2(
# Note: this is not meant as a primary API since it can create inconsistencies
# if the quantizer here is different from the quantizer used to convert. It is
# however useful for unit tests to separate the converted model from the fused
# model, to be able to get reference numerics.
# If this does not apply, please use quantize_and_fuse_pt2 instead.
def convert_pt2(
model: torch.nn.Module,
inputs: tuple[object, ...],
quantizer: CadenceQuantizer,
) -> torch.fx.GraphModule:
"""
Instantiate the CadenceQuantizer (PTQ), prepare, convert and fuse the model.
Returns a GraphModule with the quantized model.
Prepare and convert a model using the given quantizer.
The quantizer must be supplied and be the same as the one used to
fuse the model later, if applicable. If you do not expect that behavior,
please use quantize_and_fuse_pt2 instead, which will instantiate a
default quantizer for you if needed.
Returns a GraphModule with the converted model.
"""
# Quantizer
quantizer = CadenceQuantizer()

# Export with dynamo
model_exp = capture_pre_autograd_graph(model, inputs)
@@ -62,12 +71,54 @@ def quantize_pt2(
# Convert
converted_model = convert_pt2e(prepared_model)

return converted_model


# Note: this is not meant as a primary API since it can create inconsistencies
# if the quantizer here is different from the quantizer used to convert. It is
# however useful for unit tests to separate the converted model from the fused
# model, to be able to get reference numerics.
# If this does not apply, please use quantize_and_fuse_pt2 instead.
def fuse_pt2(
converted_graph_module: torch.fx.GraphModule,
quantizer: CadenceQuantizer,
) -> torch.fx.GraphModule:
"""
Fuse a converted graph module using the given quantizer.
The quantizer must be the same as the one used to convert the model.
If you do not expect that behavior, please use quantize_and_fuse_pt2 instead,
which will instantiate a default quantizer for you if needed.
Returns a GraphModule with the fused model.
"""
# Get patterns and apply fusion of dq -> op -> q to qop
# pyre-ignore[16]: no attribute
patterns = [q.pattern for q in quantizer.quantizers]
QuantFusion(patterns)(converted_model)
QuantFusion(patterns)(converted_graph_module)

return converted_model
return converted_graph_module


# Note: this is the one-liner API to quantize and fuse a model.
def quantize_pt2(
model: torch.nn.Module,
inputs: tuple[object, ...],
quantizer: Optional[CadenceQuantizer] = None,
) -> torch.fx.GraphModule:
"""
Prepare, convert and fuse the model using the given quantizer.
Returns a GraphModule with the quantized model.
"""
# Quantizer
if not quantizer:
quantizer = CadenceQuantizer()

# Get converted graph module
converted_gm = convert_pt2(model, inputs, quantizer)

# Get fused model
fused_gm = fuse_pt2(converted_gm, quantizer)

return fused_gm


# Export the model and lower it to an ExportedProgram (in aten IR)
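The compiler refactor splits the old quantize_pt2 into convert_pt2 (export, prepare, convert) and fuse_pt2 (dq -> op -> q fusion), keeping quantize_pt2 as the one-liner that chains both and falls back to a default CadenceQuantizer when none is supplied. A hedged usage sketch; the toy model, example inputs, and import paths are assumptions, not part of this commit:

```python
import torch
# Assumed import paths; adjust to the actual executorch package layout.
from executorch.backends.cadence.aot.compiler import convert_pt2, fuse_pt2, quantize_pt2
from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer

model = torch.nn.Linear(16, 8).eval()   # placeholder model
inputs = (torch.randn(1, 16),)          # placeholder example inputs

# One-liner: prepare, convert, and fuse with a default quantizer.
fused_gm = quantize_pt2(model, inputs)

# Two-step path (mainly for tests): keep the converted model around for reference
# numerics, then fuse with the *same* quantizer instance, as the docstrings require.
quantizer = CadenceQuantizer()
converted_gm = convert_pt2(model, inputs, quantizer)
fused_gm = fuse_pt2(converted_gm, quantizer)
```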
2 changes: 1 addition & 1 deletion backends/cadence/aot/functions.yaml
@@ -145,7 +145,7 @@
- arg_meta: null
kernel_name: impl::reference::quantized_linear_out

- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor(a!)
- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: impl::reference::quantized_relu_out
9 changes: 7 additions & 2 deletions backends/cadence/aot/ops_registrations.py
@@ -43,9 +43,11 @@
"quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
)

lib.define("quantized_relu(Tensor X, Tensor X_zero_point) -> (Tensor Y)")
lib.define(
"quantized_relu.out(Tensor X, Tensor X_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
"quantized_relu(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Y)"
)
lib.define(
"quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor (a!)"
)

lib.define(
@@ -168,6 +170,9 @@ def quantized_layer_norm_meta(
def quantized_relu_meta(
X: torch.Tensor,
X_zero_point: torch.Tensor,
out_zero_point: int,
out_multiplier: torch.Tensor,
out_shift: torch.Tensor,
):
return X.new_empty(X.size(), dtype=torch.uint8)

22 changes: 22 additions & 0 deletions backends/cadence/aot/quantizer/fusion_pass.py
@@ -287,7 +287,15 @@ def get_args_and_kwargs_relu(
graph_module: GraphModule,
inputs_inputs: List[fx.Node],
dequants_inputs: List[fx.Node],
quant_node: fx.Node,
) -> Tuple[Tuple[ArgsType], Dict[str, ArgsType]]:
input_scale = dequants_inputs[0].args[1]
# pyre-fixme[58]: Unsupported operand types
requantize_scale = input_scale / quant_node.args[1]
requantize_scale_t = torch.tensor([requantize_scale])

(out_multiplier, out_shift) = quantize_tensor_multiplier(requantize_scale_t)

# Make the args and kwargs for the replacement op
args = tuple(inputs_inputs)

@@ -296,9 +304,22 @@ def get_args_and_kwargs_relu(
([1], dequants_inputs[0].args[2]),
{"dtype": torch.int32},
)
out_multiplier_ = graph_module.graph.call_function(
torch.ops.aten.full.default,
([1], out_multiplier[0].item()),
{"dtype": torch.int32},
)
out_shift_ = graph_module.graph.call_function(
torch.ops.aten.full.default,
([1], out_shift[0].item()),
{"dtype": torch.int32},
)

kwargs = {
"X_zero_point": X_zero_point,
"out_zero_point": quant_node.args[2],
"out_multiplier": out_multiplier_,
"out_shift": out_shift_,
}
return args, kwargs

@@ -420,6 +441,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
graph_module,
inputs_inputs,
dequants_inputs,
quant_node,
)
fused = graph_module.graph.call_function(
pattern.replacement_op(),
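The fused quantized_relu now carries explicit requantization parameters: the pass computes requantize_scale = input_scale / output_scale from the surrounding dequantize/quantize nodes and converts it into an integer (out_multiplier, out_shift) pair via quantize_tensor_multiplier. A small sketch of the conventional q31 fixed-point decomposition that such helpers typically implement (the rounding details of the actual helper may differ):

```python
import math

def decompose_scale(scale: float) -> tuple[int, int]:
    """Return (multiplier, shift) with scale ~= multiplier * 2**-31 * 2**shift,
    where multiplier is a q31 value in [2**30, 2**31)."""
    if scale == 0.0:
        return 0, 0
    mantissa, shift = math.frexp(scale)        # scale = mantissa * 2**shift, mantissa in [0.5, 1)
    multiplier = round(mantissa * (1 << 31))   # q31 fixed-point mantissa
    if multiplier == (1 << 31):                # rounding pushed the mantissa up to 1.0
        multiplier //= 2
        shift += 1
    return multiplier, shift

# Example: input_scale = 0.05, output (quant) scale = 0.1 -> requantize_scale = 0.5
out_multiplier, out_shift = decompose_scale(0.05 / 0.1)
print(out_multiplier, out_shift)               # 1073741824 0  (i.e. 2**30 * 2**-31 = 0.5)
```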
4 changes: 1 addition & 3 deletions backends/cadence/aot/quantizer/patterns.py
@@ -303,9 +303,7 @@ def get_anchors(
inputs=[(relu_node, 0)],
weights=[],
biases=[],
output=[
(relu_node, SharedQuantizationSpec((relu_node.args[0], relu_node)))
],
output=[(relu_node,)],
)

def replacement_op(self) -> OpOverload:
