rebase stuff

pytorch · Aug 2, 2024 · 15518b7 · 15518b7
2 parents 6ad6514 + 76f0b61
commit 15518b7
Show file tree

Hide file tree

Showing 275 changed files with 8,015 additions and 2,631 deletions.
diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
@@ -1,5 +1,7 @@
 mpmath==1.3.0
-numpy==1.25.2
+numpy==1.21.3; python_version == '3.10'
+numpy==1.23.2; python_version == '3.11'
+numpy; python_version >= '3.12'
 PyYAML==6.0.1
 ruamel.yaml==0.17.32
 sympy==1.12
@@ -8,6 +10,8 @@ tomli==2.0.1
 torchsr==1.0.4
 transformers==4.38.0
 zstd==1.5.5.1
+pandas==2.0.3; python_version == '3.10'
+pandas; python_version >= '3.11'
 pytest==7.2.0
 pytest-cov==4.1.0
 expecttest==0.1.6

diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py
@@ -27,6 +27,7 @@
         # This one causes timeout on smaller runner, the root cause is unclear (T161064121)
         "dl3": "linux.12xlarge",
         "emformer_join": "linux.12xlarge",
+        "emformer_predict": "linux.12xlarge",
     }
 }
 
@@ -35,9 +36,11 @@
     # Just some examples on how custom timeout can be set
     "linux": {
         "mobilebert": 90,
+        "emformer_predict": 360,
     },
     "macos": {
         "mobilebert": 90,
+        "emformer_predict": 360,
     },
 }
 
@@ -84,7 +87,11 @@ def model_should_run_on_event(model: str, event: str) -> bool:
     """
     if event == "pull_request":
         return model in ["mv3", "vit"]
-    return True
+    elif event == "push":
+        # 'emformer_predict' is running super slow. Only run it periodically
+        return model not in ["emformer_predict"]
+    else:
+        return True
 
 
 def model_should_run_on_target_os(model: str, target_os: str) -> bool:

diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
@@ -13,6 +13,7 @@ MODEL_NAME=$1 # stories110M.pt
 BUILD_TOOL=$2 # buck2 or cmake
 DTYPE=$3 # fp16 or fp32
 MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
+UPLOAD_DIR=${5:-}
 if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
     echo "Expecting atleast 4 positional arguments"
     echo "Usage: [...]"
@@ -126,6 +127,15 @@ cleanup_files() {
   rm params.json
 }
 
+prepare_artifacts_upload() {
+  if [ -n "$UPLOAD_DIR" ]; then
+    echo "Preparing for uploading generated artifacs"
+    mkdir -p "${UPLOAD_DIR}"
+    zip -j "model.zip" "${MODEL_NAME}" tokenizer.bin
+    cp "model.zip" "${UPLOAD_DIR}"
+  fi
+}
+
 # Download and create artifacts.
 PARAMS="params.json"
 touch "${PARAMS}"
@@ -205,6 +215,7 @@ if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
   echo "Actual result: ${RESULT}"
   echo "Success"
 
+  prepare_artifacts_upload
   cleanup_files
 else
   echo "Expected result prefix: ${EXPECTED_PREFIX}"

diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
@@ -0,0 +1,224 @@
+name: android-perf
+
+on:
+  schedule:
+    - cron: 0 0 * * *
+  # Note: GitHub has an upper limit of 10 inputs
+  workflow_dispatch:
+    inputs:
+      models:
+        description: Models to be benchmarked
+        required: false
+        type: string
+        default: stories110M
+      devices:
+        description: Target devices to run benchmark
+        required: false
+        type: string
+        default: samsung_galaxy_s2x
+      delegates:
+        description: Backend delegates
+        required: false
+        type: string
+        default: xnnpack
+      threadpool:
+        description: Run with threadpool?
+        required: false
+        type: boolean
+        default: false
+      benchmark_configs:
+        description: The list of configs used the benchmark
+        required: false
+        type: string
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  set-parameters:
+    runs-on: linux.2xlarge
+    outputs:
+      models: ${{ steps.set-parameters.outputs.models }}
+      devices: ${{ steps.set-parameters.outputs.devices }}
+      delegates: ${{ steps.set-parameters.outputs.delegates }}
+    steps:
+      - name: Set parameters
+        id: set-parameters
+        shell: bash
+        run: |
+          set -ex
+          MODELS="${{ inputs.models }}"
+          DEVICES="${{ inputs.devices }}"
+          DELEGATES="${{ inputs.delegates }}"
+
+          # Mapping devices to their corresponding device-pool-arn
+          declare -A DEVICE_POOL_ARNS
+          DEVICE_POOL_ARNS[samsung_galaxy_s2x]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
+
+          # Resolve device names with their corresponding ARNs
+          if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
+            DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")')
+          fi
+          declare -a MAPPED_ARNS=()
+          for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do
+            if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then
+              echo "Error: No ARN found for device '$DEVICE'. Abort." >&2
+              exit 1
+            fi
+            MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}")
+          done
+
+          echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
+          MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .)
+          echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT
+          echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
+
+  export-models:
+    name: export-models
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    needs: set-parameters
+    strategy:
+      matrix:
+          model: ${{ fromJson(needs.set-parameters.outputs.models) }}
+          delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      timeout: 60
+      upload-artifact: android-models
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+        echo "Exporting model: ${{ matrix.model }}"
+        export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
+
+        # TODO(T197546696): Note that the following scripts/steps only work for llama. It's expected to fail for other models+delegates.
+        # Install requirements for export_llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
+        # Test llama2
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}.pt" "cmake" "fp32" "xnnpack+custom+qe" "${ARTIFACTS_DIR_NAME}"\
+
+  # Upload models to S3. The artifacts are needed not only by the device farm but also TorchChat
+  upload-models:
+    needs: export-models
+    runs-on: linux.2xlarge
+    steps:
+      - name: Download the models from GitHub
+        uses: actions/download-artifact@v3
+        with:
+          # The name here needs to match the name of the upload-artifact parameter
+          name: android-models
+          path: ${{ runner.temp }}/artifacts/
+
+      - name: Verify the models
+        shell: bash
+        working-directory: ${{ runner.temp }}/artifacts/
+        run: |
+          ls -lah ./
+
+      - name: Upload the models to S3
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-bucket: gha-artifacts
+          s3-prefix: |
+            ${{ github.repository }}/${{ github.run_id }}/artifact
+          retention-days: 1
+          if-no-files-found: ignore
+          path: ${{ runner.temp }}/artifacts/
+
+  build-llm-demo:
+    name: build-llm-demo
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    needs: set-parameters
+    strategy:
+      matrix:
+          tokenizer: [bpe]
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12-android
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      upload-artifact: android-apps
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake
+        export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
+
+        # TODO: This needs to be replaced with a generic loader .apk
+        # Build LLM Demo for Android
+        bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME}
+
+  # Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat
+  upload-android-apps:
+    needs: build-llm-demo
+    runs-on: linux.2xlarge
+    steps:
+      - name: Download the apps from GitHub
+        uses: actions/download-artifact@v3
+        with:
+          # The name here needs to match the name of the upload-artifact parameter
+          name: android-apps
+          path: ${{ runner.temp }}/artifacts/
+
+      - name: Verify the apps
+        shell: bash
+        working-directory: ${{ runner.temp }}/artifacts/
+        run: |
+          ls -lah ./
+
+      - name: Upload the apps to S3
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-bucket: gha-artifacts
+          s3-prefix: |
+            ${{ github.repository }}/${{ github.run_id }}/artifact
+          retention-days: 14
+          if-no-files-found: ignore
+          path: ${{ runner.temp }}/artifacts/
+
+  # Let's see how expensive this job is, we might want to tone it down by running it periodically
+  benchmark-on-device:
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
+    needs:
+      - set-parameters
+      - upload-models
+      - upload-android-apps
+    strategy:
+      matrix:
+        model: ${{ fromJson(needs.set-parameters.outputs.models) }}
+        delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
+        device: ${{ fromJson(needs.set-parameters.outputs.devices) }}
+    with:
+      device-type: android
+      runner: linux.2xlarge
+      test-infra-ref: ''
+      # This is the ARN of ExecuTorch project on AWS
+      project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
+      device-pool-arn: ${{ matrix.device }}
+      # Uploaded to S3 from the previous job, the name of the app comes from the project itself.
+      # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer.
+      # It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only
+      # one app+flavor that could load and run the model.
+      # TODO: Hard code llm_demo_bpe for now in this job.
+      android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug.apk
+      android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug-androidTest.apk
+      # The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml
+      test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/abd86868-fa63-467e-a5c7-218194665a77
+      # Uploaded to S3 from the previous job
+      extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -193,7 +193,7 @@ jobs:
     strategy:
       fail-fast: false
     with:
-      runner: linux.12xlarge
+      runner: linux.24xlarge
       docker-image: executorch-ubuntu-22.04-clang12
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -210,11 +210,19 @@ jobs:
         bash examples/models/llava/install_requirements.sh
 
         # run export_llava.sh
-        python examples/models/llava/export_llava.py
+        python examples/models/llava/export_llava.py --use-sdpa-with-kv-cache --pte-name llava_custom_sdpa.pte
 
         # verify file exists
-        if [ ! -f "llava_combined_xnnpack.pte" ]; then
-            echo "llava_combined_xnnpack.pte not found!"
+        if [ ! -f "llava_custom_sdpa.pte" ]; then
+            echo "llava_custom_sdpa.pte not found!"
+            exit 1
+        fi
+
+        python examples/models/llava/export_llava.py --no-use-sdpa-with-kv-cache --pte-name llava.pte
+
+        # verify file exists
+        if [ ! -f "llava.pte" ]; then
+            echo "llava.pte not found!"
             exit 1
         fi
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -774,7 +774,7 @@ endif()
 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
   # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom
   add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/models/llama2/custom_ops
+    ${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops
   )
 endif()
 

diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm
@@ -655,7 +655,7 @@ - (void)prewarmRecentlyUsedAssetsWithMaxCount:(NSUInteger)maxCount {
 
             NSError *prewarmError = nil;
             if (![asset prewarmAndReturnError:&prewarmError]) {
-                ETCoreMLLogError(localError,
+                ETCoreMLLogError(prewarmError,
                                  "%@: Failed to prewarm asset with identifier = %@",
                                  NSStringFromClass(strongSelf.assetManager.class),
                                  asset.identifier);

diff --git a/backends/apple/coreml/runtime/delegate/backend_delegate.mm b/backends/apple/coreml/runtime/delegate/backend_delegate.mm
@@ -157,7 +157,7 @@ - (BOOL)_loadAndReturnError:(NSError * _Nullable __autoreleasing *)error {
     if (self.config.should_prewarm_asset) {
         [modelManager prewarmRecentlyUsedAssetsWithMaxCount:1];
     }
-    
+
     return YES;
 }
 
@@ -188,9 +188,14 @@ - (ModelHandle*)loadModelFromAOTData:(NSData*)data
         return nil;
     }
 
-    return [self.impl loadModelFromAOTData:data
-                             configuration:configuration
-                                     error:error];
+    auto handle = [self.impl loadModelFromAOTData:data
+                                    configuration:configuration
+                                            error:error];
+    if ((handle != NULL) && self.config.should_prewarm_model) {
+        [self.impl prewarmModelWithHandle:handle error:nil];
+    }
+
+    return handle;
 }
 
 - (BOOL)executeModelWithHandle:(ModelHandle*)handle