From 12bc1e421fef8e4091a7bdf2b7cea496c680c572 Mon Sep 17 00:00:00 2001
From: Simon Zhao <simonyansenzhao@gmail.com>
Date: Tue, 12 Nov 2024 08:58:02 +0800
Subject: [PATCH] Use managed identity with OpenID Connect for Azure login
 (#2182)

* Use managed identity with OpenID Connect

Signed-off-by: Simon Zhao <simonyansenzhao@gmail.com>

* Optimise std_log.txt path finding method

Signed-off-by: Simon Zhao <simonyansenzhao@gmail.com>

* statsmodels<0.14.4

Signed-off-by: Simon Zhao <simonyansenzhao@gmail.com>

* numpy<1.25.0;python_version<='3.8'

Signed-off-by: Simon Zhao <simonyansenzhao@gmail.com>

* spacy<=3.7.5;python_version<='3.8'

Signed-off-by: Simon Zhao <simonyansenzhao@gmail.com>

* Update doc

Signed-off-by: Simon Zhao <simonyansenzhao@gmail.com>

---------

Signed-off-by: Simon Zhao <simonyansenzhao@gmail.com>
---
 .github/actions/azureml-test/action.yml       | 14 ++++-
 .github/workflows/azureml-cpu-nightly.yml     |  6 +-
 .github/workflows/azureml-gpu-nightly.yml     |  6 +-
 .../workflows/azureml-release-pipeline.yml    |  2 +-
 .github/workflows/azureml-spark-nightly.yml   |  6 +-
 .github/workflows/azureml-unit-tests.yml      |  6 +-
 setup.py                                      |  6 +-
 tests/README.md                               | 60 ++++++++++++-------
 tests/ci/azureml_tests/post_pytest.py         | 14 +++--
 9 files changed, 83 insertions(+), 37 deletions(-)

diff --git a/.github/actions/azureml-test/action.yml b/.github/actions/azureml-test/action.yml
index 91a437719..7c44abb37 100644
--- a/.github/actions/azureml-test/action.yml
+++ b/.github/actions/azureml-test/action.yml
@@ -15,9 +15,15 @@ inputs:
   TEST_KIND:
     required: true
     description: Type of test - unit or nightly
-  AZUREML_TEST_CREDENTIALS:
+  AZUREML_TEST_UMI_CLIENT_ID:
     required: true
-    description: Credentials for AzureML login
+    description: AzureML User-managed identity client ID
+  AZUREML_TEST_UMI_TENANT_ID:
+    required: true
+    description: AzureML User-managed identity tenant ID
+  AZUREML_TEST_UMI_SUB_ID:
+    required: true
+    description: AzureML User-managed identity subscription ID
   AZUREML_TEST_SUBID:
     required: true
     description: AzureML subscription ID
@@ -53,7 +59,9 @@ runs:
     - name: Log in to Azure
       uses: azure/login@v2
       with:
-        creds: ${{ inputs.AZUREML_TEST_CREDENTIALS }}
+        client-id: ${{ inputs.AZUREML_TEST_UMI_CLIENT_ID }}
+        tenant-id: ${{ inputs.AZUREML_TEST_UMI_TENANT_ID }}
+        subscription-id: ${{ inputs.AZUREML_TEST_UMI_SUB_ID }}
     - name: Submit tests to AzureML
       shell: bash
       run: |
diff --git a/.github/workflows/azureml-cpu-nightly.yml b/.github/workflows/azureml-cpu-nightly.yml
index 89fc64757..616707f7f 100644
--- a/.github/workflows/azureml-cpu-nightly.yml
+++ b/.github/workflows/azureml-cpu-nightly.yml
@@ -64,6 +64,8 @@ jobs:
     needs: get-test-groups
     name: ${{ join(matrix.*, ', ') }}
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for requesting the JWT
     strategy:
       max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
       matrix:
@@ -79,7 +81,9 @@ jobs:
           EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }}
           ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }}
           TEST_KIND: 'nightly'
-          AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }}
+          AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }}
+          AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }}
+          AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }}
           AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }}
           PYTHON_VERSION: ${{ matrix.python-version }}
           TEST_GROUP: ${{ matrix.test-group }}
diff --git a/.github/workflows/azureml-gpu-nightly.yml b/.github/workflows/azureml-gpu-nightly.yml
index 16e3e6ed2..23cffda0a 100644
--- a/.github/workflows/azureml-gpu-nightly.yml
+++ b/.github/workflows/azureml-gpu-nightly.yml
@@ -64,6 +64,8 @@ jobs:
     needs: get-test-groups
     name: ${{ join(matrix.*, ', ') }}
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for requesting the JWT
     strategy:
       max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
       matrix:
@@ -79,7 +81,9 @@ jobs:
           EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }}
           ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }}
           TEST_KIND: 'nightly'
-          AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }}
+          AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }}
+          AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }}
+          AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }}
           AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }}
           PYTHON_VERSION: ${{ matrix.python-version }}
           TEST_GROUP: ${{ matrix.test-group }}
diff --git a/.github/workflows/azureml-release-pipeline.yml b/.github/workflows/azureml-release-pipeline.yml
index d9899658e..983cce9db 100644
--- a/.github/workflows/azureml-release-pipeline.yml
+++ b/.github/workflows/azureml-release-pipeline.yml
@@ -37,7 +37,7 @@ jobs:
       - name: Setup python
         uses: actions/setup-python@v5
         with: 
-          python-version: "3.8"
+          python-version: "3.10"
       - name: Install wheel package
         run: pip install wheel
       - name: Create wheel from setup.py
diff --git a/.github/workflows/azureml-spark-nightly.yml b/.github/workflows/azureml-spark-nightly.yml
index 97789fccf..da508ebe4 100644
--- a/.github/workflows/azureml-spark-nightly.yml
+++ b/.github/workflows/azureml-spark-nightly.yml
@@ -63,6 +63,8 @@ jobs:
     needs: get-test-groups
     name: ${{ join(matrix.*, ', ') }}
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for requesting the JWT
     strategy:
       max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
       matrix:
@@ -78,7 +80,9 @@ jobs:
           EXP_NAME: recommenders-nightly-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.ref }}
           ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }}
           TEST_KIND: 'nightly'
-          AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }}
+          AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }}
+          AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }}
+          AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }}
           AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }}
           PYTHON_VERSION: ${{ matrix.python-version }}
           TEST_GROUP: ${{ matrix.test-group }}
diff --git a/.github/workflows/azureml-unit-tests.yml b/.github/workflows/azureml-unit-tests.yml
index ed3b5a98d..0f7ed2a18 100644
--- a/.github/workflows/azureml-unit-tests.yml
+++ b/.github/workflows/azureml-unit-tests.yml
@@ -53,6 +53,8 @@ jobs:
     needs: get-test-groups
     name: ${{ join(matrix.*, ', ') }}
     runs-on: ubuntu-latest
+    permissions:
+      id-token: write # This is required for requesting the JWT
     strategy:
       max-parallel: 50 # Usage limits: https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration
       matrix:
@@ -68,7 +70,9 @@ jobs:
           EXP_NAME: recommenders-unit-${{ matrix.test-group }}-python${{ matrix.python-version }}-${{ github.sha }}
           ENV_NAME: recommenders-${{ github.sha }}-python${{ matrix.python-version }}${{ contains(matrix.test-group, 'gpu') && '-gpu' || '' }}${{ contains(matrix.test-group, 'spark') && '-spark' || '' }}
           TEST_KIND: 'unit'
-          AZUREML_TEST_CREDENTIALS: ${{ secrets.AZUREML_TEST_CREDENTIALS }}
+          AZUREML_TEST_UMI_CLIENT_ID: ${{ secrets.AZUREML_TEST_UMI_CLIENT_ID }}
+          AZUREML_TEST_UMI_TENANT_ID: ${{ secrets.AZUREML_TEST_UMI_TENANT_ID }}
+          AZUREML_TEST_UMI_SUB_ID: ${{ secrets.AZUREML_TEST_UMI_SUB_ID }}
           AZUREML_TEST_SUBID: ${{ secrets.AZUREML_TEST_SUBID }}
           PYTHON_VERSION: ${{ matrix.python-version }}
           TEST_GROUP: ${{ matrix.test-group }}
diff --git a/setup.py b/setup.py
index 03df519ed..41b4c8aed 100644
--- a/setup.py
+++ b/setup.py
@@ -36,15 +36,15 @@
     "nltk>=3.8.1,<4",  # requires tqdm
     "notebook>=6.5.5,<8",  # requires ipykernel, jinja2, jupyter, nbconvert, nbformat, packaging, requests
     "numba>=0.57.0,<1",
-    "numpy<2.0.0",  # FIXME: Remove numpy<2.0.0 once cornac release a version newer than 2.2.1 that resolve ImportError: numpy.core.multiarray failed to import.
     "pandas>2.0.0,<3.0.0",  # requires numpy
     "pandera[strategies]>=0.6.5,<0.18;python_version<='3.8'",  # For generating fake datasets
     "pandera[strategies]>=0.15.0;python_version>='3.9'",
     "retrying>=1.3.4,<2",
     "scikit-learn>=1.2.0,<2",  # requires scipy, and introduce breaking change affects feature_extraction.text.TfidfVectorizer.min_df
     "scikit-surprise>=1.1.3",
-    "scipy>=1.10.1,<=1.13.1",  # FIXME: Remove scipy<=1.13.1 once cornac release a version newer than 2.2.1.  See #2128
     "seaborn>=0.13.0,<1",  # requires matplotlib, packaging
+    "statsmodels<=0.14.1;python_version<='3.8'",
+    "statsmodels>=0.14.4;python_version>='3.9'",
     "transformers>=4.27.0,<5",  # requires packaging, pyyaml, requests, tqdm
 ]
 
@@ -52,7 +52,9 @@
 extras_require = {
     "gpu": [
         "fastai>=2.7.11,<3",
+        "numpy<1.25.0;python_version<='3.8'",
         "nvidia-ml-py>=11.525.84",
+        "spacy<=3.7.5;python_version<='3.8'",
         "tensorflow>=2.8.4,!=2.9.0.*,!=2.9.1,!=2.9.2,!=2.10.0.*,<2.16",  # Fixed TF due to constant security problems and breaking changes #2073
         "tf-slim>=1.1.0",  # No python_requires in its setup.py
         "torch>=2.0.1,<3",
diff --git a/tests/README.md b/tests/README.md
index 4958cedd2..893df94c2 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -216,30 +216,46 @@ Then, follow the steps below to create the AzureML infrastructure:
     - Name: `azureml-test-workspace`
     - Resource group: `recommenders_project_resources`
     - Location: *Make sure you have enough quota in the location you choose*
-2. Create two new clusters: `cpu-cluster` and `gpu-cluster`. Go to compute, then compute cluster, then new.
+1. Create two new clusters: `cpu-cluster` and `gpu-cluster`. Go to compute, then compute cluster, then new.
     - Select the CPU VM base. Anything above 64GB of RAM, and 8 cores should be fine.
     - Select the GPU VM base. Anything above 56GB of RAM, and 6 cores, and an NVIDIA K80 should be fine.
-3. Add the subscription ID to GitHub action secrets [here](https://github.com/recommenders-team/recommenders/settings/secrets/actions). Create a new repository secret called `AZUREML_TEST_SUBID` and add the subscription ID as the value.
-4. Make sure you have installed [Azure CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli), and that you are logged in: `az login`.
-5. Select your subscription: `az account set -s $AZURE_SUBSCRIPTION_ID`.
-6. Create a Service Principal: `az ad sp create-for-rbac --name $SERVICE_PRINCIPAL_NAME --role "AzureML Compute Operator" --scopes /subscriptions/$AZURE_SUBSCRIPTION_ID --json-auth`. This will output a JSON blob with the credentials of the Service Principal:
-    ```
-    {
-        "clientId": "XXXXXXXXXXXXXXXXXXXXX",
-        "clientSecret": "XXXXXXXXXXXXXXXXXXXXX",
-        "subscriptionId": "XXXXXXXXXXXXXXXXXXXXX",
-        "tenantId": "XXXXXXXXXXXXXXXXXXXXX",
-        "activeDirectoryEndpointUrl": "https://login.microsoftonline.com",
-        "resourceManagerEndpointUrl": "https://management.azure.com/",
-        "activeDirectoryGraphResourceId": "https://graph.windows.net/",
-        "sqlManagementEndpointUrl": "https://management.core.windows.net:8443/",
-        "galleryEndpointUrl": "https://gallery.azure.com/",
-        "managementEndpointUrl": "https://management.core.windows.net/"
-    }
-    ```
-7. Assign AzureML Data Scientist role: `az role assignment create --assignee $SERVICE_PRINCIPAL_NAME --role "AzureML Data Scientist" --scope /subscriptions/$AZURE_SUBSCRIPTION_ID` 
-8. Assign Reader role: `az role assignment create --assignee $SERVICE_PRINCIPAL_NAME --role "Reader" --scope /subscriptions/$AZURE_SUBSCRIPTION_ID`
-9. Add the output as github's action secret `AZUREML_TEST_CREDENTIALS` under repository's **Settings > Security > Secrets and variables > Actions**.
+1. Add the subscription ID to GitHub action secrets
+   [here](https://github.com/recommenders-team/recommenders/settings/secrets/actions).
+   * Create a new repository secret called `AZUREML_TEST_SUBID` and
+     add the subscription ID as the value.
+1. Set up [login with OpenID Connect
+   (OIDC)](https://github.com/marketplace/actions/azure-login#login-with-openid-connect-oidc-recommended)
+   for GitHub Actions.
+   1. Create a user-assigned managed identity (UMI) and assign the
+      following 3 roles on the AzureML workspace created above to the
+      UMI (See [Create a user-assigned managed
+      identity](https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/how-manage-user-assigned-managed-identities?pivots=identity-mi-methods-azp#create-a-user-assigned-managed-identity)):
+      * AzureML Compute Operator
+      * AzureML Data Scientist
+      * Reader
+   1. [Create a federated identity credential on the
+      UMI](https://learn.microsoft.com/en-us/entra/workload-id/workload-identity-federation-create-trust-user-assigned-managed-identity?pivots=identity-wif-mi-methods-azp#github-actions-deploying-azure-resources)
+      with the following settings:
+      * Name: A unique name for the federated identity credential
+        within your application.
+      * Issuer: Set to `https://token.actions.githubusercontent.com`
+        for GitHub Actions.
+      * Subject: The subject claim format, e.g.,
+        `repo:recommenders-team/recommenders:ref:refs/heads/<branch-name>`:
+        + `repo:recommenders-team/recommenders:pull_request`
+        + `repo:recommenders-team/recommenders:ref:refs/heads/staging`
+        + `repo:recommenders-team/recommenders:ref:refs/heads/main`
+      * Description: (Optional) A description of the credential.
+      * Audiences: Specifies who can use this credential; for GitHub
+        Actions, use `api://AzureADTokenExchange`.
+1. Create 3 Actions secrets
+   * `AZUREML_TEST_UMI_TENANT_ID`
+   * `AZUREML_TEST_UMI_SUB_ID`
+   * `AZUREML_TEST_UMI_CLIENT_ID`
+   
+   and use the UMI's tenant ID, subscription ID and client ID as the
+   values of the secrets, respectively, under the repository's
+   **Settings > Security > Secrets and variables > Actions**.
 
 
 ## How to execute tests in your local environment
diff --git a/tests/ci/azureml_tests/post_pytest.py b/tests/ci/azureml_tests/post_pytest.py
index b457e709d..26472ea46 100644
--- a/tests/ci/azureml_tests/post_pytest.py
+++ b/tests/ci/azureml_tests/post_pytest.py
@@ -89,8 +89,12 @@ def parse_args():
             run_id=run.info.run_id,
             dst_path=args.log_dir,
         )
-        log_path = pathlib.Path("user_logs/std_log.txt")
-        with open(pathlib.Path(args.log_dir) / log_path, "r") as file:
-            print(f"\nDumping logs in {log_path}")
-            print("=====================================")
-            print(file.read())
+        log_path = next(
+            (path for path in pathlib.Path(args.log_dir).rglob("std_log.txt")),
+            None
+        )
+        if log_path is not None:
+            with open(log_path, "r") as file:
+                print(f"\nDumping logs in {log_path}")
+                print("=====================================")
+                print(file.read())