diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 7737b39..856d127 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -29,7 +29,7 @@ If applicable, add screenshots to help explain your problem. **Desktop (please complete the following information):** - OS: [e.g. Windows, Mac] - OpenLineage Version: [e.g. name of jar] - - Databricks Runtime Version: [e.g. 6.4, 9.1, 10.1] + - Databricks Runtime Version: [e.g. 9.1, 10.1, 11.3] - Cluster Type: [e.g. Job, Interactive] - Cluster Mode: [e.g. Standard, High Concurrency, Single] - Using Credential Passthrough: [e.g. Yes, No] diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 1de752d..2af5975 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -9,6 +9,11 @@ on: - '**.csproj' - 'tests/integration/**' workflow_dispatch: + inputs: + tags: + description: 'Flag as workflow dispatch' + required: true + type: boolean env: DOTNET_VERSION: '6.x.x' # The .NET SDK version to use @@ -25,11 +30,13 @@ jobs: echo "Github Event Name: ${{ github.event_name }}" echo "Github Ref: ${{ github.ref }}" echo "Github Ref Type: ${{ github.ref_type }}" + echo "Github Tags: ${{ inputs.tags }}" build: if: | github.event_name == 'pull_request' || - (github.event_name == 'create' && github.ref_type == 'tag') + (github.event_name == 'create' && github.ref_type == 'tag') || + ${{github.event_name == 'create' && inputs.tags}} name: build-${{matrix.os}} runs-on: ${{ matrix.os }} strategy: @@ -69,6 +76,18 @@ jobs: name: FunctionZip path: ~/artifact/FunctionZip.zip + - name: Create One Line OlToPurviewMappings + run: | + mkdir ~/artifact-mappings + python ./deployment/util/mappings-remove-spaces.py ./deployment/infra/OlToPurviewMappings.json > ~/artifact-mappings/one-line-mappings.json + ls ~/artifact-mappings + + - name: Upload One Line OlToPurviewMappings Build Artifact + uses: actions/upload-artifact@v3 + with: + name: Mappings + path: ~/artifact-mappings/one-line-mappings.json + runIntegrationTests: name: Test on Integration Tests needs: [build] @@ -85,18 +104,19 @@ jobs: name: FunctionZip path: ./artifacts - - name: Azure Functions Action + - name: Deploy Azure Function to Integration Env uses: Azure/functions-action@v1.4.6 with: app-name: ${{ secrets.INT_FUNC_NAME }} package: ./artifacts/FunctionZip.zip publish-profile: ${{ secrets.INT_PUBLISH_PROFILE }} - - uses: azure/login@v1 + - name: Azure Login + uses: azure/login@v1 with: creds: ${{ secrets.INT_AZ_CLI_CREDENTIALS }} - - name: Azure CLI script + - name: Compare and Update App Settings on Deployed Function uses: azure/CLI@v1 with: azcliversion: 2.34.1 @@ -108,7 +128,7 @@ jobs: # Start up Synapse Pool and Execute Tests - name: Start Integration Synapse SQL Pool - run: source tests/integration/manage-sql-pool.sh start ${{ secrets.INT_SUBSCRIPTION_ID }} ${{ secrets.INT_RG_NAME }} ${{ secrets.INT_SYNAPSE_WKSP_NAME }} ${{ secrets.INT_SYNAPSE_SQLPOOL_NAME }} + run: source tests/integration/manage-sql-pool.sh start ${{ secrets.INT_SUBSCRIPTION_ID }} ${{ secrets.INT_SYNAPSE_SQLPOOL_RG_NAME }} ${{ secrets.INT_SYNAPSE_WKSP_NAME }} ${{ secrets.INT_SYNAPSE_SQLPOOL_NAME }} env: AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} @@ -124,6 +144,10 @@ jobs: token = ${{ secrets.INT_DATABRICKS_ACCESS_TOKEN }}" > ./config.ini export DATABRICKS_CONFIG_FILE=./config.ini + - name: Confirm Databricks CLI is configured + run: 
databricks clusters spark-versions + env: + DATABRICKS_CONFIG_FILE: ./config.ini - name: Cleanup Integration Environment run: python ./tests/integration/runner.py --cleanup --dontwait None None None @@ -144,7 +168,7 @@ jobs: DATABRICKS_CONFIG_FILE: ./config.ini - name: Stop Integration Synapse SQL Pool - run: source tests/integration/manage-sql-pool.sh stop ${{ secrets.INT_SUBSCRIPTION_ID }} ${{ secrets.INT_RG_NAME }} ${{ secrets.INT_SYNAPSE_WKSP_NAME }} ${{ secrets.INT_SYNAPSE_SQLPOOL_NAME }} + run: source tests/integration/manage-sql-pool.sh stop ${{ secrets.INT_SUBSCRIPTION_ID }} ${{ secrets.INT_SYNAPSE_SQLPOOL_RG_NAME }} ${{ secrets.INT_SYNAPSE_WKSP_NAME }} ${{ secrets.INT_SYNAPSE_SQLPOOL_NAME }} env: AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }} @@ -172,25 +196,3 @@ jobs: with: artifacts: ~/artifacts/FunctionZip.zip token: ${{ secrets.GITHUB_TOKEN }} - - deployProductionEnvironment: - name: Release to Production Environment - needs: [createRelease] - runs-on: ubuntu-latest - environment: - name: Production - steps: - - uses: actions/checkout@v3 - - - name: Download Artifact - uses: actions/download-artifact@v3 - with: - name: FunctionZip - path: ./artifacts - - - name: Azure Functions Action - uses: Azure/functions-action@v1.4.6 - with: - app-name: ${{ secrets.FUNC_NAME }} - package: ./artifacts/FunctionZip.zip - publish-profile: ${{ secrets.PUBLISH_PROFILE }} diff --git a/.gitignore b/.gitignore index 0c3849b..9343284 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ build # Ignore local settings localsettingsdutils.py +*.ini diff --git a/LIMITATIONS.md b/LIMITATIONS.md index b5def59..5aed8b9 100644 --- a/LIMITATIONS.md +++ b/LIMITATIONS.md @@ -10,7 +10,16 @@ The solution accelerator supports a limited set of data sources to be ingested i * [Azure Synapse SQL Pools](#azure-synapse-sql-pools) * [Azure SQL DB](#azure-sql-db) * [Delta Lake](#delta-lake-file-format) +* [Azure MySQL](#azure-mysql) +* [PostgreSQL](#postgresql) +* [Azure Data Explorer](#azure-data-explorer) +* [Azure Data Factory](#azure-data-factory) +* [Azure Cosmos DB](#azure-cosmos-db) * [Other Data Sources and Limitations](#other-data-sources-and-limitations) +* [Column Level Mapping Supported Sources](#column-level-mapping-supported-sources) ## Connecting to Assets in Purview @@ -67,10 +76,43 @@ Supports Azure SQL DB through the [Apache Spark Connector for Azure SQL DB](http Supports [Delta File Format](https://delta.io/). +* Does NOT support the MERGE INTO statement on Databricks due to differences between the Databricks and Open Source Delta classes. + * An earlier release mistakenly indicated support. * Does not support Delta on Spark 2 Databricks Runtimes. -* Does not currently support the MERGE INTO statement due to differences between proprietary Databricks and Open Source Delta implementations. * Commands such as [Vacuum](https://docs.delta.io/latest/delta-utility.html#toc-entry-1) or [Optimize](https://docs.microsoft.com/en-us/azure/databricks/spark/latest/spark-sql/language-manual/delta-optimize) do not emit any lineage information and will not result in a Purview asset. +## Azure MySQL + +Supports Azure MySQL through [JDBC](https://learn.microsoft.com/en-us/azure/databricks/external-data/jdbc). + +## PostgreSQL + +Supports both Azure PostgreSQL and on-prem/VM installations of PostgreSQL through [JDBC](https://learn.microsoft.com/en-us/azure/databricks/external-data/jdbc).
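As a minimal sketch of the schema behavior described in the caveats below (the server, database, credentials, and table names here are placeholders, not values from the solution accelerator), a PostgreSQL read over JDBC from a Databricks notebook might look like this, with the schema spelled out in `dbtable` so the emitted lineage does not fall back to the default `public` schema:

```python
from pyspark.sql import SparkSession

# `spark` already exists in a Databricks notebook; getOrCreate() keeps this runnable elsewhere.
spark = SparkSession.builder.getOrCreate()

df = (
    spark.read.format("jdbc")
    # Placeholder host and database; use the server name as it is registered in Microsoft Purview.
    .option("url", "jdbc:postgresql://myserver.postgres.database.azure.com:5432/mydatabase")
    # Schema-qualified table name; omitting "myschema." is treated as the default public schema.
    .option("dbtable", "myschema.mytable")
    .option("user", "my_user")          # prefer Databricks secrets over literals in real jobs
    .option("password", "my_password")
    .option("driver", "org.postgresql.Driver")
    .load()
)

df.show(5)
```

A write through `spark.write.format("jdbc")` follows the same pattern, with the same schema-qualification caveat.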
+ +* If you specify the `dbTable` value without the database schema (e.g. `dbo`), the connector assumes you are using the default `public` schema. + * For users and Service Principals with different default schemas, this may result in incorrect lineage. + * This can be corrected by specifying the database schema in the Spark job. +* Default configuration supports using multiple strings divided by dots to define a custom schema. For example ```myschema.mytable```. +* If you register and scan your postgres server as `localhost` in Microsoft Purview, but use the IP within the Databricks notebook, the assets will not be matched correctly. You need to use the IP when registering the Postgres server. + +## Azure Data Explorer + +Supports Azure Data Explorer (aka Kusto) through the [Azure Data Explorer Connector for Apache Spark](https://learn.microsoft.com/en-us/azure/data-explorer/spark-connector). + +* Only supports the `kustoTable` option. +* If you use the `kustoQuery` option, it will return a Purview Generic Connector entity with a name of `COMPLEX` to capture the lineage but we are not able to parse arbitrary kusto queries at this time. + +## Azure Data Factory + +Supports capturing lineage for Databricks Notebook activities in Azure Data Factory (ADF). After running a notebook through ADF on an interactive or job cluster, you will see a Databricks Job asset in Microsoft Purview with a name similar to `ADF__`. For each Databricks notebook activity, you will also see a Databricks Task with a name similar to `ADF___`. + +* At this time, the Microsoft Purview view of Azure Data Factory lineage will not contain these tasks unless the Databricks Task uses or feeds a data source to a Data Flow or Copy activity. +* Copy Activities may not show lineage connecting to these Databricks tasks since it emits individual file assets rather than folder or resource set assets. + +## Azure Cosmos DB + +Supports querying [Azure Cosmos DB (SQL API)](https://github.com/Azure/azure-sdk-for-java/tree/main/sdk/cosmos/azure-cosmos-spark_3_2-12) + ## Other Data Sources and Limitations ### Lineage for Unsupported Data Sources @@ -87,14 +129,6 @@ Microsoft Purview's Fully Qualified Names are case sensitive. Spark Jobs may hav As a result, this solution attempts to find the best matching *existing* asset. If no existing asset is found to match based on qualified name, the data source name as found in the Spark query will be used toe create a dummy asset. On a subsequent scan of the data source in Purview and another run of the Spark query with the connector enabled will resolve the linkage. -### Column Level Mapping - -The solution currently does not provide column level mapping within the Microsoft Purview lineage tab. - -### Data Factory - -The solution currently reflects the unfriendly job name provided by Data Factory to Databricks as noted in [issue 72](https://github.com/microsoft/Purview-ADB-Lineage-Solution-Accelerator/issues/72#issuecomment-1211202405). You will see jobs with names similar to `ADF____`. - ### Hive Metastore / Delta Table Names The solution currently does not support emitting the Hive Metastore / Delta table SQL names. For example, if you have a Delta table name `default.events` and it's physical location is `abfss://container@storage/path`, the solution will report `abfss://container@storage/path`. @@ -115,8 +149,22 @@ The solution supports Spark 2 job cluster jobs. 
Databricks has removed Spark 2 f ### Spark 3.3+ Support -The solution supports Spark 3.0, 3.1, and 3.2 interactive and job clusters. We are working with the OpenLineage community to enable support of Spark 3.3 on Databricks Runtime 11.0 and higher. +The solution supports Spark 3.0, 3.1, 3.2, and 3.3 interactive and job clusters. The solution has been tested on the Databricks Runtime 11.3LTS version. ### Private Endpoints on Microsoft Purview Currently, the solution does not support pushing lineage to a Private Endpoint backed Microsoft Purview service. The solution may be customized to deploy the Azure Function to connect to Microsoft Purview. Consider reviewing the documentation to [Connect privately and securely to your Microsoft Purview account](https://docs.microsoft.com/en-us/azure/purview/catalog-private-link-account-portal). + +## Column Level Mapping Supported Sources + +Starting with OpenLineage 0.18.0 and release 2.3.0 of the solution accelerator, we support emitting column level mapping from the following sources and their combinations: + +* Read / Write to ABFSS file paths (mount or explicit path `abfss://`) +* Read / Write to WASBS file paths (mount or explicit path `wasbs://`) +* Read / Write to the default metastore in Azure Databricks + * Does NOT support custom hive metastores + +### Column Mapping Support for Delta Format + +* Delta Merge statements are not supported at this time +* Delta to Delta is NOT supported at this time diff --git a/README.md b/README.md index 6148d77..e627e7e 100644 --- a/README.md +++ b/README.md @@ -48,13 +48,19 @@ Gathering lineage data is performed in the following steps: * Supports table level lineage from Spark Notebooks and jobs for the following data sources: * Azure SQL - * Azure Synapse Analytics + * Azure Synapse Analytics (as input) * Azure Data Lake Gen 2 * Azure Blob Storage - * Delta Lake -* Supports Spark 3.0, 3.1, and 3.2 (Interactive and Job clusters) / Spark 2.x (Job clusters) - * Databricks Runtimes between 6.4 and 10.4 are currently supported -* Can be configured per cluster or for all clusters as a global configuration + * Delta Lake (Merge command not supported) + * Azure Data Explorer + * Azure Data Factory orchestration + * Hive Tables (in default metastore) + * MySQL + * PostgreSQL +* Supports Spark 3.0, 3.1, 3.2, and 3.3 (Interactive and Job clusters) / Spark 2.x (Job clusters) + * Databricks Runtimes between 9.1 and 11.3 LTS are currently supported +* Can be configured per cluster or for all clusters as a global configuration +* Support **column level lineage** for ABFSS, WASBS, and default metastore hive tables (see [Limitations](./LIMITATIONS.md#column-level-mapping-supported-sources) for more detail) * Once configured, **does not require any code changes to notebooks or jobs** * Can [add new source support through configuration](./docs/extending-source-support.md) @@ -92,26 +98,24 @@ There are two deployment options for this solution accelerator: 1. Once complete, open your Purview workspace and click the "Browse assets" button near the center of the page 1. Click on the "By source type" tab -You should see several items listed under the heading of "Custom source types". There will be a Databricks section and possibly a Purview Custom Connector section under this heading +You should see at least one item listed under the heading of "Azure Databricks". 
In addition there will possibly be a Purview Custom Connector section under the Custom source types heading ![browse_assets.png](./assets/img/readme/browse_assets.png) -1. Click on the "Databricks" section, then click on the "Databricks Notebook" tile which corresponds to the notebook you ran. In the Properties or Related tabs select one of the "Notebook Tasks" which represent a task in a Databricks job. From the "Databricks Notebook Task", you may see the lineage of one or many of the different spark actions in the notebook. This application may have a number of "Databricks Processes" linked under it which represent the data lineage. To see these, see the Properties or Related tabs +1. Click on the "Databricks" section, then click on the link to the Azure Databricks workspace which the sample notebook was ran. Then select the notebook which you ran (for those running Databricks Jobs, you can also select the job and drill into the related tasks) + * After running a Databricks Notebook on an Interactive Cluster, you will see lineage directly in the Notebook asset under the Lineage tab. + * After running a Databricks Job on a Job Cluster, you will see lineage in the Notebook Task asset. To navigate from a Notebook to a Notebook Task select the Properties tab and choose the Notebook Tasks from the Related Assets section. Please note that Databricks Jobs lineage require [additional setup](./deploy-base.md#support-extracting-lineage-from-databricks-jobs) outside of the demo deployment. ![databricks_task_related.png](./assets/img/readme/databricks_task_related.png) -1. From the Related view, click on the processes icon, then click on one of the links representing the associated process objects - -1. Click on the properties tab to view the properties associated with the process. Note that the full Spark Plan is included - - ![spark_plan.png](./assets/img/readme/spark_plan.png) - 1. Click to the lineage view to see the lineage graph ![lineage_view.png](./assets/img/readme/lineage_view.png) **Note**: If you are viewing the Databricks Process shortly after it was created, sometimes the lineage tab takes some time to display. If you do not see the lineage tab, wait a few minutes and then refresh the browser. + **Lineage Note**: The screenshot above shows lineage to an Azure Data Lake Gen 2 folder, you must have scanned your Data Lake prior to running a notebook for it to be able to match to a Microsoft Purview built-in type like folders or resource sets. + ## Troubleshooting **When filing a new issue, [please include associated log message(s) from Azure Functions](./TROUBLESHOOTING.md#debug-logs).** This will allow the core team to debug within our test environment to validate the issue and develop a solution. diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index 1a9fd31..b7170a1 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -48,6 +48,57 @@ In this case, use the databricks CLI to upload the jar to the expected location to avoid changes in the file name. +* ### Internal Error Resolving Secrets + + For the demo deployment, if your cluster fails and returns the error "Internal Error resolving secrets" and "Failed to fetch secrets referred to in Spark Conf", the deployment script may have failed to add an Access Policy to the Azure Key Vault or the secret scope was not created. + + **Solution**: Update the values in the below script and execute it in the cloud shell. This script deletes the demo deployment's secret scope and then recreates it. 
After executing the script, you should see an access policy for "AzureDatabricks" in your Azure Key Vault. + + ```bash + adb_ws_url=adb-DATABRICKS_WORKSPACE.ID.azuredatabricks.net + global_adb_token=$(az account get-access-token --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d -o tsv --query '[accessToken]') + adb_ws_id=/subscriptions/SUBSCRIPTION_ID/resourceGroups/RESOURCE_GROUP_NAME/providers/Microsoft.Databricks/workspaces/DATABRICKS_WORKSPACE_NAME + subscription_id=123acb-456-def + akv_name=AKV_NAME + akv_resource_id=/subscriptions/SUBSCRIPTION_ID/resourceGroups/RESOURCE_GROUP_NAME/providers/Microsoft.KeyVault/vaults/AKV_NAME + + # Remove the Secret Scope if it exists + cat << EOF > delete-scope.json + { + "scope": "purview-to-adb-kv" + } + EOF + + curl \ + -X POST https://$adb_ws_url/api/2.0/secrets/scopes/delete \ + -H "Authorization: Bearer $global_adb_token" \ + -H "X-Databricks-Azure-Workspace-Resource-Id: $adb_ws_id" \ + --data @delete-scope.json + + # If the above fails, that's okay + # Ultimately, we just need a clean slate + + cat << EOF > create-scope.json + { + "scope": "purview-to-adb-kv", + "scope_backend_type": "AZURE_KEYVAULT", + "backend_azure_keyvault": + { + "resource_id": "$akv_resource_id", + "dns_name": "https://$akv_name.vault.azure.net/" + }, + "initial_manage_principal": "users" + } + EOF + + + curl \ + -X POST https://$adb_ws_url/api/2.0/secrets/scopes/create \ + -H "Authorization: Bearer $global_adb_token" \ + -H "X-Databricks-Azure-Workspace-Resource-Id: $adb_ws_id" \ + --data @create-scope.json + ``` + ## I don't see lineage in Microsoft Purview * ### Try Refreshing the Page @@ -89,10 +140,10 @@ When reviewing the Driver logs, you see an error in the Log4j output that indica * Confirm that `spark.openlineage.version` is set correctly. |SA Release|OpenLineage Jar|spark.openlineage.version| - |----|----|----| - |1.0.0|0.8.2|1 - |1.1.0|0.8.2|1 - |2.0.0|0.11.0|v1 + |-----|----|----| + |1.0.x|0.8.2|1| + |1.1.x|0.8.2|1| + |2.x.x or newer|0.11.0 or newer|v1| ## PurviewOut Logs: Error Loading to Purview: 403 Forbidden diff --git a/assets/img/readme/browse_assets.png b/assets/img/readme/browse_assets.png index f65459c..fac9359 100644 Binary files a/assets/img/readme/browse_assets.png and b/assets/img/readme/browse_assets.png differ diff --git a/assets/img/readme/databricks_task_related.png b/assets/img/readme/databricks_task_related.png index c45ef73..6368d9f 100644 Binary files a/assets/img/readme/databricks_task_related.png and b/assets/img/readme/databricks_task_related.png differ diff --git a/assets/img/readme/lineage.png b/assets/img/readme/lineage.png index 7a2271c..3ab196b 100644 Binary files a/assets/img/readme/lineage.png and b/assets/img/readme/lineage.png differ diff --git a/assets/img/readme/lineage_view.png b/assets/img/readme/lineage_view.png index 61797e8..b5fb204 100644 Binary files a/assets/img/readme/lineage_view.png and b/assets/img/readme/lineage_view.png differ diff --git a/deploy-base.md b/deploy-base.md index 6cde828..1fa5b5c 100644 --- a/deploy-base.md +++ b/deploy-base.md @@ -117,6 +117,8 @@ From the [Azure Portal](https://portal.azure.com) echo $purview_type_resp_custom_type ``` + + If you need a Powershell alternative, see the [docs](./docs/powershell-alternatives.md#upload-custom-types). ## Download the OpenLineage Spark agent and configure with your Azure Databricks clusters @@ -134,7 +136,7 @@ You will need the default API / Host key configured on your Function app. 
To ret ### Install OpenLineage on Your Databricks Cluster Follow the instructions below and refer to the [OpenLineage Databricks Install Instructions](https://github.com/OpenLineage/OpenLineage/tree/main/integration/spark/databricks#databricks-install-instructions) to enable OpenLineage in Databricks. -1. Download the [OpenLineage-Spark 0.13.0 jar](https://repo1.maven.org/maven2/io/openlineage/openlineage-spark/0.13.0/openlineage-spark-0.13.0.jar) from Maven Central +1. Download the [OpenLineage-Spark 0.18.0 jar](https://repo1.maven.org/maven2/io/openlineage/openlineage-spark/0.18.0/openlineage-spark-0.18.0.jar) from Maven Central 2. Create an init-script named `open-lineage-init-script.sh` ```text diff --git a/deploy-demo.md b/deploy-demo.md index a0c69aa..ef0435f 100644 --- a/deploy-demo.md +++ b/deploy-demo.md @@ -120,3 +120,7 @@ purview_type_resp_custom_type=$(curl -s -X POST $purview_endpoint/catalog/api/at echo $purview_type_resp_custom_type ``` + +If you need a Powershell alternative, see the [docs](./docs/powershell-alternatives.md#upload-custom-types). + +You should now be able to run your demo notebook and receive lineage. diff --git a/deployment/infra/OlToPurviewMappings.json b/deployment/infra/OlToPurviewMappings.json index ad30b9e..23c3044 100644 --- a/deployment/infra/OlToPurviewMappings.json +++ b/deployment/infra/OlToPurviewMappings.json @@ -1,5 +1,23 @@ { "olToPurviewMappings": [ + { + "name": "wasbsNLayerMnt", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "wasbs" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "2" + } + ], + "qualifiedName": "https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameSpaceBodyJoinedBySlashFrom[2]}/{nameGroups[0]}", + "purviewDataType": "azure_blob_path", + "purviewPrefix": "https" + }, { "name": "wasbs", "parserConditions": [ @@ -13,6 +31,24 @@ "purviewDataType": "azure_blob_path", "purviewPrefix": "https" }, + { + "name": "wasbNLayerMnt", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "wasb" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "2" + } + ], + "qualifiedName": "https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameSpaceBodyJoinedBySlashFrom[2]}/{nameGroups[0]}", + "purviewDataType": "azure_blob_path", + "purviewPrefix": "https" + }, { "name": "wasb", "parserConditions": [ @@ -26,6 +62,34 @@ "purviewDataType": "azure_blob_path", "purviewPrefix": "https" }, + { + "name": "abfsBlobRootFSNLayerMnt", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "abfs" + }, + { + "op1": "nameSpcBodyParts[1]", + "compare": "contains", + "op2": "blob" + }, + { + "op1": "nameGroups[0]", + "compare": "=", + "op2": "" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "2" + } + ], + "qualifiedName": "https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpaceBodyJoinedBySlashFrom[2]}/{nameGroups[0]}", + "purviewDataType": "azure_datalake_gen2_filesystem", + "purviewPrefix": "https" + }, { "name": "abfsBlobRootFS", "parserConditions": [ @@ -49,6 +113,29 @@ "purviewDataType": "azure_datalake_gen2_filesystem", "purviewPrefix": "https" }, + { + "name": "abfsRootFSNLayerMnt", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "abfs" + }, + { + "op1": "nameGroups[0]", + "compare": "=", + "op2": "" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "2" + } + ], + "qualifiedName": 
"https://{nameSpcBodyParts[1]}/{nameSpaceBodyJoinedBySlashFrom[2]}/{nameGroups[0]}", + "purviewDataType": "azure_datalake_gen2_filesystem", + "purviewPrefix": "https" + }, { "name": "abfsRootFS", "parserConditions": [ @@ -67,6 +154,34 @@ "purviewDataType": "azure_datalake_gen2_filesystem", "purviewPrefix": "https" }, + { + "name": "abfssBlobRootFSNLayerMnt", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "abfss" + }, + { + "op1": "nameSpcBodyParts[1]", + "compare": "contains", + "op2": "blob" + }, + { + "op1": "nameGroups[0]", + "compare": "=", + "op2": "" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "2" + } + ], + "qualifiedName": "https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpaceBodyJoinedBySlashFrom[2]}/{nameGroups[0]}", + "purviewDataType": "azure_datalake_gen2_filesystem", + "purviewPrefix": "https" + }, { "name": "abfssBlobRootFS", "parserConditions": [ @@ -90,6 +205,29 @@ "purviewDataType": "azure_datalake_gen2_filesystem", "purviewPrefix": "https" }, + { + "name": "abfssRootFSNLayerMnt", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "abfss" + }, + { + "op1": "nameGroups[0]", + "compare": "=", + "op2": "" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "2" + } + ], + "qualifiedName": "https://{nameSpcBodyParts[1]}/{nameSpaceBodyJoinedBySlashFrom[2]}/{nameGroups[0]}", + "purviewDataType": "azure_datalake_gen2_filesystem", + "purviewPrefix": "https" + }, { "name": "abfssRootFS", "parserConditions": [ @@ -108,6 +246,30 @@ "purviewDataType": "azure_datalake_gen2_filesystem", "purviewPrefix": "https" }, + { + "name": "abfsBlobNLayerMnt", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "abfs" + }, + { + "op1": "nameSpcBodyParts[1]", + "compare": "contains", + "op2": "blob" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "2" + } + + ], + "qualifiedName": "https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpaceBodyJoinedBySlashFrom[2]}/{nameGroups[0]}", + "purviewDataType": "azure_datalake_gen2_path", + "purviewPrefix": "https" + }, { "name": "abfsBlob", "parserConditions": [ @@ -126,6 +288,25 @@ "purviewDataType": "azure_datalake_gen2_path", "purviewPrefix": "https" }, + { + "name": "abfsNLayerMnt", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "abfs" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "2" + } + + ], + "qualifiedName": "https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameSpaceBodyJoinedBySlashFrom[2]}/{nameGroups[0]}", + "purviewDataType": "azure_datalake_gen2_path", + "purviewPrefix": "https" + }, { "name": "abfs", "parserConditions": [ @@ -139,6 +320,29 @@ "purviewDataType": "azure_datalake_gen2_path", "purviewPrefix": "https" }, + { + "name": "abfssBlobNLayerMnt", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "abfss" + }, + { + "op1": "nameSpcBodyParts[1]", + "compare": "contains", + "op2": "blob" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "2" + } + ], + "qualifiedName": "https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpaceBodyJoinedBySlashFrom[2]}/{nameGroups[0]}", + "purviewDataType": "azure_datalake_gen2_path", + "purviewPrefix": "https" + }, { "name": "abfssBlob", "parserConditions": [ @@ -157,6 +361,25 @@ "purviewDataType": "azure_datalake_gen2_path", "purviewPrefix": "https" }, 
+ { + "name": "abfssNLayerMnt", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "abfss" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "2" + } + + ], + "qualifiedName": "https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameSpaceBodyJoinedBySlashFrom[2]}/{nameGroups[0]}", + "purviewDataType": "azure_datalake_gen2_path", + "purviewPrefix": "https" + }, { "name": "abfss", "parserConditions": [ @@ -395,6 +618,32 @@ "qualifiedName": "mysql://{nameSpcBodyParts[0]}/{nameSpcBodyParts[2]}/{nameGroups[0]}", "purviewDataType": "azure_mysql_table", "purviewPrefix": "mysql" + }, + { + "name": "kusto", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "azurekusto" + } + ], + "qualifiedName": "https://{nameSpcBodyParts[0]}/{nameSpcBodyParts[1]}/{nameGroups[0]}", + "purviewDataType": "azure_data_explorer_table", + "purviewPrefix": "https" + }, + { + "name": "azureCosmos", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "azurecosmos" + } + ], + "qualifiedName": "https://{nameSpcBodyParts[0]}/{nameSpcBodyParts[1]}/{nameSpcBodyParts[2]}/{nameGroups[0]}", + "purviewDataType": "azure_cosmosdb_sqlapi_collection", + "purviewPrefix": "https" } ] } \ No newline at end of file diff --git a/deployment/infra/newdeploymenttemp.json b/deployment/infra/newdeploymenttemp.json index 074d98e..cfe7d85 100644 --- a/deployment/infra/newdeploymenttemp.json +++ b/deployment/infra/newdeploymenttemp.json @@ -1,504 +1,522 @@ -{ - "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "prefixName": { - "type": "string" - }, - "clientid": { - "type": "string" - }, - "clientsecret": { - "type": "securestring" - }, - "purviewName": { - "type": "string", - "defaultValue": "[concat(resourceGroup().name,'-openlineage-purview')]", - "metadata": { - "description": "User name for the Virtual Machine." - } - }, - "resourceTagValues": { - "type": "object" - }, - "functionSku": { - "type": "string", - "defaultValue": "Dynamic", - "metadata": { - "description": "The tier for the Azure Function." 
- }, - "allowedValues": [ - "Dynamic", - "EP1", - "EP2", - "EP3" - ] - } - }, - "variables": { - "paramName": "[parameters('prefixName')]", - "rgId": "[resourceGroup().id]", - "uniqueName": "[substring(uniqueString(variables('rgId')),0,4)]", - "functionAppName": "[replace(replace(toLower(concat(concat('functionapp',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", - "hostingPlanName": "[replace(replace(toLower(concat(concat('functionapphostplan',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", - "functionName": "OpenLineageIn", - "applicationInsightsName": "[replace(replace(toLower(concat(concat('appinsight',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", - "storageAccountName": "[replace(replace(toLower(concat(concat('storage',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", - "functionStorageAccountName": "[replace(replace(toLower(concat(concat('function','storage'),variables('uniqueName'))),'-',''),'_','')]", - "clientidkey": "clientIdKey", - "clientsecretkey": "clientSecretKey", - "storageAccountAccessKey": "storageAccessKey", - "functionStorageAccessKey": "functionStorageAccessKey", - "functionStorageAccountAccessKey": "functionStorageAccountKey", - "functionWorkerRuntime": "dotnet-isolated", - "openlineageEventHubNameSpaceName": "[replace(replace(toLower(concat(concat('eventhubns',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", - "openlineageNameEventHubName": "[replace(replace(toLower(concat(concat('eventhub',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", - "openlineageNameEventHubConsumerGroup": "[replace(replace(toLower(concat(concat('consumergroup',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", - "openlineageKeyVaultName": "[replace(replace(toLower(concat(concat('keyvaut',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", - "purviewAccountName": "[parameters('purviewName')]", - "eventHubSku": "Standard", - "captureEnabled": true, - "captureEncodingFormat": "Avro", - "captureTime": 60, - "captureSize": 314572800, - "EventHubConnectionSecretNameSend": "ehsecretSend", - "EventHubConnectionSecretNameListen": "ehsecretListen", - "functionStorageSecret": "functionStorageSecret", - "storageAccountSecret": "storageAccountSecret", - "OLOutputAPIKeySecretName": "Ol-Output-Api-Key", - "containerName": "eventhubdata", - "functionSkuDynamic":{"name": "Y1","tier": "Dynamic"}, - "functionSkuElasticPremium":{"tier": "ElasticPremium","name": "[parameters('functionSku')]","family": "EP"} - }, - "resources": [ - { - "type": "Microsoft.Storage/storageAccounts", - "apiVersion": "2019-06-01", - "name": "[variables('storageAccountName')]", - "location": "[resourceGroup().location]", - "sku": { - "name": "Standard_LRS" - }, - "kind": "Storage", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "allowBlobPublicAccess": "False", - "supportsHttpsTrafficOnly": "True" - } - }, - { - "type": "Microsoft.Storage/storageAccounts", - "apiVersion": "2021-06-01", - "name": "[variables('functionStorageAccountName')]", - "location": "[resourceGroup().location]", - "sku": { - "name": "Standard_LRS" - }, - "kind": "Storage", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "allowBlobPublicAccess": "False" - } - }, - { - "type": "Microsoft.Storage/storageAccounts/blobServices/containers", - "apiVersion": "2021-06-01", - "name": "[format('{0}/default/{1}', variables('functionStorageAccountName'), variables('containerName'))]", 
- "dependsOn": [ - "[resourceId('Microsoft.Storage/storageAccounts', variables('functionStorageAccountName'))]" - ] - }, - { - "type": "Microsoft.Web/serverfarms", - "apiVersion": "2020-06-01", - "name": "[variables('hostingPlanName')]", - "location": "[resourceGroup().location]", - "sku": "[if(equals(parameters('functionSku'), 'Dynamic'), variables('functionSkuDynamic'), variables('functionSkuElasticPremium'))]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "name": "[variables('hostingPlanName')]", - "computeMode": "Dynamic" - } - }, - { - "type": "Microsoft.Web/sites", - "apiVersion": "2020-06-01", - "name": "[variables('functionAppName')]", - "location": "[resourceGroup().location]", - "identity": { - "type": "SystemAssigned" - }, - "kind": "functionapp", - "tags": "[parameters('resourceTagValues')]", - "dependsOn": [ - "[resourceId('Microsoft.Web/serverfarms', variables('hostingPlanName'))]", - "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName'))]", - "[resourceId('Microsoft.Insights/components', variables('applicationInsightsName'))]" - ], - "properties": { - "serverFarmId": "[resourceId('Microsoft.Web/serverfarms', variables('hostingPlanName'))]", - "httpsOnly": true, - "siteConfig": { - "appSettings": [ - { - "name": "AzureWebJobsStorage", - "value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccountName'), ';EndpointSuffix=', environment().suffixes.storage, ';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName')), '2019-06-01').keys[0].value)]" - }, - { - "name": "FunctionStorage", - "value": "[concat('@Microsoft.KeyVault(VaultName=', variables('openlineageKeyVaultName'),';SecretName=',variables('functionStorageSecret'),')')]" - }, - { - "name": "WEBSITE_CONTENTAZUREFILECONNECTIONSTRING", - "value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccountName'), ';EndpointSuffix=', environment().suffixes.storage, ';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName')), '2019-06-01').keys[0].value)]" - }, - { - "name": "WEBSITE_CONTENTSHARE", - "value": "[toLower(variables('functionAppName'))]" - }, - { - "name": "FUNCTIONS_EXTENSION_VERSION", - "value": "~4" - }, - { - "name": "WEBSITE_NODE_DEFAULT_VERSION", - "value": "~10" - }, - { - "name": "APPINSIGHTS_INSTRUMENTATIONKEY", - "value": "[reference(resourceId('microsoft.insights/components', variables('applicationInsightsName')), '2020-02-02-preview').InstrumentationKey]" - }, - { - "name": "FUNCTIONS_WORKER_RUNTIME", - "value": "[variables('functionWorkerRuntime')]" - }, - { - "name": "EventHubName", - "value": "[variables('openlineageNameEventHubName')]" - }, - { - "name": "ListenToMessagesFromEventHub", - "value": "[concat('@Microsoft.KeyVault(VaultName=', variables('openlineageKeyVaultName'),';SecretName=',variables('EventHubConnectionSecretNameListen'),')')]" - }, - { - "name": "SendMessagesToEventHub", - "value": "[concat('@Microsoft.KeyVault(VaultName=', variables('openlineageKeyVaultName'),';SecretName=',variables('EventHubConnectionSecretNameSend'),')')]" - }, - { - "name": "EventHubConsumerGroup", - "value": "read" - }, - { - "name": "OlToPurviewMappings", - "value": 
"{\"olToPurviewMappings\":[{\"name\":\"wasbs\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"wasbs\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_blob_path\",\"purviewPrefix\":\"https\"},{\"name\":\"wasb\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"wasb\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_blob_path\",\"purviewPrefix\":\"https\"},{\"name\":\"abfsBlobRootFS\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfs\"},{\"op1\":\"nameSpcBodyParts[1]\",\"compare\":\"contains\",\"op2\":\"blob\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"=\",\"op2\":\"\"}],\"qualifiedName\":\"https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_filesystem\",\"purviewPrefix\":\"https\"},{\"name\":\"abfsRootFS\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfs\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"=\",\"op2\":\"\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_filesystem\",\"purviewPrefix\":\"https\"},{\"name\":\"abfssBlobRootFS\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfss\"},{\"op1\":\"nameSpcBodyParts[1]\",\"compare\":\"contains\",\"op2\":\"blob\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"=\",\"op2\":\"\"}],\"qualifiedName\":\"https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_filesystem\",\"purviewPrefix\":\"https\"},{\"name\":\"abfssRootFS\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfss\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"=\",\"op2\":\"\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_filesystem\",\"purviewPrefix\":\"https\"},{\"name\":\"abfsBlob\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfs\"},{\"op1\":\"nameSpcBodyParts[1]\",\"compare\":\"contains\",\"op2\":\"blob\"}],\"qualifiedName\":\"https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_path\",\"purviewPrefix\":\"https\"},{\"name\":\"abfs\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfs\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_path\",\"purviewPrefix\":\"https\"},{\"name\":\"abfssBlob\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfss\"},{\"op1\":\"nameSpcBodyParts[1]\",\"compare\":\"contains\",\"op2\":\"blob\"}],\"qualifiedName\":\"https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_path\",\"purviewPrefix\":\"https\"},{\"name\":\"abfss\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfss\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_path\",\"purviewPrefix\":\"https\"},{\"name\
":\"synapseSqlNonDbo\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"sqlserver\"},{\"op1\":\"nameSpcBodyParts[0]\",\"compare\":\"contains\",\"op2\":\"azuresynapse\"},{\"op1\":\"nameGroups[0].parts\",\"compare\":\">\",\"op2\":\"1\"}],\"qualifiedName\":\"mssql://{nameSpcBodyParts[0]}/{nameSpcNameVals['database']}/{nameGroups[0].parts[0]}/{nameGroups[0].parts[1]}\",\"purviewDataType\":\"azure_synapse_dedicated_sql_table\",\"purviewPrefix\":\"mssql\"},{\"name\":\"synapseSql\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"sqlserver\"},{\"op1\":\"nameSpcBodyParts[0]\",\"compare\":\"contains\",\"op2\":\"azuresynapse\"}],\"qualifiedName\":\"mssql://{nameSpcBodyParts[0]}/{nameSpcNameVals['database']}/dbo/{nameGroups[0].parts[0]}\",\"purviewDataType\":\"azure_synapse_dedicated_sql_table\",\"purviewPrefix\":\"mssql\"},{\"name\":\"azureSQLNonDbo\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"sqlserver\"},{\"op1\":\"nameGroups\",\"compare\":\">\",\"op2\":\"1\"}],\"qualifiedName\":\"mssql://{nameSpcBodyParts[0]}/{nameSpcNameVals['database']}/{nameGroups[0]}/{nameGroups[1]}\",\"purviewDataType\":\"azure_sql_table\",\"purviewPrefix\":\"mssql\"},{\"name\":\"azureSQLNonDboNoDotsInNames\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"sqlserver\"},{\"op1\":\"nameGroups[0].parts\",\"compare\":\">\",\"op2\":\"1\"}],\"qualifiedName\":\"mssql://{nameSpcBodyParts[0]}/{nameSpcNameVals['database']}/{nameGroups[0].parts[0]}/{nameGroups[0].parts[1]}\",\"purviewDataType\":\"azure_sql_table\",\"purviewPrefix\":\"mssql\"},{\"name\":\"azureSQL\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"sqlserver\"}],\"qualifiedName\":\"mssql://{nameSpcBodyParts[0]}/{nameSpcNameVals['database']}/dbo/{nameGroups[0]}\",\"purviewDataType\":\"azure_sql_table\",\"purviewPrefix\":\"mssql\"},{\"name\":\"azurePostgresNonPublic\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"postgresql\"},{\"op1\":\"nameGroups[0].parts\",\"compare\":\">\",\"op2\":\"1\"},{\"op1\":\"nameSpcConParts\",\"compare\":\">\",\"op2\":\"4\"},{\"op1\":\"nameSpcConParts[3]\",\"compare\":\"=\",\"op2\":\"azure\"}],\"qualifiedName\":\"postgresql://{nameSpcBodyParts[0]}/{nameSpcBodyParts[2]}/{nameGroups[0].parts[0]}/{nameGroups[0].parts[1]}\",\"purviewDataType\":\"azure_postgresql_table\",\"purviewPrefix\":\"postgresql\"},{\"name\":\"azurePostgres\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"postgresql\"},{\"op1\":\"nameSpcConParts\",\"compare\":\">\",\"op2\":\"4\"},{\"op1\":\"nameSpcConParts[3]\",\"compare\":\"=\",\"op2\":\"azure\"}],\"qualifiedName\":\"postgresql://{nameSpcBodyParts[0]}/{nameSpcBodyParts[2]}/public/{nameGroups[0]}\",\"purviewDataType\":\"azure_postgresql_table\",\"purviewPrefix\":\"postgresql\"},{\"name\":\"postgresNonPublic\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"postgresql\"},{\"op1\":\"nameGroups[0].parts\",\"compare\":\">\",\"op2\":\"1\"}],\"qualifiedName\":\"postgresql://servers/{nameSpcBodyParts[0]}:{nameSpcBodyParts[1]}/dbs/{nameSpcBodyParts[2]}/schemas/{nameGroups[0].parts[0]}/tables/{nameGroups[0].parts[1]}\",\"purviewDataType\":\"postgresql_table\",\"purviewPrefix\":\"postgresql\"},{\"name\":\"postgres\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"postgresql\"}],\"qualifiedName\":\"postgresql://servers/{nameSpcBodyParts[0]}:{nameSpcBodyParts[1]}/dbs/{nameSpcBodyParts[2]}/schemas/public/tables/{nameGroups[0]}\",\"purviewData
Type\":\"postgresql_table\",\"purviewPrefix\":\"postgresql\"},{\"name\":\"hiveManagedTableNotDefault\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"dbfs\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"contains\",\"op2\":\"hive/warehouse\"},{\"op1\":\"nameGroups[0].parts\",\"compare\":\">\",\"op2\":\"4\"}],\"qualifiedName\":\"{nameGroups[0].parts[3]}.{nameGroups[0].parts[5]}@{AdbWorkspaceUrl}\",\"purviewDataType\":\"hive_table\",\"purviewPrefix\":\"hive\"},{\"name\":\"hiveManagedTableDefault\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"dbfs\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"contains\",\"op2\":\"hive/warehouse\"}],\"qualifiedName\":\"default.{nameGroups[0].parts[3]}@{AdbWorkspaceUrl}\",\"purviewDataType\":\"hive_table\",\"purviewPrefix\":\"hive\"},{\"name\":\"azureMySql\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"mysql\"}],\"qualifiedName\":\"mysql://{nameSpcBodyParts[0]}/{nameSpcBodyParts[2]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_mysql_table\",\"purviewPrefix\":\"mysql\"}]}" - }, - { - "name": "PurviewAccountName", - "value": "[variables('purviewAccountName')]" - }, - { - "name": "ClientID", - "value": "[concat('@Microsoft.KeyVault(VaultName=', variables('openlineageKeyVaultName'),';SecretName=',variables('clientidkey'),')')]" - }, - { - "name": "ClientSecret", - "value": "[concat('@Microsoft.KeyVault(VaultName=', variables('openlineageKeyVaultName'),';SecretName=',variables('clientsecretkey'),')')]" - }, - { - "name": "TenantId", - "value": "[subscription().tenantId]" - } - ] - } - }, - "resources": [ - { - "name": "MSDeploy", - "type": "extensions", - "location": "[resourceGroup().location]", - "apiVersion": "2020-06-01", - "dependsOn": [ - "[concat('Microsoft.Web/sites/', variables('functionAppName'))]" - ], - "properties": { - "packageUri": "http://aka.ms/APFunctions2-2" - } - } - ] - }, - { - "type": "microsoft.insights/components", - "apiVersion": "2020-02-02-preview", - "name": "[variables('applicationInsightsName')]", - "location": "[resourceGroup().location]", - "tags": { - "[concat('hidden-link:', resourceId('Microsoft.Web/sites', variables('applicationInsightsName')))]": "Resource" - }, - "properties": { - "ApplicationId": "[variables('applicationInsightsName')]", - "Request_Source": "IbizaWebAppExtensionCreate" - } - }, - { - "type": "Microsoft.EventHub/namespaces", - "apiVersion": "2018-01-01-preview", - "name": "[variables('openlineageEventHubNameSpaceName')]", - "location": "[resourceGroup().location]", - "sku": { - "name": "[variables('eventHubSku')]", - "tier": "[variables('eventHubSku')]", - "capacity": 1 - }, - "tags": "[parameters('resourceTagValues')]", - "properties": { - "isAutoInflateEnabled": false, - "maximumThroughputUnits": 0 - } - }, - { - "type": "Microsoft.EventHub/namespaces/eventhubs", - "apiVersion": "2017-04-01", - "name": "[concat(variables('openlineageEventHubNameSpaceName'), '/', variables('openlineageNameEventHubName'))]", - "location": "[resourceGroup().location]", - "tags": "[parameters('resourceTagValues')]", - "dependsOn": [ - "[resourceId('Microsoft.EventHub/namespaces', variables('openlineageEventHubNameSpaceName'))]" - ], - "properties": { - "messageRetentionInDays": 1, - "partitionCount": 1, - "captureDescription": { - "enabled": "[variables('captureEnabled')]", - "skipEmptyArchives": false, - "encoding": "[variables('captureEncodingFormat')]", - "intervalInSeconds": "[variables('captureTime')]", - "sizeLimitInBytes": "[variables('captureSize')]", - 
"destination": { - "name": "EventHubArchive.AzureBlockBlob", - "properties": { - "archiveNameFormat": "{Namespace}/{EventHub}/{PartitionId}/{Year}/{Month}/{Day}/{Hour}/{Minute}/{Second}", - "blobContainer": "eventhubdata", - "storageAccountResourceId": "[resourceId('Microsoft.Storage/storageAccounts', variables('functionStorageAccountName'))]" - } - } - } - }, - "resources": [ - { - "apiVersion": "2017-04-01", - "name": "read", - "type": "consumergroups", - "dependsOn": [ - "[variables('openlineageNameEventHubName')]" - ], - "properties": {} - } - ] - }, - { - "type": "Microsoft.KeyVault/vaults", - "name": "[variables('openlineageKeyVaultName')]", - "apiVersion": "2019-09-01", - "location": "[resourceGroup().location]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "sku": { - "family": "A", - "name": "Standard" - }, - "tenantId": "[subscription().tenantId]", - "accessPolicies": [ - { - "tenantId": "[subscription().tenantid]", - "objectId": "[reference(resourceId('Microsoft.Web/sites', variables('functionAppName')),'2020-06-01', 'full').identity.principalId]", - "permissions": { - "keys": [], - "secrets": [ - "get" - ], - "certificates": [] - } - } - ], - "enableSoftDelete": false, - "enabledForDeployment": false, - "enabledForDiskEncryption": false, - "enabledForTemplateDeployment": false - }, - "dependsOn": [ - "[resourceId('Microsoft.Web/sites', variables('functionAppName'))]" - ] - }, - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2019-09-01", - "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'), variables('EventHubConnectionSecretNameSend'))]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "value": "[listkeys(resourceId('Microsoft.Eventhub/namespaces/authorizationRules',variables('openlineageEventHubNameSpaceName'), 'SendMessages'),'2017-04-01').primaryConnectionString]" - }, - "dependsOn": [ - "[resourceId('Microsoft.EventHub/namespaces', variables('openlineageEventHubNameSpaceName'))]", - "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" - ] - }, - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2019-09-01", - "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'), variables('EventHubConnectionSecretNameListen'))]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "value": "[listkeys(resourceId('Microsoft.Eventhub/namespaces/authorizationRules',variables('openlineageEventHubNameSpaceName'), 'ListenMessages'),'2017-04-01').primaryConnectionString]" - }, - "dependsOn": [ - "[resourceId('Microsoft.EventHub/namespaces', variables('openlineageEventHubNameSpaceName'))]", - "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" - ] - }, - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2019-09-01", - "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('storageAccountName'))]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "value": "[concat('DefaultEndpointsProtocol=https;AccountName=',variables('storageAccountName'),';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName')), '2019-06-01').keys[0].value,';EndpointSuffix=','core.windows.net')]" - }, - "dependsOn": [ - "[concat('Microsoft.Storage/storageAccounts/', variables('storageAccountName'))]", - "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" - ] - }, - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": 
"2019-09-01", - "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('functionStorageSecret'))]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "value": "[concat('DefaultEndpointsProtocol=https;AccountName=',variables('functionStorageAccountName'),';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts',variables('functionStorageAccountName')),'2019-06-01').keys[0].value,';EndpointSuffix=','core.windows.net')]" - }, - "dependsOn": [ - "[concat('Microsoft.Storage/storageAccounts/', variables('functionStorageAccountName'))]", - "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" - ] - }, - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2019-09-01", - "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('OLOutputAPIKeySecretName'))]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "value": "[listKeys(concat(resourceId('Microsoft.Web/sites', variables('functionAppName')), '/host/default'), '2016-08-01').functionKeys.default]" - }, - "dependsOn": [ - "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" - ] - }, - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2019-09-01", - "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('storageAccountAccessKey'))]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "value": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName')), '2019-04-01').keys[0].value]" - }, - "dependsOn": [ - "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" - ] - }, - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2019-09-01", - "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('functionStorageAccessKey'))]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "value": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('functionStorageAccountName')), '2019-04-01').keys[0].value]" - }, - "dependsOn": [ - "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" - ] - }, - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2019-09-01", - "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('clientidkey'))]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "value": "[parameters('clientid')]" - }, - "dependsOn": [ - "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" - ] - }, - { - "type": "Microsoft.KeyVault/vaults/secrets", - "apiVersion": "2019-09-01", - "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('clientsecretkey'))]", - "tags": "[parameters('resourceTagValues')]", - "properties": { - "value": "[parameters('clientsecret')]" - }, - "dependsOn": [ - "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" - ] - }, - { - "type": "Microsoft.EventHub/namespaces/AuthorizationRules", - "apiVersion": "2021-11-01", - "name": "[concat(variables('openlineageEventHubNameSpaceName'), '/ListenMessages')]", - "tags": "[parameters('resourceTagValues')]", - "dependsOn": [ - "[resourceId('Microsoft.EventHub/namespaces', variables('openlineageEventHubNameSpaceName'))]" - ], - "properties": { - "rights": [ - "Listen" - ] - } - }, - { - "type": "Microsoft.EventHub/namespaces/AuthorizationRules", - "apiVersion": "2021-11-01", - "name": 
"[concat(variables('openlineageEventHubNameSpaceName'), '/SendMessages')]", - "tags": "[parameters('resourceTagValues')]", - "dependsOn": [ - "[resourceId('Microsoft.EventHub/namespaces', variables('openlineageEventHubNameSpaceName'))]" - ], - "properties": { - "rights": [ - "Send" - ] - } - }, - { - "apiVersion": "2020-06-01", - "name": "pid-1e23d6fb-478f-4b04-bfa3-70db11929652", - "type": "Microsoft.Resources/deployments", - "properties": { - "mode": "Incremental", - "template": { - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "resources": [] - } - } - } - ], - "outputs": { - "functionAppName": { - "type": "string", - "value": "[variables('functionAppName')]" - }, - "kvName": { - "type": "string", - "value": "[variables('openlineageKeyVaultName')]" - }, - "storageAccountName": { - "type": "string", - "value": "[variables('storageAccountName')]" - }, - "resourcegroupLocation": { - "type": "string", - "value": "[resourceGroup().location]" - } - } -} +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "prefixName": { + "type": "string" + }, + "clientid": { + "type": "string" + }, + "clientsecret": { + "type": "securestring" + }, + "purviewName": { + "type": "string", + "defaultValue": "[concat(resourceGroup().name,'-openlineage-purview')]", + "metadata": { + "description": "User name for the Virtual Machine." + } + }, + "resourceTagValues": { + "type": "object" + }, + "functionSku": { + "type": "string", + "defaultValue": "Dynamic", + "metadata": { + "description": "The tier for the Azure Function." + }, + "allowedValues": [ + "Dynamic", + "EP1", + "EP2", + "EP3" + ] + } + }, + "variables": { + "paramName": "[parameters('prefixName')]", + "rgId": "[resourceGroup().id]", + "uniqueName": "[substring(uniqueString(variables('rgId')),0,4)]", + "functionAppName": "[replace(replace(toLower(concat(concat('functionapp',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", + "hostingPlanName": "[replace(replace(toLower(concat(concat('functionapphostplan',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", + "functionName": "OpenLineageIn", + "applicationInsightsName": "[replace(replace(toLower(concat(concat('appinsight',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", + "storageAccountName": "[replace(replace(toLower(concat(concat('storage',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", + "functionStorageAccountName": "[replace(replace(toLower(concat(concat('function','storage'),variables('uniqueName'))),'-',''),'_','')]", + "clientidkey": "clientIdKey", + "clientsecretkey": "clientSecretKey", + "storageAccountAccessKey": "storageAccessKey", + "functionStorageAccessKey": "functionStorageAccessKey", + "functionStorageAccountAccessKey": "functionStorageAccountKey", + "functionWorkerRuntime": "dotnet-isolated", + "openlineageEventHubNameSpaceName": "[replace(replace(toLower(concat(concat('eventhubns',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", + "openlineageNameEventHubName": "[replace(replace(toLower(concat(concat('eventhub',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", + "openlineageNameEventHubConsumerGroup": "[replace(replace(toLower(concat(concat('consumergroup',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", + "openlineageKeyVaultName": 
"[replace(replace(toLower(concat(concat('keyvaut',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]", + "purviewAccountName": "[parameters('purviewName')]", + "eventHubSku": "Standard", + "captureEnabled": true, + "captureEncodingFormat": "Avro", + "captureTime": 60, + "captureSize": 314572800, + "EventHubConnectionSecretNameSend": "ehsecretSend", + "EventHubConnectionSecretNameListen": "ehsecretListen", + "functionStorageSecret": "functionStorageSecret", + "storageAccountSecret": "storageAccountSecret", + "OLOutputAPIKeySecretName": "Ol-Output-Api-Key", + "containerName": "eventhubdata", + "functionSkuDynamic": { + "name": "Y1", + "tier": "Dynamic" + }, + "functionSkuElasticPremium": { + "tier": "ElasticPremium", + "name": "[parameters('functionSku')]", + "family": "EP" + } + }, + "resources": [ + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2019-06-01", + "name": "[variables('storageAccountName')]", + "location": "[resourceGroup().location]", + "sku": { + "name": "Standard_LRS" + }, + "kind": "Storage", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "allowBlobPublicAccess": "False", + "supportsHttpsTrafficOnly": "True" + } + }, + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2021-06-01", + "name": "[variables('functionStorageAccountName')]", + "location": "[resourceGroup().location]", + "sku": { + "name": "Standard_LRS" + }, + "kind": "Storage", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "allowBlobPublicAccess": "False" + } + }, + { + "type": "Microsoft.Storage/storageAccounts/blobServices/containers", + "apiVersion": "2021-06-01", + "name": "[format('{0}/default/{1}', variables('functionStorageAccountName'), variables('containerName'))]", + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts', variables('functionStorageAccountName'))]" + ] + }, + { + "type": "Microsoft.Web/serverfarms", + "apiVersion": "2020-06-01", + "name": "[variables('hostingPlanName')]", + "location": "[resourceGroup().location]", + "sku": "[if(equals(parameters('functionSku'), 'Dynamic'), variables('functionSkuDynamic'), variables('functionSkuElasticPremium'))]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "name": "[variables('hostingPlanName')]", + "computeMode": "Dynamic" + } + }, + { + "type": "Microsoft.Web/sites", + "apiVersion": "2020-06-01", + "name": "[variables('functionAppName')]", + "location": "[resourceGroup().location]", + "identity": { + "type": "SystemAssigned" + }, + "kind": "functionapp", + "tags": "[parameters('resourceTagValues')]", + "dependsOn": [ + "[resourceId('Microsoft.Web/serverfarms', variables('hostingPlanName'))]", + "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName'))]", + "[resourceId('Microsoft.Insights/components', variables('applicationInsightsName'))]" + ], + "properties": { + "serverFarmId": "[resourceId('Microsoft.Web/serverfarms', variables('hostingPlanName'))]", + "httpsOnly": true, + "siteConfig": {} + }, + "resources": [ + { + "name": "MSDeploy", + "type": "extensions", + "location": "[resourceGroup().location]", + "apiVersion": "2020-06-01", + "dependsOn": [ + "[concat('Microsoft.Web/sites/', variables('functionAppName'))]", + "[concat('Microsoft.Web/sites/', variables('functionAppName'), '/config/web')]" + ], + "properties": { + "packageUri": "http://aka.ms/APFunctions2-3" + } + }, + { + "apiVersion": "2020-06-01", + "type": "config", + "name": "web", + "dependsOn": [ + "[concat('Microsoft.Web/sites/', 
variables('functionAppName'))]" + ], + "properties": { + "appSettings": [ + { + "name": "AzureWebJobsStorage", + "value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccountName'), ';EndpointSuffix=', environment().suffixes.storage, ';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName')), '2019-06-01').keys[0].value)]" + }, + { + "name": "FunctionStorage", + "value": "[concat('@Microsoft.KeyVault(VaultName=', variables('openlineageKeyVaultName'),';SecretName=',variables('functionStorageSecret'),')')]" + }, + { + "name": "WEBSITE_CONTENTAZUREFILECONNECTIONSTRING", + "value": "[concat('DefaultEndpointsProtocol=https;AccountName=', variables('storageAccountName'), ';EndpointSuffix=', environment().suffixes.storage, ';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName')), '2019-06-01').keys[0].value)]" + }, + { + "name": "WEBSITE_CONTENTSHARE", + "value": "[toLower(variables('functionAppName'))]" + }, + { + "name": "FUNCTIONS_EXTENSION_VERSION", + "value": "~4" + }, + { + "name": "WEBSITE_NODE_DEFAULT_VERSION", + "value": "~10" + }, + { + "name": "APPINSIGHTS_INSTRUMENTATIONKEY", + "value": "[reference(resourceId('microsoft.insights/components', variables('applicationInsightsName')), '2020-02-02-preview').InstrumentationKey]" + }, + { + "name": "FUNCTIONS_WORKER_RUNTIME", + "value": "[variables('functionWorkerRuntime')]" + }, + { + "name": "EventHubName", + "value": "[variables('openlineageNameEventHubName')]" + }, + { + "name": "ListenToMessagesFromEventHub", + "value": "[concat('@Microsoft.KeyVault(VaultName=', variables('openlineageKeyVaultName'),';SecretName=',variables('EventHubConnectionSecretNameListen'),')')]" + }, + { + "name": "SendMessagesToEventHub", + "value": "[concat('@Microsoft.KeyVault(VaultName=', variables('openlineageKeyVaultName'),';SecretName=',variables('EventHubConnectionSecretNameSend'),')')]" + }, + { + "name": "EventHubConsumerGroup", + "value": "read" + }, + { + "name": "OlToPurviewMappings", + "value": 
"{\"olToPurviewMappings\":[{\"name\":\"wasbs\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"wasbs\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_blob_path\",\"purviewPrefix\":\"https\"},{\"name\":\"wasb\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"wasb\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_blob_path\",\"purviewPrefix\":\"https\"},{\"name\":\"abfsBlobRootFS\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfs\"},{\"op1\":\"nameSpcBodyParts[1]\",\"compare\":\"contains\",\"op2\":\"blob\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"=\",\"op2\":\"\"}],\"qualifiedName\":\"https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_filesystem\",\"purviewPrefix\":\"https\"},{\"name\":\"abfsRootFS\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfs\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"=\",\"op2\":\"\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_filesystem\",\"purviewPrefix\":\"https\"},{\"name\":\"abfssBlobRootFS\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfss\"},{\"op1\":\"nameSpcBodyParts[1]\",\"compare\":\"contains\",\"op2\":\"blob\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"=\",\"op2\":\"\"}],\"qualifiedName\":\"https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_filesystem\",\"purviewPrefix\":\"https\"},{\"name\":\"abfssRootFS\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfss\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"=\",\"op2\":\"\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_filesystem\",\"purviewPrefix\":\"https\"},{\"name\":\"abfsBlob\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfs\"},{\"op1\":\"nameSpcBodyParts[1]\",\"compare\":\"contains\",\"op2\":\"blob\"}],\"qualifiedName\":\"https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_path\",\"purviewPrefix\":\"https\"},{\"name\":\"abfs\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfs\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_path\",\"purviewPrefix\":\"https\"},{\"name\":\"abfssBlob\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfss\"},{\"op1\":\"nameSpcBodyParts[1]\",\"compare\":\"contains\",\"op2\":\"blob\"}],\"qualifiedName\":\"https://{nameSpcConParts[0]}.dfs.{nameSpcConParts[2]}.{nameSpcConParts[3]}.{nameSpcConParts[4]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_path\",\"purviewPrefix\":\"https\"},{\"name\":\"abfss\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"abfss\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[1]}/{nameSpcBodyParts[0]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_datalake_gen2_path\",\"purviewPrefix\":\"https\"},{\"name\
":\"synapseSqlNonDbo\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"sqlserver\"},{\"op1\":\"nameSpcBodyParts[0]\",\"compare\":\"contains\",\"op2\":\"azuresynapse\"},{\"op1\":\"nameGroups[0].parts\",\"compare\":\">\",\"op2\":\"1\"}],\"qualifiedName\":\"mssql://{nameSpcBodyParts[0]}/{nameSpcNameVals['database']}/{nameGroups[0].parts[0]}/{nameGroups[0].parts[1]}\",\"purviewDataType\":\"azure_synapse_dedicated_sql_table\",\"purviewPrefix\":\"mssql\"},{\"name\":\"synapseSql\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"sqlserver\"},{\"op1\":\"nameSpcBodyParts[0]\",\"compare\":\"contains\",\"op2\":\"azuresynapse\"}],\"qualifiedName\":\"mssql://{nameSpcBodyParts[0]}/{nameSpcNameVals['database']}/dbo/{nameGroups[0].parts[0]}\",\"purviewDataType\":\"azure_synapse_dedicated_sql_table\",\"purviewPrefix\":\"mssql\"},{\"name\":\"azureSQLNonDbo\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"sqlserver\"},{\"op1\":\"nameGroups\",\"compare\":\">\",\"op2\":\"1\"}],\"qualifiedName\":\"mssql://{nameSpcBodyParts[0]}/{nameSpcNameVals['database']}/{nameGroups[0]}/{nameGroups[1]}\",\"purviewDataType\":\"azure_sql_table\",\"purviewPrefix\":\"mssql\"},{\"name\":\"azureSQLNonDboNoDotsInNames\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"sqlserver\"},{\"op1\":\"nameGroups[0].parts\",\"compare\":\">\",\"op2\":\"1\"}],\"qualifiedName\":\"mssql://{nameSpcBodyParts[0]}/{nameSpcNameVals['database']}/{nameGroups[0].parts[0]}/{nameGroups[0].parts[1]}\",\"purviewDataType\":\"azure_sql_table\",\"purviewPrefix\":\"mssql\"},{\"name\":\"azureSQL\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"sqlserver\"}],\"qualifiedName\":\"mssql://{nameSpcBodyParts[0]}/{nameSpcNameVals['database']}/dbo/{nameGroups[0]}\",\"purviewDataType\":\"azure_sql_table\",\"purviewPrefix\":\"mssql\"},{\"name\":\"azurePostgresNonPublic\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"postgresql\"},{\"op1\":\"nameGroups[0].parts\",\"compare\":\">\",\"op2\":\"1\"},{\"op1\":\"nameSpcConParts\",\"compare\":\">\",\"op2\":\"4\"},{\"op1\":\"nameSpcConParts[3]\",\"compare\":\"=\",\"op2\":\"azure\"}],\"qualifiedName\":\"postgresql://{nameSpcBodyParts[0]}/{nameSpcBodyParts[2]}/{nameGroups[0].parts[0]}/{nameGroups[0].parts[1]}\",\"purviewDataType\":\"azure_postgresql_table\",\"purviewPrefix\":\"postgresql\"},{\"name\":\"azurePostgres\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"postgresql\"},{\"op1\":\"nameSpcConParts\",\"compare\":\">\",\"op2\":\"4\"},{\"op1\":\"nameSpcConParts[3]\",\"compare\":\"=\",\"op2\":\"azure\"}],\"qualifiedName\":\"postgresql://{nameSpcBodyParts[0]}/{nameSpcBodyParts[2]}/public/{nameGroups[0]}\",\"purviewDataType\":\"azure_postgresql_table\",\"purviewPrefix\":\"postgresql\"},{\"name\":\"postgresNonPublic\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"postgresql\"},{\"op1\":\"nameGroups[0].parts\",\"compare\":\">\",\"op2\":\"1\"}],\"qualifiedName\":\"postgresql://servers/{nameSpcBodyParts[0]}:{nameSpcBodyParts[1]}/dbs/{nameSpcBodyParts[2]}/schemas/{nameGroups[0].parts[0]}/tables/{nameGroups[0].parts[1]}\",\"purviewDataType\":\"postgresql_table\",\"purviewPrefix\":\"postgresql\"},{\"name\":\"postgres\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"postgresql\"}],\"qualifiedName\":\"postgresql://servers/{nameSpcBodyParts[0]}:{nameSpcBodyParts[1]}/dbs/{nameSpcBodyParts[2]}/schemas/public/tables/{nameGroups[0]}\",\"purviewData
Type\":\"postgresql_table\",\"purviewPrefix\":\"postgresql\"},{\"name\":\"hiveManagedTableNotDefault\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"dbfs\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"contains\",\"op2\":\"hive/warehouse\"},{\"op1\":\"nameGroups[0].parts\",\"compare\":\">\",\"op2\":\"4\"}],\"qualifiedName\":\"{nameGroups[0].parts[3]}.{nameGroups[0].parts[5]}@{AdbWorkspaceUrl}\",\"purviewDataType\":\"hive_table\",\"purviewPrefix\":\"hive\"},{\"name\":\"hiveManagedTableDefault\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"dbfs\"},{\"op1\":\"nameGroups[0]\",\"compare\":\"contains\",\"op2\":\"hive/warehouse\"}],\"qualifiedName\":\"default.{nameGroups[0].parts[3]}@{AdbWorkspaceUrl}\",\"purviewDataType\":\"hive_table\",\"purviewPrefix\":\"hive\"},{\"name\":\"azureMySql\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"mysql\"}],\"qualifiedName\":\"mysql://{nameSpcBodyParts[0]}/{nameSpcBodyParts[2]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_mysql_table\",\"purviewPrefix\":\"mysql\"},{\"name\":\"kusto\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"azurekusto\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[0]}/{nameSpcBodyParts[1]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_data_explorer_table\",\"purviewPrefix\":\"https\"},{\"name\":\"azureCosmos\",\"parserConditions\":[{\"op1\":\"prefix\",\"compare\":\"=\",\"op2\":\"azurecosmos\"}],\"qualifiedName\":\"https://{nameSpcBodyParts[0]}/{nameSpcBodyParts[1]}/{nameSpcBodyParts[2]}/{nameGroups[0]}\",\"purviewDataType\":\"azure_cosmosdb_sqlapi_collection\",\"purviewPrefix\":\"https\"}]}" + }, + { + "name": "PurviewAccountName", + "value": "[variables('purviewAccountName')]" + }, + { + "name": "ClientID", + "value": "[concat('@Microsoft.KeyVault(VaultName=', variables('openlineageKeyVaultName'),';SecretName=',variables('clientidkey'),')')]" + }, + { + "name": "ClientSecret", + "value": "[concat('@Microsoft.KeyVault(VaultName=', variables('openlineageKeyVaultName'),';SecretName=',variables('clientsecretkey'),')')]" + }, + { + "name": "TenantId", + "value": "[subscription().tenantId]" + } + ] + } + } + ] + }, + { + "type": "microsoft.insights/components", + "apiVersion": "2020-02-02-preview", + "name": "[variables('applicationInsightsName')]", + "location": "[resourceGroup().location]", + "tags": { + "[concat('hidden-link:', resourceId('Microsoft.Web/sites', variables('applicationInsightsName')))]": "Resource" + }, + "properties": { + "ApplicationId": "[variables('applicationInsightsName')]", + "Request_Source": "IbizaWebAppExtensionCreate" + } + }, + { + "type": "Microsoft.EventHub/namespaces", + "apiVersion": "2018-01-01-preview", + "name": "[variables('openlineageEventHubNameSpaceName')]", + "location": "[resourceGroup().location]", + "sku": { + "name": "[variables('eventHubSku')]", + "tier": "[variables('eventHubSku')]", + "capacity": 1 + }, + "tags": "[parameters('resourceTagValues')]", + "properties": { + "isAutoInflateEnabled": false, + "maximumThroughputUnits": 0 + } + }, + { + "type": "Microsoft.EventHub/namespaces/eventhubs", + "apiVersion": "2017-04-01", + "name": "[concat(variables('openlineageEventHubNameSpaceName'), '/', variables('openlineageNameEventHubName'))]", + "location": "[resourceGroup().location]", + "tags": "[parameters('resourceTagValues')]", + "dependsOn": [ + "[resourceId('Microsoft.EventHub/namespaces', variables('openlineageEventHubNameSpaceName'))]" + ], + "properties": { + "messageRetentionInDays": 1, + 
"partitionCount": 1, + "captureDescription": { + "enabled": "[variables('captureEnabled')]", + "skipEmptyArchives": false, + "encoding": "[variables('captureEncodingFormat')]", + "intervalInSeconds": "[variables('captureTime')]", + "sizeLimitInBytes": "[variables('captureSize')]", + "destination": { + "name": "EventHubArchive.AzureBlockBlob", + "properties": { + "archiveNameFormat": "{Namespace}/{EventHub}/{PartitionId}/{Year}/{Month}/{Day}/{Hour}/{Minute}/{Second}", + "blobContainer": "eventhubdata", + "storageAccountResourceId": "[resourceId('Microsoft.Storage/storageAccounts', variables('functionStorageAccountName'))]" + } + } + } + }, + "resources": [ + { + "apiVersion": "2017-04-01", + "name": "read", + "type": "consumergroups", + "dependsOn": [ + "[variables('openlineageNameEventHubName')]" + ], + "properties": {} + } + ] + }, + { + "type": "Microsoft.KeyVault/vaults", + "name": "[variables('openlineageKeyVaultName')]", + "apiVersion": "2019-09-01", + "location": "[resourceGroup().location]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "sku": { + "family": "A", + "name": "Standard" + }, + "tenantId": "[subscription().tenantId]", + "accessPolicies": [ + { + "tenantId": "[subscription().tenantid]", + "objectId": "[reference(resourceId('Microsoft.Web/sites', variables('functionAppName')),'2020-06-01', 'full').identity.principalId]", + "permissions": { + "keys": [], + "secrets": [ + "get" + ], + "certificates": [] + } + } + ], + "enableSoftDelete": false, + "enabledForDeployment": false, + "enabledForDiskEncryption": false, + "enabledForTemplateDeployment": false + }, + "dependsOn": [ + "[resourceId('Microsoft.Web/sites', variables('functionAppName'))]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2019-09-01", + "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'), variables('EventHubConnectionSecretNameSend'))]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "value": "[listkeys(resourceId('Microsoft.Eventhub/namespaces/authorizationRules',variables('openlineageEventHubNameSpaceName'), 'SendMessages'),'2017-04-01').primaryConnectionString]" + }, + "dependsOn": [ + "[resourceId('Microsoft.EventHub/namespaces', variables('openlineageEventHubNameSpaceName'))]", + "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2019-09-01", + "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'), variables('EventHubConnectionSecretNameListen'))]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "value": "[listkeys(resourceId('Microsoft.Eventhub/namespaces/authorizationRules',variables('openlineageEventHubNameSpaceName'), 'ListenMessages'),'2017-04-01').primaryConnectionString]" + }, + "dependsOn": [ + "[resourceId('Microsoft.EventHub/namespaces', variables('openlineageEventHubNameSpaceName'))]", + "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2019-09-01", + "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('storageAccountName'))]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "value": "[concat('DefaultEndpointsProtocol=https;AccountName=',variables('storageAccountName'),';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName')), 
'2019-06-01').keys[0].value,';EndpointSuffix=','core.windows.net')]" + }, + "dependsOn": [ + "[concat('Microsoft.Storage/storageAccounts/', variables('storageAccountName'))]", + "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2019-09-01", + "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('functionStorageSecret'))]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "value": "[concat('DefaultEndpointsProtocol=https;AccountName=',variables('functionStorageAccountName'),';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts',variables('functionStorageAccountName')),'2019-06-01').keys[0].value,';EndpointSuffix=','core.windows.net')]" + }, + "dependsOn": [ + "[concat('Microsoft.Storage/storageAccounts/', variables('functionStorageAccountName'))]", + "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2019-09-01", + "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('OLOutputAPIKeySecretName'))]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "value": "[listKeys(concat(resourceId('Microsoft.Web/sites', variables('functionAppName')), '/host/default'), '2016-08-01').functionKeys.default]" + }, + "dependsOn": [ + "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2019-09-01", + "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('storageAccountAccessKey'))]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "value": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountName')), '2019-04-01').keys[0].value]" + }, + "dependsOn": [ + "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2019-09-01", + "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('functionStorageAccessKey'))]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "value": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('functionStorageAccountName')), '2019-04-01').keys[0].value]" + }, + "dependsOn": [ + "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2019-09-01", + "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('clientidkey'))]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "value": "[parameters('clientid')]" + }, + "dependsOn": [ + "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" + ] + }, + { + "type": "Microsoft.KeyVault/vaults/secrets", + "apiVersion": "2019-09-01", + "name": "[format('{0}/{1}', variables('openlineageKeyVaultName'),variables('clientsecretkey'))]", + "tags": "[parameters('resourceTagValues')]", + "properties": { + "value": "[parameters('clientsecret')]" + }, + "dependsOn": [ + "[resourceId('Microsoft.KeyVault/vaults', variables('openlineageKeyVaultName'))]" + ] + }, + { + "type": "Microsoft.EventHub/namespaces/AuthorizationRules", + "apiVersion": "2021-11-01", + "name": "[concat(variables('openlineageEventHubNameSpaceName'), '/ListenMessages')]", + "tags": 
"[parameters('resourceTagValues')]", + "dependsOn": [ + "[resourceId('Microsoft.EventHub/namespaces', variables('openlineageEventHubNameSpaceName'))]", + "[resourceId('Microsoft.EventHub/namespaces/AuthorizationRules', variables('openlineageEventHubNameSpaceName'), 'SendMessages')]" + ], + "properties": { + "rights": [ + "Listen" + ] + } + }, + { + "type": "Microsoft.EventHub/namespaces/AuthorizationRules", + "apiVersion": "2021-11-01", + "name": "[concat(variables('openlineageEventHubNameSpaceName'), '/SendMessages')]", + "tags": "[parameters('resourceTagValues')]", + "dependsOn": [ + "[resourceId('Microsoft.EventHub/namespaces', variables('openlineageEventHubNameSpaceName'))]" + ], + "properties": { + "rights": [ + "Send" + ] + } + }, + { + "apiVersion": "2020-06-01", + "name": "pid-1e23d6fb-478f-4b04-bfa3-70db11929652", + "type": "Microsoft.Resources/deployments", + "properties": { + "mode": "Incremental", + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "resources": [] + } + } + } + ], + "outputs": { + "functionAppName": { + "type": "string", + "value": "[variables('functionAppName')]" + }, + "kvName": { + "type": "string", + "value": "[variables('openlineageKeyVaultName')]" + }, + "storageAccountName": { + "type": "string", + "value": "[variables('storageAccountName')]" + }, + "resourcegroupLocation": { + "type": "string", + "value": "[resourceGroup().location]" + } + } +} \ No newline at end of file diff --git a/deployment/infra/openlineage-deployment.sh b/deployment/infra/openlineage-deployment.sh index 5af7d42..9d7a1d2 100644 --- a/deployment/infra/openlineage-deployment.sh +++ b/deployment/infra/openlineage-deployment.sh @@ -174,7 +174,7 @@ ADLSKEY=$(jq -r '.[1].value' <<< $adls_keys) CLUSTERNAME="openlineage-demo" ### Download Jar File -curl -O -L https://repo1.maven.org/maven2/io/openlineage/openlineage-spark/0.13.0/openlineage-spark-0.13.0.jar +curl -O -L https://repo1.maven.org/maven2/io/openlineage/openlineage-spark/0.18.0/openlineage-spark-0.18.0.jar ### az storage container create -n rawdata --account-name $ADLSNAME --account-key $ADLSKEY sampleA_resp=$(az storage blob upload --account-name $ADLSNAME --account-key $ADLSKEY -f exampleInputA.csv -c rawdata -n examples/data/csv/exampleInputA/exampleInputA.csv) diff --git a/deployment/util/README.md b/deployment/util/README.md new file mode 100644 index 0000000..ed60061 --- /dev/null +++ b/deployment/util/README.md @@ -0,0 +1,35 @@ +# Utilities for Deployment + +## mappings-remove-spaces + +Used in the Github Action for creating a deployment artifact that is easier to copy / paste or upload into an app setting for Azure Functions. 
+ +``` +usage: mappings-remove-spaces.py [-h] mappings_json + +positional arguments: + mappings_json File path of the mappings json +``` + +Sample: +``` +python ./deployment/util/mappings-remove-spaces.py ./deployment/infra/OlToPurviewMappings.json > test.json +``` + +## mappings-update-arm + +Used to update the ARM template in a standardized way + +``` +usage: mappings-update-arm.py [-h] mappings_json template_file output_path + +positional arguments: + mappings_json File path of the mappings json + template_file File path to the ARM template to be updated + output_path File path to the output +``` + +Sample: +``` +python ./deployment/util/mappings-update-arm.py ./deployment/infra/OlToPurviewMappings.json ./deployment/infra/newdeploymenttemp.json ./deployment/infra/newdeploymenttemp.json +``` diff --git a/deployment/util/mappings-remove-spaces.py b/deployment/util/mappings-remove-spaces.py new file mode 100644 index 0000000..f50989c --- /dev/null +++ b/deployment/util/mappings-remove-spaces.py @@ -0,0 +1,14 @@ +import argparse +import os + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("mappings_json", help="File path of the mappings json") + args, unknown_args = parser.parse_known_args() + + with open(args.mappings_json, 'r') as fp: + mappings = fp.read() + + oneliner = mappings.replace("\n", "").replace(" ", "") + print(oneliner) diff --git a/deployment/util/mappings-update-arm.py b/deployment/util/mappings-update-arm.py new file mode 100644 index 0000000..ed21869 --- /dev/null +++ b/deployment/util/mappings-update-arm.py @@ -0,0 +1,34 @@ +import argparse +import json + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("mappings_json", help="File path of the mappings json") + parser.add_argument("template_file", help="File path to the ARM template to be updated") + parser.add_argument("output_path", help="File path to the output") + args, unknown_args = parser.parse_known_args() + + with open(args.mappings_json, 'r') as fp: + mappings = json.load(fp) + + with open(args.template_file, 'r') as arm_input: + arm = json.load(arm_input) + + for resource in arm["resources"]: + if resource["type"] != "Microsoft.Web/sites": + continue + + child_resources = resource["resources"] + for child_resource in child_resources: + if child_resource["type"] != "config": + continue + + for setting in child_resource["properties"]["appSettings"]: + if setting["name"] != "OlToPurviewMappings": + continue + setting["value"] = json.dumps(mappings).replace(" ", "") + print("Successfully updated mappings setting") + + + with open(args.output_path, 'w') as output: + json.dump(arm, output, indent="\t") diff --git a/docs/configuration.md b/docs/configuration.md index b045b84..25ce4e6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -30,7 +30,6 @@ The following app settings are experimental and may be removed in future release | App Setting| Default Value in Code| Note| |----|----|----| |useResourceSet|true|Experimental feature| -|usePurviewTypes|false| Experimental feature| |maxQueryPlanSize|null|If the query plan bytes is greater than this value it will be removed from the databricks_process| |prioritizeFirstResourceSet|true|When matching against existing assets, the first resource set found will be prioritized over other assets like folders or purview custom connector entities.| |Spark_Entities|databricks_workspace;databricks_job;databricks_notebook;databricks_notebook_task|| diff --git
a/docs/mappings/README.md b/docs/mappings/README.md index 0a3060e..f7909e3 100644 --- a/docs/mappings/README.md +++ b/docs/mappings/README.md @@ -14,3 +14,10 @@ This directory contains a "gallery" of sample OpenLineage to Purview Mappings th * [Prioritize Azure SQL Non DBO](./az-sql.json) * Default mappings treat an Azure SQL table named `myschema.mytable` as schema of `myschema` and table of `mytable`. * If you remove the `azureSQLNonDboNoDotsInNames` mapping, the above example would default to `dbo.[myschema.mytable]`. + +## Snowflake + +* [Snowflake](./snowflake.json) + * Supports mapping Snowflake tables in Purview. + * OpenLineage returns a DataSet with `"namespace":"snowflake://<account-host>","name":"<db>.<schema>.<table>"`. + * Microsoft Purview expects a fully qualified name of `snowflake://<account-host>/databases/<db>/schemas/<schema>/tables/<table>
` \ No newline at end of file diff --git a/docs/mappings/adlsg1.json b/docs/mappings/adlsg1.json index 402af39..ace5c93 100644 --- a/docs/mappings/adlsg1.json +++ b/docs/mappings/adlsg1.json @@ -1,3 +1,21 @@ +{ + "name": "adlsg1", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "adl" + }, + { + "op1": "nameSpcBodyParts", + "compare": ">", + "op2": "1" + } + ], + "qualifiedName": "adl://{nameSpcBodyParts[0]}/{nameSpaceBodyJoinedBySlashFrom[1]}/{nameGroups[0]}", + "purviewDataType": "azure_datalake_gen1_path", + "purviewPrefix": "adl" +}, { "name": "adlsg1", "parserConditions": [ diff --git a/docs/mappings/snowflake.json b/docs/mappings/snowflake.json new file mode 100644 index 0000000..339ef54 --- /dev/null +++ b/docs/mappings/snowflake.json @@ -0,0 +1,13 @@ +{ + "name": "snowflake", + "parserConditions": [ + { + "op1": "prefix", + "compare": "=", + "op2": "snowflake" + } + ], + "qualifiedName": "snowflake://{nameSpcBodyParts[0]}/databases/{nameGroups[0].parts[0]}/schemas/{nameGroups[0].parts[1]}/tables/{nameGroups[0].parts[2]}", + "purviewDataType": "snowflake_table", + "purviewPrefix": "https" +} \ No newline at end of file diff --git a/docs/powershell-alternatives.md b/docs/powershell-alternatives.md new file mode 100644 index 0000000..258100c --- /dev/null +++ b/docs/powershell-alternatives.md @@ -0,0 +1,25 @@ +# Powershell Alternative Scripts + +In some cases, you're not able to use the cloud shell or you don't have access to a machine that can run wsl / curl. This doc provides alternatives to select + +## Upload Custom Types + +Assumes you are in the `deployment/infra` folder of the repo. + +```powershell +$purview_endpoint="https://PURVIEW_ACCOUNT_NAME.purview.azure.com" +$TENANT_ID="TENANT_ID" +$CLIENT_ID="CLIENT_ID" +$CLIENT_SECRET="CLIENT_SECRET" + +$get_token=(Invoke-RestMethod -Method 'Post' -Uri "https://login.microsoftonline.com/$TENANT_ID/oauth2/token" -Body "resource=https://purview.azure.net&client_id=$CLIENT_ID&client_secret=$CLIENT_SECRET&grant_type=client_credentials") +$token=$get_token.access_token +$body=(Get-Content -Path .\Custom_Types.json) +$headers = @{ +'Content-Type'='application/json' +'Authorization'= "Bearer $token" +} + +Invoke-RestMethod -Method 'Post' -Uri "$purview_endpoint/catalog/api/atlas/v2/types/typedefs" -Body $body -Headers $headers + +``` diff --git a/function-app/adb-to-purview/src/Function.Domain/Helpers/OlProcessing/OlMessageConsolodation.cs b/function-app/adb-to-purview/src/Function.Domain/Helpers/OlProcessing/OlMessageConsolodation.cs index 2a4687b..5e8dc48 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Helpers/OlProcessing/OlMessageConsolodation.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Helpers/OlProcessing/OlMessageConsolodation.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. using System; +using System.Linq; using System.Threading.Tasks; using System.Threading; using System.Collections.Generic; @@ -143,12 +144,26 @@ private async Task ProcessStartEvent(Event olEvent, string jobRunId, Envir } try { - var entity = new TableEntity(TABLE_PARTITION, olEvent.Run.RunId) + if (isDataSourceV2Event(olEvent)) + // Store inputs and env facet. 
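+ // For DataSourceV2 sources (for example azurecosmos:// or iceberg://, see isDataSourceV2Event below),
+ // OpenLineage reports the inputs on the START event while the matching COMPLETE event may carry
+ // outputs only, so the inputs are cached alongside the environment facet and merged back into the
+ // COMPLETE event by JoinEventData.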
{ - { "EnvFacet", JsonConvert.SerializeObject(olEvent.Run.Facets.EnvironmentProperties) } - }; + var entity = new TableEntity(TABLE_PARTITION, olEvent.Run.RunId) + { + { "EnvFacet", JsonConvert.SerializeObject(olEvent.Run.Facets.EnvironmentProperties) }, + { "Inputs", JsonConvert.SerializeObject(olEvent.Inputs) } + + }; + await _tableClient.AddEntityAsync(entity); + } + else { + // Store only env facet. + var entity = new TableEntity(TABLE_PARTITION, olEvent.Run.RunId) + { + { "EnvFacet", JsonConvert.SerializeObject(olEvent.Run.Facets.EnvironmentProperties) } - await _tableClient.AddEntityAsync(entity); + }; + await _tableClient.AddEntityAsync(entity); + } } catch (RequestFailedException ex) { @@ -159,6 +174,7 @@ private async Task ProcessStartEvent(Event olEvent, string jobRunId, Envir _log.LogError(ex, $"OlMessageConsolodation-ProcessStartEvent: Error {ex.Message} when processing entity"); return false; } + return true; } @@ -168,15 +184,18 @@ private async Task JoinEventData(Event olEvent, string jobRunId) { return false; } - - TableEntity te; + + TableEntity te = null; + TableEntity te_inputs = null; + + bool ret_val = true; // Processing time can sometimes cause complete events int retryCount = 4; int currentRetry = 0; TimeSpan delay = TimeSpan.FromSeconds(1); - while (true) + while (ret_val) { try { @@ -189,7 +208,30 @@ private async Task JoinEventData(Event olEvent, string jobRunId) _log.LogWarning($"Start event was missing, retrying to consolidate message. Retry count: {currentRetry}"); if (currentRetry > retryCount) { - return false; + ret_val = false; + break; + } + } + await Task.Delay(delay); + } + + // Get inputs. TODO: Check if more efficient to get inputs within the same while loop above. Can we get 2 entities at the same time? + currentRetry = 0; + while (ret_val) // use a variable instead of just true, because if we didn't have the env_facet then we don't need to get inputs + { + try + { + te_inputs = await _tableClient.GetEntityAsync(TABLE_PARTITION, olEvent.Run.RunId, new string[] { "Inputs" }); + break; + } + catch (RequestFailedException) + { + currentRetry++; + _log.LogWarning($"Start event was missing, retrying to consolidate message to get inputs. Retry count: {currentRetry}"); + if (currentRetry > retryCount) + { + ret_val = false; + break; } } await Task.Delay(delay); @@ -200,31 +242,91 @@ private async Task JoinEventData(Event olEvent, string jobRunId) if (envFacet is null) { _log.LogWarning($"OlMessageConsolodation-JoinEventData: Warning environment facet for COMPLETE event is null"); - return false; + ret_val = false; } olEvent.Run.Facets.EnvironmentProperties = envFacet; - // clean up table over time - try - { - var delresp = await _tableClient.DeleteEntityAsync(TABLE_PARTITION, olEvent.Run.RunId); - } - catch (Exception ex) - { - _log.LogError(ex, $"OlMessageConsolodation-JoinEventData: Error {ex.Message} when deleting entity"); + // Check if saved any inputs from the START event (will only be done for events containing DataSourceV2 sources) + if (te_inputs is not null) { + + if (te_inputs.ContainsKey("Inputs")) { + try { + //TODO: Find out why inputs might be null. Technically inputs are only added to the table if they exist. This is also not an issue when running locally. + if (te_inputs["Inputs"] != null) { + + var saved_inputs = JsonConvert.DeserializeObject>(te_inputs["Inputs"].ToString() ?? 
""); + + if (saved_inputs is null) { + _log.LogInformation($"OlMessageConsolodation-JoinEventData: No inputs found for COMPLETE event"); + } + + else { + // Check inputs saved against inputs captured in this COMPLETE event and combine while removing any duplicates. + // Checking for duplicates needed since we save all the inputs captured from the START event. Perhaps it may be better to + // only save the DataSourceV2 inputs? + var inputs = new List(saved_inputs.Count + olEvent.Inputs.Count); + inputs.AddRange(saved_inputs); + inputs.AddRange(olEvent.Inputs); + var unique_inputs = inputs.Distinct(); + olEvent.Inputs = unique_inputs.ToList(); + _log.LogInformation($"OlMessageConsolodation-JoinEventData: Captured inputs for COMPLETE event"); + } + } + } + catch (System.Exception ex) { + _log.LogError(ex, $"OlMessageConsolodation-JoinEventData: Error {ex.Message} when deserializing inputs"); + ret_val = false; + } + } + + } - return true; + // clean up table over time. + try + { + var delresp = await _tableClient.DeleteEntityAsync(TABLE_PARTITION, olEvent.Run.RunId); + } + catch (Exception ex) + { + _log.LogError(ex, $"OlMessageConsolodation-JoinEventData: Error {ex.Message} when deleting entity"); + } + + // Need to make sure we're only processing this COMPLETE event if it has both + // inputs and outputs (reflects original logic, prior to supporting DataSourceV2 events) + if (!(olEvent.Inputs.Count > 0 && olEvent.Outputs.Count > 0)) { + ret_val = false; + } + + return ret_val; } // Returns true if olEvent is of type START and has the environment facet private bool IsStartEventEnvironment(Event olEvent) { if (olEvent.EventType == START_EVENT && olEvent.Run.Facets.EnvironmentProperties != null) - { - return true; - } + { + return true; + } + + return false; + } + + /// + /// Helper function to determine if the event is one of + /// the data source v2 ones which needs us to save the + /// inputs from the start event + /// + private bool isDataSourceV2Event(Event olEvent) { + string[] special_cases = {"azurecosmos://", "iceberg://"}; // todo: make this configurable? + foreach (var inp in olEvent.Inputs) + { + foreach (var source in special_cases) + { + if (inp.NameSpace.StartsWith(source)) return true; + } + } return false; } @@ -232,7 +334,7 @@ private bool IsJoinEvent(Event olEvent) { if (olEvent.EventType == COMPLETE_EVENT) { - if (olEvent.Inputs.Count > 0 && olEvent.Outputs.Count > 0) + if (olEvent.Outputs.Count > 0) { return true; } diff --git a/function-app/adb-to-purview/src/Function.Domain/Helpers/OlProcessing/ValidateOlEvent.cs b/function-app/adb-to-purview/src/Function.Domain/Helpers/OlProcessing/ValidateOlEvent.cs index 984eb3b..7a554ea 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Helpers/OlProcessing/ValidateOlEvent.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Helpers/OlProcessing/ValidateOlEvent.cs @@ -32,7 +32,7 @@ public ValidateOlEvent(ILoggerFactory loggerFactory) /// /// Performs initial validation of OpenLineage input /// The tested criteria include: - /// 1. Events have both inputs and outputs + /// 1. Events have outputs (not both inputs and outputs, because in the case of DataSourceV2 events, the COMPLETE event will not have inputs) /// 2. Events do not have the same input and output /// 3. EventType is START or COMPLETE /// 4. 
If EventType is START, there is a Environment Facet @@ -40,23 +40,26 @@ public ValidateOlEvent(ILoggerFactory loggerFactory) /// OpenLineage Event message /// true if input is valid, false if not public bool Validate(Event olEvent){ - if (olEvent.Inputs.Count > 0 && olEvent.Outputs.Count > 0) + if (olEvent.Outputs.Count > 0) + // Want to save COMPLETE events even if they only have outputs, to deal with cosmos { // Need to rework for multiple inputs and outputs in one packet - possibly combine and then hash if (InOutEqual(olEvent)) - { + { return false; } if (olEvent.EventType == "START") { - if (olEvent.Run.Facets.EnvironmentProperties == null) + // START events should contain both inputs and outputs, as well as the EnvironmentProperties facet + if (olEvent.Run.Facets.EnvironmentProperties == null || !(olEvent.Inputs.Count > 0 && olEvent.Outputs.Count > 0)) { return false; } return true; } - else if (olEvent.EventType == "COMPLETE") - { + // COMPLETE events might not contain inputs, but should have at least one output. + else if (olEvent.EventType == "COMPLETE" && olEvent.Outputs.Count > 0) + { return true; } else diff --git a/function-app/adb-to-purview/src/Function.Domain/Helpers/PurviewCustomType.cs b/function-app/adb-to-purview/src/Function.Domain/Helpers/PurviewCustomType.cs index 1385397..af8c97f 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Helpers/PurviewCustomType.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Helpers/PurviewCustomType.cs @@ -25,12 +25,11 @@ public class PurviewCustomType private readonly ILogger _logger; private readonly string EntityType = "purview_custom_connector_generic_entity_with_columns"; private PurviewClient _client; - private JObject? simpleEntity; private JObject? properties; private AppConfigurationSettings? config = new AppConfigurationSettings(); public JObject? Fullentity = new JObject(); bool useResourceSet = true; - bool usePurviewTypes = false; + public string? 
originalQualifiedName {get; private set;} /// /// Property that contains all Json attributes for the Custom data Entity in Microsoft Purview /// @@ -39,6 +38,13 @@ public JObject Properties get { return properties!; } } /// + /// Get the current qualifedName + /// + public string currentQualifiedName() + { + return properties!["attributes"]!["qualifiedName"]!.ToString(); + } + /// /// Creation of a Microsoft Purview Custom Type entity that initialize all attributes needed /// /// Name of the Data Entity @@ -54,7 +60,6 @@ public PurviewCustomType(string name, string typeName, string qualified_name, st _logger = logger; _client = client; useResourceSet = config!.useResourceSet; - usePurviewTypes = config!.useResourceSet; Init(name , typeName @@ -63,7 +68,7 @@ public PurviewCustomType(string name, string typeName, string qualified_name, st , description , guid ); - _logger.LogInformation($"New Entity Initialized in the process with a passed Purview Client: Nome:{name} - qualified_name:{qualified_name} - Guid:{guid}"); + _logger.LogInformation($"New Entity Initialized in the process with a passed Purview Client: Nome:{name} - qualified_name:{qualified_name} - Type: {typeName} - Guid:{guid}"); } /// /// Creation of a Microsoft Purview Custom Type entity that initialize all attributes needed @@ -115,11 +120,33 @@ public bool IsSpark_Process(string typeName) return false; } + /// + /// Validate if the entity is a Blob or Data Lake entity type but not a resource set + /// + /// Type name + /// boolean + public bool IsBlobOrDataLakeFS_Entity(string typeName) + { + string typeNameLowercase = typeName.ToLower(); + return ((typeNameLowercase == "azure_blob_path") || (typeNameLowercase == "azure_datalake_gen2_path") || (typeNameLowercase == "azure_datalake_gen2_filesystem")); + } + + /// + /// Validate if the entity is a Blob or Data Lake resource type but not a file path or system type + /// + /// Type name + /// boolean + public bool IsBlobOrDataLakeResourceSet_Entity(string typeName) + { + string typeNameLowercase = typeName.ToLower(); + return ((typeNameLowercase == "azure_datalake_gen2_resource_set") || (typeNameLowercase == "azure_blob_resource_set")); + } + private void Init(string name, string typeName, string qualified_name, string data_type, string description, Int64 guid) { is_dummy_asset = false; properties = new JObject(); - simpleEntity = new JObject(); + originalQualifiedName = qualified_name; //Loading Entity properties into Json format of an AtlasEntity properties.Add("typeName", typeName); properties.Add("guid", guid); @@ -130,23 +157,7 @@ private void Init(string name, string typeName, string qualified_name, string da ((JObject)properties["attributes"]!).Add("data_type", data_type); ((JObject)properties["attributes"]!).Add("description", description); } - /// - /// Export basic data entity attribute - /// - /// Json Object - public JObject SimpleEntity - { - get - { - //Simple Json format for use in Atlas requests - simpleEntity = new JObject(); - simpleEntity.Add("typeName", this.Properties["typeName"]!.ToString()); - simpleEntity.Add("guid", this.Properties["guid"]!.ToString()); - simpleEntity.Add("qualifiedName", this.Properties["attributes"]!["qualifiedName"]!.ToString()); - _logger.LogInformation($"Retrived Entity simple Object: {simpleEntity.ToString()}"); - return simpleEntity; - } - } + /// /// Get a list on Data Entities in Microsoft Purview using Entity Qualified Name and Type /// @@ -164,84 +175,6 @@ public async Task> GetEntity() /// /// Boolean public bool is_dummy_asset 
{ get; set; } - /// - /// Add Relationship to entity as columns to tables type - /// - /// Table to be related to - /// bool - public bool AddToTable(PurviewCustomType Table) - { - //Validating if the table attribute exists if not we will initialize - if (!((JObject)properties!["relationshipAttributes"]!).ContainsKey("table")) - ((JObject)properties!["relationshipAttributes"]!).Add("table", new JObject()); - - _logger.LogInformation($"Entity qualifiedName: {properties["attributes"]!["qualifiedName"]}Table.simpleEntity: {Table!.simpleEntity!.ToString()}"); - properties["relationshipAttributes"]!["table"] = Table.simpleEntity; - return true; - } - /// - /// Search an Entity in Microsoft Purview using Qualified Name and Type - /// - /// Type to search for - /// Json object - public async Task FindQualifiedNameInPurview(string typeName) - { - //Search using search method and qualifiedName attribute. Scape needs to be used on some non safe (web URLs) chars - EntityModel results = await this._client.search_entities( - properties!["attributes"]!["qualifiedName"]!.ToString() - , typeName); - - if (results.qualifiedName == null) - { - _logger.LogInformation($"Entity qualifiedName:{properties["attributes"]!["qualifiedName"]!.ToString()} - typeName:{typeName}, not found!"); - this.is_dummy_asset = true; - properties["typeName"] = EntityType; - return new JObject(); - } - var guid = ""; - this.is_dummy_asset = false; - var _qualifiedName = ""; - //validate if qualifiedName is the same - _qualifiedName = results.qualifiedName; - if (results.entityType == "azure_datalake_gen2_resource_set") - properties["attributes"]!["qualifiedName"] = _qualifiedName; - if (_qualifiedName.Trim('/').ToLower() == properties["attributes"]!["qualifiedName"]!.ToString().Trim('/').ToLower()) - { - //search api return quid on ID property - guid = results.id; - properties["typeName"] = results.entityType; - properties!["attributes"]!["qualifiedName"] = results.qualifiedName; - //break if find a non dummy entity with the qualified name - if (results.entityType == EntityType) - { - //log.debug("search_entity_by_qualifiedName: entity \'{name}\' is Dummy Entity"); - //mark entity as dummy to be created - this.is_dummy_asset = true; - } - _logger.LogInformation($"Entity qualifiedName:{properties["attributes"]!["qualifiedName"]!.ToString()} - typeName:{typeName} - guid:{guid}, found!"); - } - - if (!this.is_dummy_asset) - properties!["attributes"]!["qualifiedName"] = results.qualifiedName; - - var content = new JObject(); - content.Add(_qualifiedName, new JObject()); - properties["guid"] = guid; - ((JObject)content[_qualifiedName]!).Add("guid", guid); - ((JObject)content[_qualifiedName]!).Add("qualifiedName", properties!["attributes"]!["qualifiedName"]!.ToString()); - - return content; - } - /// - /// Remove any unused Dummy entities - /// - /// Boolean - public async Task CleanUnusedCustomEntities() - { - return await this._client.Delete_Unused_Entity( - properties!["attributes"]!["qualifiedName"]!.ToString() - , properties!["typeName"]!.ToString()); - } private List? 
qNames = new List(); /// @@ -341,22 +274,34 @@ public async Task QueryInPurview(string TypeName) } } + String _fqn = properties!["attributes"]!["qualifiedName"]!.ToString(); List results = await this._client.Query_entities(filter["filter"]!); + _logger.LogDebug($"Existing Asset Match Search for {_fqn}: Found {results.Count} candidate matches"); if (results.Count > 0) { - List validentity = await SelectReturnEntity(results); - if (validentity.Count > 0) + _logger.LogDebug($"Existing Asset Match Search for {_fqn}: The first match has a fqn of {results[0].qualifiedName} and type of {results[0].entityType}"); + List validEntitiesAfterFiltering = await SelectReturnEntity(results); + _logger.LogDebug($"Existing Asset Match Search for {_fqn}: Found {validEntitiesAfterFiltering.Count} valid entity matches"); + if (validEntitiesAfterFiltering.Count > 0) { - obj = validentity[0]; - properties["guid"] = validentity[0].id; - properties["typeName"] = validentity[0].entityType; - properties!["attributes"]!["qualifiedName"] = validentity[0].qualifiedName; - this.Fullentity = await this._client.GetByGuid(validentity[0].id); + _logger.LogInformation($"Existing Asset Match Search for {_fqn}: The first valid match has a fqn of {validEntitiesAfterFiltering[0].qualifiedName} and type of {validEntitiesAfterFiltering[0].entityType}"); + obj = validEntitiesAfterFiltering[0]; + properties["guid"] = validEntitiesAfterFiltering[0].id; + properties["typeName"] = validEntitiesAfterFiltering[0].entityType; + properties!["attributes"]!["qualifiedName"] = validEntitiesAfterFiltering[0].qualifiedName; + this.Fullentity = await this._client.GetByGuid(validEntitiesAfterFiltering[0].id); this.is_dummy_asset = false; } + // If there are matches but there are none that are valid, it should still be a dummy asset + else + { + _logger.LogInformation($"Existing Asset Match Search for {_fqn}: Changing type to placeholder type because zero valid entities"); + properties["typeName"] = EntityType; + } } else { + _logger.LogDebug($"Existing Asset Match Search for {_fqn}: Changing type to dummy type because zero search results in general"); properties["typeName"] = EntityType; } return obj; @@ -364,10 +309,10 @@ public async Task QueryInPurview(string TypeName) private async Task> SelectReturnEntity(List results) { List validEntities = new List(); - bool resourceSetHasBeenSeen = false; + bool matchingResourceSetHasBeenSeen = false; foreach (QueryValeuModel entity in results) { - _logger.LogDebug($"Working on {entity.entityType} with score {entity.SearchScore}"); + _logger.LogDebug($"Validating {this.to_compare_QualifiedName} vs {entity.qualifiedName} - Type: {entity.entityType} search score: {entity.SearchScore}"); if (IsSpark_Entity(entity.entityType)) if (results[0].qualifiedName.ToLower().Trim('/') != this.properties!["attributes"]!["qualifiedName"]!.ToString().ToLower().Trim('/')) { @@ -375,117 +320,67 @@ private async Task> SelectReturnEntity(List 0) { - if ((entity.entityType.ToLower() == "azure_blob_path") || (entity.entityType.ToLower() == "azure_datalake_gen2_path") || (entity.entityType.ToLower() == "azure_datalake_gen2_filesystem")) - { - if (validEntities.Count > 0) - { - JObject folder = await _client.GetByGuid(entity.id); - if (folder.ContainsKey("entity")) - { - if (((JArray)folder!["entity"]!["relationshipAttributes"]!["inputToProcesses"]!).Count > 0) - { - foreach (JObject val in ((JArray)folder!["entity"]!["relationshipAttributes"]!["inputToProcesses"]!)) - { - if 
(val!["typeName"]!.ToString().ToLower().IndexOf("adf_") > -1) - { - validEntities = new List(); - validEntities.Add(entity); - return validEntities; - } - } - } - else - { - if (((JArray)folder!["entity"]!["relationshipAttributes"]!["outputFromProcesses"]!).Count > 0) - { - foreach (JObject val in ((JArray)folder!["entity"]!["relationshipAttributes"]!["outputFromProcesses"]!)) - { - if (val!["typeName"]!.ToString().ToLower().IndexOf("adf_") > -1) - { - validEntities = new List(); - validEntities.Add(entity); - return validEntities; - } - } - } - - } - } - } + JObject folder = await _client.GetByGuid(entity.id); + if (IsInputOrOutputOfAzureDataFactoryEntity(folder)){ + _logger.LogDebug($"Validating {this.to_compare_QualifiedName} vs {entity.qualifiedName} - Discovered entity is part of an ADF process and has been inserted first"); + validEntities.Insert(0,entity); + continue; + } + // If the first valid entity is the default generic entity, insert + // this match into the first position and continue. This helps when + // there is a folder matching but the generic entity is higher up in search + if (validEntities[0].entityType == EntityType){ + validEntities.Insert(0,entity); } - validEntities.Add(entity); } + // Fall through: We know the qualified name matches but it's either the first + // valid entity OR it's not attached to any Azure Data Factory process + validEntities.Add(entity); + } + else + { + // Fall through: We know the qualified name matches but it's not any of the above special cases + validEntities.Add(entity); } } - return validEntities; + return validEntities; } - private string Name_To_Search(string Name) - { - Func isNumber = delegate (string num) - { - return Int64.TryParse(num, out long number)!; - }; - - Func newName = delegate (char separator) - { - string[] partsName = Name.Split(separator); - int index = 0; - foreach (string? part in partsName) - { - if (isNumber(part)) - partsName[index] = "{N}"; - index++; - } - return string.Join(separator, partsName)!; - }; - - if (isNumber(Name)) - { - return "{N}"; - } - - if (Name.Contains('=')) - { - return newName('='); - } - - if (Name.Contains('-')) - { - return newName('-'); - } - - if (Name.Contains('_')) - { - return newName('_'); - } - - return Name; - } private bool Is_Valid_Name(string name) { Func isNumber = delegate (string num) @@ -562,6 +457,50 @@ private string to_compare_QualifiedName return string.Join("/", this.qNames!); } } + + // For resource sets, since Microsoft Purview cannot register the same storage account for + // both ADLS G2 and Blob Storage, we need to match against either pattern (dfs.core.windows.net + // or blob.core.windows.net) since we cannot be certain which one the end user has scanned. + private bool QualifiedNames_Match_After_Normalizing(string entityOfInterestQualifiedName, string candidateQualifiedName) + { + string _entityOfInterestFQN = entityOfInterestQualifiedName.ToLower().Trim().Replace(".dfs.core.windows.net","").Replace(".blob.core.windows.net","").Trim('/'); + string _candidateFQN = candidateQualifiedName.ToLower().Trim().Replace(".dfs.core.windows.net","").Replace(".blob.core.windows.net","").Trim('/'); + return _entityOfInterestFQN == _candidateFQN; + } + + // Given an entity (presumably a blob or data lake folder), check to see if it has + // relationship attributes that indicate the entity is the input to or output of + // an azure data factory (adf_) process. 
+ private bool IsInputOrOutputOfAzureDataFactoryEntity(JObject folder) + { + if (!folder.ContainsKey("entity")){ + return false; + } + + // TODO Refactor this to look across both inputs and outputs from one list + if (((JArray)folder!["entity"]!["relationshipAttributes"]!["inputToProcesses"]!).Count > 0) + { + foreach (JObject val in ((JArray)folder!["entity"]!["relationshipAttributes"]!["inputToProcesses"]!)) + { + if (val!["typeName"]!.ToString().ToLower().IndexOf("adf_") > -1) + { + return true; + } + } + } + else if (((JArray)folder!["entity"]!["relationshipAttributes"]!["outputFromProcesses"]!).Count > 0) + { + foreach (JObject val in ((JArray)folder!["entity"]!["relationshipAttributes"]!["outputFromProcesses"]!)) + { + if (val!["typeName"]!.ToString().ToLower().IndexOf("adf_") > -1) + { + return true; + } + } + } + // Fall through - If we didn't have an adf_ relationship + return false; + } } /// diff --git a/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/ColParser.cs b/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/ColParser.cs index ec24d84..a4defd3 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/ColParser.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/ColParser.cs @@ -38,8 +38,12 @@ public ColParser(ParserSettings configuration, ILoggerFactory logger, Event olEv /// This class will be used for the parsing code. /// /// - public List GetColIdentifiers() + { + return GetColIdentifiers(new Dictionary{}); + } + + public List GetColIdentifiers(Dictionary originalToMatchedFqn ) { var col = new List(); @@ -48,8 +52,12 @@ public List GetColIdentifiers() foreach(KeyValuePair colInfo in colId.Facets.ColFacets.fields) { var dataSet = new DatasetMappingClass(); - //dataSet.sink = $"{colId.NameSpace}, {colId.Name}"; dataSet.sink = _qnParser.GetIdentifiers(colId.NameSpace, colId.Name).QualifiedName; + // The identifier from OpenLineage may not be the same as what is discovered on + // the Purview catalog. This includes cases like resource sets or blob vs dfs paths + if (originalToMatchedFqn.ContainsKey(dataSet.sink)){ + dataSet.sink = originalToMatchedFqn[dataSet.sink]; + } var columnLevels = new List(); foreach (ColumnLineageIdentifierClass colInfo2 in colInfo.Value.inputFields) { @@ -63,7 +71,12 @@ public List GetColIdentifiers() } } dataSet.source = _qnParser.GetIdentifiers(colInfo2.nameSpace, colInfo2.name).QualifiedName; - //dataSet.source = "*"; + // The identifier from OpenLineage may not be the same as what is discovered on + // the Purview catalog. 
This includes cases like resource sets or blob vs dfs paths + if (originalToMatchedFqn.ContainsKey(dataSet.source)){ + dataSet.source = originalToMatchedFqn[dataSet.source]; + } + columnLevel.source = colInfo2.field; columnLevel.sink = colInfo.Key; columnLevels.Add(columnLevel); diff --git a/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/DatabricksToPurviewParser.cs b/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/DatabricksToPurviewParser.cs index 9ac9bfb..f7c7ca4 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/DatabricksToPurviewParser.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/DatabricksToPurviewParser.cs @@ -14,6 +14,7 @@ using System.Security.Cryptography; using System.Text; using Newtonsoft.Json; +using System.Text.RegularExpressions; namespace Function.Domain.Helpers { @@ -30,6 +31,7 @@ public class DatabricksToPurviewParser: IDatabricksToPurviewParser private readonly EnrichedEvent _eEvent; private readonly string _adbWorkspaceUrl; const string SETTINGS = "OlToPurviewMappings"; + Regex ADF_JOB_NAME_REGEX = new Regex(@"^ADF_(.*)_(.*)_(.*)_(.*)$", RegexOptions.Compiled ); /// /// Constructor for DatabricksToPurviewParser @@ -108,7 +110,6 @@ public DatabricksWorkspace GetDatabricksWorkspace() DatabricksWorkspace databricksWorkspace = new DatabricksWorkspace(); databricksWorkspace.Attributes.Name = $"{_adbWorkspaceUrl}.azuredatabricks.net"; databricksWorkspace.Attributes.QualifiedName = $"databricks://{_adbWorkspaceUrl}.azuredatabricks.net"; - //databricksWorkspace.Attributes.ColumnMapping = JsonConvert.SerializeObject(_colParser.GetColIdentifiers()); return databricksWorkspace; } @@ -135,8 +136,18 @@ public DatabricksJob GetDatabricksJob(string workspaceQn) adbJobRoot = _eEvent.AdbRoot; } var databricksJob = new DatabricksJob(); - databricksJob.Attributes.Name = adbJobRoot.RunName; - databricksJob.Attributes.QualifiedName = $"databricks://{_adbWorkspaceUrl}.azuredatabricks.net/jobs/{_eEvent.AdbRoot!.JobId}"; + string _jobName = adbJobRoot.RunName; + string _jobId = _eEvent.AdbRoot!.JobId.ToString(); + // Special case for Azure Data Factory + // If we match this pattern in the job name, strip the last element since it's a random guid + // This will allow us to have the same name / qualified name each run + if (IsAdfJobName(_jobName)){ + _logger.LogInformation($"Azure Data Factory Job being processed: ({_jobName})"); + _jobName = TruncateAdfJobName(_jobName); + _jobId = _jobName; + } + databricksJob.Attributes.Name = _jobName; + databricksJob.Attributes.QualifiedName = $"databricks://{_adbWorkspaceUrl}.azuredatabricks.net/jobs/{_jobId}"; databricksJob.Attributes.JobId = adbJobRoot.JobId; databricksJob.Attributes.CreatorUserName = adbJobRoot.CreatorUserName; @@ -188,9 +199,20 @@ private void GetDatabricksJobTaskAttributes(DatabricksJobTaskAttributes taskAttr _logger.LogError(ex, ex.Message); throw ex; } - taskAttributes.Name = _eEvent.AdbRoot.JobTasks[0].TaskKey; - string jobQn = $"databricks://{_adbWorkspaceUrl}.azuredatabricks.net/jobs/{_eEvent.AdbRoot.JobId}"; - taskAttributes.QualifiedName = $"{jobQn}/tasks/{_eEvent.AdbRoot.JobTasks[0].TaskKey}"; + + string _taskKey = _eEvent.AdbRoot.JobTasks[0].TaskKey; + string _taskJobId = _eEvent.AdbRoot.JobId.ToString(); + // Special case for Azure Data Factory + // If we match this pattern in the job name, strip the last element since it's a random guid + // This will allow us to have the same name / qualified name each run + if (IsAdfJobName(_taskKey)){ + 
_logger.LogInformation($"Azure Data Factory Task being processed: ({_taskKey})"); + _taskJobId = TruncateAdfJobName(_taskKey); + _taskKey = TruncateAdfTaskName(_taskKey); + } + taskAttributes.Name = _taskKey; + string jobQn = $"databricks://{_adbWorkspaceUrl}.azuredatabricks.net/jobs/{_taskJobId}"; + taskAttributes.QualifiedName = $"{jobQn}/tasks/{_taskKey}"; taskAttributes.JobId = _eEvent.AdbRoot.JobId; taskAttributes.ClusterId = _eEvent.AdbRoot.JobTasks[0].ClusterInstance.ClusterId; taskAttributes.SparkVersion = _eEvent.OlEvent?.Run.Facets.SparkVersion.SparkVersion ?? ""; @@ -245,7 +267,7 @@ public DatabricksPythonWheelTask GetDatabricksPythonWheelTask(string jobQn) databricksPythonWheelTask.Attributes.PackageName = _eEvent.AdbRoot?.JobTasks?[0].PythonWheelTask?.PackageName ?? ""; databricksPythonWheelTask.Attributes.EntryPoint = _eEvent.AdbRoot?.JobTasks?[0].PythonWheelTask?.EntryPoint ?? ""; databricksPythonWheelTask.Attributes.Parameters = _eEvent.AdbRoot?.JobTasks?[0].PythonWheelTask?.Parameters ?? new List(); - databricksPythonWheelTask.Attributes.Wheel = _eEvent.AdbRoot?.JobTasks?[0].Libraries?[0]["whl"] ?? ""; + databricksPythonWheelTask.Attributes.Wheel = _eEvent.AdbRoot?.JobTasks?[0].Libraries?[0].wheelName ?? ""; databricksPythonWheelTask.RelationshipAttributes.Job.QualifiedName = jobQn; @@ -264,7 +286,7 @@ public DatabricksSparkJarTask GetDatabricksSparkJarTask(string jobQn) databricksSparkJarTask.Attributes.MainClassName = _eEvent.AdbRoot?.JobTasks?[0].SparkJarTask?.MainClassName ?? ""; databricksSparkJarTask.Attributes.JarUri = _eEvent.AdbRoot?.JobTasks?[0].SparkJarTask?.JarUri ?? ""; databricksSparkJarTask.Attributes.Parameters = _eEvent.AdbRoot?.JobTasks?[0].SparkJarTask?.Parameters ?? new List(); - databricksSparkJarTask.Attributes.Jar = _eEvent.AdbRoot?.JobTasks?[0].Libraries?[0]["jar"] ?? ""; + databricksSparkJarTask.Attributes.Jar = _eEvent.AdbRoot?.JobTasks?[0].Libraries?[0].jarName ?? 
""; databricksSparkJarTask.RelationshipAttributes.Job.QualifiedName = jobQn; @@ -294,7 +316,6 @@ public DatabricksProcess GetDatabricksProcess(string taskQn) } databricksProcess.Attributes = GetProcAttributes(taskQn, inputs,outputs,_eEvent.OlEvent); - //databricksProcess.Attributes.ColumnMapping = JsonConvert.SerializeObject(_colParser.GetColIdentifiers()); databricksProcess.RelationshipAttributes.Task.QualifiedName = taskQn; return databricksProcess; } @@ -322,13 +343,6 @@ private InputOutput GetInputOutputs(IInputsOutputs inOut) return inputOutputId; } - // private ColumnLevelAttributes GetColumnLevelAttributes(IInputsOutputs inOut) - // { - // var id = _colParser.GetColIdentifiers(_eEvent.OlEvent.Outputs); - // var columnLevelId = new ColumnLevelAttributes(); - // return columnLevelId; - // } - private string GetInputsOutputsHash(List inputs, List outputs) { inputs.Sort((x, y) => x.UniqueAttributes.QualifiedName.CompareTo(y.UniqueAttributes.QualifiedName));; @@ -366,5 +380,30 @@ private string GenerateMd5Hash(string input) } return sOutput.ToString(); } + + // Special case for Azure Data Factory + // If we match this pattern in the job name, strip the last element since it's a random guid + // This will allow us to have the same name / qualified name each run + private bool IsAdfJobName(string inputName){ + // Follows the pattern ADF_factoryName_pipelineName_notebookName_pipelineRunId + return (ADF_JOB_NAME_REGEX.Matches(inputName).Count > 0); + } + private string TruncateAdfTaskName(string inputName){ + // Return ADF_factoryName_pipelineName_notebookName portions + string[] job_name_parts = inputName.Split("_"); + string[] job_name_except_last_element = job_name_parts.Take(job_name_parts.Count() - 1).ToArray(); + return string.Join("_", job_name_except_last_element); + } + private string TruncateAdfJobName(string inputName){ + // Return ADF_factoryName_pipelineName portions + string[] job_name_parts = inputName.Split("_"); + string[] job_name_except_last_element = job_name_parts.Take(job_name_parts.Count() - 2).ToArray(); + return string.Join("_", job_name_except_last_element); + } + + public IColParser GetColumnParser() + { + return this._colParser; + } } } \ No newline at end of file diff --git a/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/IColParser.cs b/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/IColParser.cs index 2ac9aae..977fb84 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/IColParser.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/IColParser.cs @@ -7,9 +7,9 @@ namespace Function.Domain.Helpers { - //Interface for ColParser.cs public interface IColParser { public List GetColIdentifiers(); + public List GetColIdentifiers(Dictionary originalToMatchedFqn); } } \ No newline at end of file diff --git a/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/IDatabricksToPurviewParser.cs b/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/IDatabricksToPurviewParser.cs index 409d4b7..114324b 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/IDatabricksToPurviewParser.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/IDatabricksToPurviewParser.cs @@ -17,5 +17,6 @@ public interface IDatabricksToPurviewParser public DatabricksSparkJarTask GetDatabricksSparkJarTask(string jobQn); public DatabricksProcess GetDatabricksProcess(string taskQn); public JobType GetJobType(); + public IColParser GetColumnParser(); } } \ No newline at end 
of file diff --git a/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/QnParser.cs b/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/QnParser.cs index b02ca72..9c403e6 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/QnParser.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Helpers/parser/QnParser.cs @@ -22,7 +22,7 @@ public class QnParser: IQnParser private ILogger _logger; private readonly string[] JSON_KEY_NAMES = { "prefix", "nameSpcConParts", "nameSpcBodyParts", "nameSpcNameVals", - "nameGroups"}; + "nameGroups", "nameSpaceBodyJoinedBySlashFrom"}; /// /// Constructor for QnParser diff --git a/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Adb/JobTask.cs b/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Adb/JobTask.cs index 3d0ecb4..e3f5ddf 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Adb/JobTask.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Adb/JobTask.cs @@ -29,7 +29,7 @@ public class JobTask [JsonProperty("end_time")] public long EndTime = 0; [JsonProperty("libraries")] - public List>? Libraries = null; + public List? Libraries = null; [JsonProperty("notebook_task")] public NotebookTask? NotebookTask = null; [JsonProperty("spark_jar_task")] @@ -39,4 +39,44 @@ public class JobTask [JsonProperty("python_wheel_task")] public PythonWheelTask? PythonWheelTask = null; } -} \ No newline at end of file + public class JobLibrary + { + [JsonProperty("jar")] + public string? jarName = null; + [JsonProperty("egg")] + public string? eggName = null; + [JsonProperty("whl")] + public string? wheelName = null; + [JsonProperty("pypi")] + public PyPiJobLibrary? pypiLibrary = null; + [JsonProperty("maven")] + public MavenJobLibrary? mavenLibrary = null; + [JsonProperty("cran")] + public CranJobLibrary? cranLibrary = null; + } + public class PyPiJobLibrary + { + [JsonProperty("package")] + public string? package = null; + [JsonProperty("repo")] + public string? repo = null; + } + public class MavenJobLibrary + { + [JsonProperty("coordinates")] + public string? coordinates = null; + [JsonProperty("repo")] + public string? repo = null; + [JsonProperty("exclusions")] + public List? exclusions = null; + } + public class CranJobLibrary + { + [JsonProperty("package")] + public string? package = null; + [JsonProperty("repo")] + public string? 
repo = null; + } + +} + diff --git a/function-app/adb-to-purview/src/Function.Domain/Models/Parser/OpenLineage/Inputs.cs b/function-app/adb-to-purview/src/Function.Domain/Models/Parser/OpenLineage/Inputs.cs index 4f15245..85ba05e 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Models/Parser/OpenLineage/Inputs.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Models/Parser/OpenLineage/Inputs.cs @@ -10,5 +10,19 @@ public class Inputs: IInputsOutputs public string Name { get; set; } = ""; [JsonProperty("namespace")] public string NameSpace { get; set; } = ""; + + public override bool Equals(object obj) { + if (obj is Inputs other) + { + if (Name == other.Name && NameSpace == other.NameSpace) + return true; + } + return false; + } + + public override int GetHashCode() { + return Name.GetHashCode() ^ + NameSpace.GetHashCode(); + } } } \ No newline at end of file diff --git a/function-app/adb-to-purview/src/Function.Domain/Models/Parser/OpenLineage/OlParts.cs b/function-app/adb-to-purview/src/Function.Domain/Models/Parser/OpenLineage/OlParts.cs index 7a4208c..76502e2 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Models/Parser/OpenLineage/OlParts.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Models/Parser/OpenLineage/OlParts.cs @@ -43,6 +43,14 @@ public OlParts(string nameSpace, string name) } } } + // Add Support for NameSpaceBodyJoinedBySlash to enable mount points with trailing folders + for (int nmSpPos = this._olNameSpaceParts.NameSpaceBodyParts.Count(); nmSpPos > 0; nmSpPos--) + { + this._olNameSpaceParts.NameSpaceBodyJoinedBySlashFrom.Add( + String.Join('/', this._olNameSpaceParts.NameSpaceBodyParts.TakeLast(nmSpPos)) + ); + + } var rgex = new Regex(@"(?<=\[).+?(?=\])"); var rerslt = rgex.Matches(name); if (rerslt.Count > 0) @@ -100,9 +108,9 @@ private bool AddConStringNameValues(string inString, ref Dictionary public Dictionary GetDynamicPairs(string[] keys) { - if (keys == null || keys.Length != 5) + if (keys == null || keys.Length != 6) { - throw new System.ArgumentException("keys must be an array of length 5"); + throw new System.ArgumentException("keys must be an array of length 6"); } var pairs = new Dictionary(); pairs.Add(keys[0], this.Prefix); @@ -110,6 +118,7 @@ public Dictionary GetDynamicPairs(string[] keys) pairs.Add(keys[2], this.OlNameSpaceParts.NameSpaceBodyParts); pairs.Add(keys[3], this.OlNameSpaceParts.NameSpaceConnNameValues); pairs.Add(keys[4], this.OlNameParts.NameGroups); + pairs.Add(keys[5], this.OlNameSpaceParts.NameSpaceBodyJoinedBySlashFrom); return pairs; } @@ -126,6 +135,8 @@ public class OlNameSpaceParts public List NameSpaceConnParts = new List(); // Splits out any name value pairs as identified by = symbol public Dictionary NameSpaceConnNameValues = new Dictionary(); + // Joins the NameSpaceBodyParts joined by a forward slash + public List NameSpaceBodyJoinedBySlashFrom = new List(); } public class OlNameParts diff --git a/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Purview/BaseAttributes.cs b/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Purview/BaseAttributes.cs index 9bb4220..66ea4b5 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Purview/BaseAttributes.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Purview/BaseAttributes.cs @@ -12,7 +12,5 @@ public class BaseAttributes public string Name = ""; [JsonProperty("qualifiedName")] public string QualifiedName = ""; - // [JsonProperty("columnMapping")] - // public string 
ColumnMapping = ""; } } \ No newline at end of file diff --git a/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Settings/AppConfigurationSettings.cs b/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Settings/AppConfigurationSettings.cs index 4200a32..26f6b7a 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Settings/AppConfigurationSettings.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Models/Parser/Settings/AppConfigurationSettings.cs @@ -26,7 +26,6 @@ public class AppConfigurationSettings public string? ClientSecret { get; set; } public string? TenantId { get; set; } public string EventHubConsumerGroup { get; set; } = "read"; - public bool usePurviewTypes { get; set; } = false; public bool useResourceSet { get; set; } = true; public string AuthEndPoint { get; set; } = "https://login.microsoftonline.com/"; public string Authority diff --git a/function-app/adb-to-purview/src/Function.Domain/Services/IOlToPurviewParsingService.cs b/function-app/adb-to-purview/src/Function.Domain/Services/IOlToPurviewParsingService.cs index 37fbce4..8c52348 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Services/IOlToPurviewParsingService.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Services/IOlToPurviewParsingService.cs @@ -2,12 +2,13 @@ // Licensed under the MIT License. using System.Threading.Tasks; +using Function.Domain.Helpers; using Function.Domain.Models.OL; namespace Function.Domain.Services { public interface IOlToPurviewParsingService { - public string? GetPurviewFromOlEvent(EnrichedEvent eventData); + public string? GetPurviewFromOlEvent(EnrichedEvent eventData, IDatabricksToPurviewParser parser); } } \ No newline at end of file diff --git a/function-app/adb-to-purview/src/Function.Domain/Services/IPurviewIngestion.cs b/function-app/adb-to-purview/src/Function.Domain/Services/IPurviewIngestion.cs index eb75157..5c44432 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Services/IPurviewIngestion.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Services/IPurviewIngestion.cs @@ -3,12 +3,13 @@ using System.Threading.Tasks; using Newtonsoft.Json.Linq; +using Function.Domain.Helpers; namespace Function.Domain.Services { public interface IPurviewIngestion { - public Task SendToPurview(JArray Processes); - public Task SendToPurview(JObject json); + public Task SendToPurview(JArray Processes, IColParser colParser); + public Task SendToPurview(JObject json, IColParser colParser); } } \ No newline at end of file diff --git a/function-app/adb-to-purview/src/Function.Domain/Services/OlConsolodateEnrich.cs b/function-app/adb-to-purview/src/Function.Domain/Services/OlConsolodateEnrich.cs index b0868c6..193f2aa 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Services/OlConsolodateEnrich.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Services/OlConsolodateEnrich.cs @@ -101,10 +101,12 @@ public OlConsolodateEnrich( else { var enrichedEvent = await olEnrichMessage.GetEnrichedEvent(consolodatedEvent); + if (enrichedEvent == null) { return null; } + return enrichedEvent; } } diff --git a/function-app/adb-to-purview/src/Function.Domain/Services/OlToPurviewParsingService.cs b/function-app/adb-to-purview/src/Function.Domain/Services/OlToPurviewParsingService.cs index b8c6199..2d74847 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Services/OlToPurviewParsingService.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Services/OlToPurviewParsingService.cs @@ -42,7 +42,7 @@ 
public OlToPurviewParsingService(ILoggerFactory loggerFactory, IConfiguration co /// /// Contains OpenLineage and, optionally data obtained from the ADB Jobs API /// Serialized Atlas entities - public string? GetPurviewFromOlEvent(EnrichedEvent eventData) + public string? GetPurviewFromOlEvent(EnrichedEvent eventData, IDatabricksToPurviewParser parser) { if (!verifyEventData(eventData)) { @@ -50,8 +50,6 @@ public OlToPurviewParsingService(ILoggerFactory loggerFactory, IConfiguration co return null; } - IDatabricksToPurviewParser parser = new DatabricksToPurviewParser(_loggerFactory, _config, eventData); - if (eventData.IsInteractiveNotebook) { return ParseInteractiveNotebook(parser); diff --git a/function-app/adb-to-purview/src/Function.Domain/Services/PurviewIngestion.cs b/function-app/adb-to-purview/src/Function.Domain/Services/PurviewIngestion.cs index a7a2c02..a121765 100644 --- a/function-app/adb-to-purview/src/Function.Domain/Services/PurviewIngestion.cs +++ b/function-app/adb-to-purview/src/Function.Domain/Services/PurviewIngestion.cs @@ -14,6 +14,7 @@ using System.Security.Cryptography; using System.Runtime.Caching; using Function.Domain.Models.Settings; +using Newtonsoft.Json; namespace Function.Domain.Services { @@ -23,19 +24,15 @@ namespace Function.Domain.Services /// public class PurviewIngestion : IPurviewIngestion { - private bool useResourceSet = bool.Parse(Environment.GetEnvironmentVariable("useResourceSet") ?? "true"); - private bool usePurviewTypes = bool.Parse(Environment.GetEnvironmentVariable("usePurviewTypes") ?? "false"); private PurviewClient _purviewClient; private Int64 initGuid = -1000; - //stores all mappings of columns for Origin and destination assets - private Hashtable columnmapping = new Hashtable(); //flag use to mark if a data Asset is a Dummy type - private Dictionary entities = new Dictionary(); + private Dictionary entitiesMarkedForDeletion = new Dictionary(); + private Dictionary originalFqnToDiscoveredFqn = new Dictionary(); List inputs_outputs = new List(); private JArray to_purview_Json = new JArray(); private readonly ILogger _logger; - private List found_entities = new List(); - private MemoryCache _payLoad = MemoryCache.Default; + private MemoryCache _cacheOfSeenEvents = MemoryCache.Default; private AppConfigurationSettings? config = new AppConfigurationSettings(); private CacheItemPolicy cacheItemPolicy; /// @@ -58,12 +55,12 @@ public PurviewIngestion(ILogger log) /// /// Array of Entities /// Array on Entities - public async Task SendToPurview(JArray Processes) + public async Task SendToPurview(JArray Processes, IColParser colParser) { foreach (JObject process in Processes) { - if (await SendToPurview(process)) + if (await SendToPurview(process, colParser)) { return new JArray(); } @@ -75,76 +72,83 @@ public async Task SendToPurview(JArray Processes) /// /// Json Object /// Boolean - public async Task SendToPurview(JObject json) + public async Task SendToPurview(JObject json, IColParser colParser) { - var entities = get_attribute("entities", json); + var entitiesFromInitialJson = get_attribute("entities", json); - if (entities == null) + if (entitiesFromInitialJson == null) { - Log("Error", "Not found Attribute entities on " + json.ToString()); + _logger.LogError("Not found Attribute entities on " + json.ToString()); return false; } - string ? dataEvent = CalculateHash(entities.ToString()); - if (!_payLoad.Contains(dataEvent)) + // This hash and cache helps to prevent processing the same event multiple times + string ? 
dataEvent = CalculateHash(entitiesFromInitialJson.ToString()); + if (!_cacheOfSeenEvents.Contains(dataEvent)) { var cacheItem = new CacheItem(dataEvent, dataEvent); - _payLoad.Add(cacheItem, cacheItemPolicy); + _cacheOfSeenEvents.Add(cacheItem, cacheItemPolicy); - foreach (JObject entity in entities) + foreach (JObject purviewEntityToBeUpdated in entitiesFromInitialJson) { - - - if (Validate_Process_Json(entity)) + if (IsProcessEntity(purviewEntityToBeUpdated)) { - JObject new_entity = await Validate_Process_Entities(entity); + JObject new_entity = await Validate_Process_Entities(purviewEntityToBeUpdated); + // Update Column mapping attribute based on the dictionary and inject the column parser with the openlineage event + // This lets us use the discovered inputs / outputs rather than just what open lineage provides. + string columnMapping = JsonConvert.SerializeObject(colParser.GetColIdentifiers(originalFqnToDiscoveredFqn)); + new_entity["attributes"]!["columnMapping"] = columnMapping; to_purview_Json.Add(new_entity); } else { - if (Validate_Entities_Json(entity)) + if (EntityAttributesHaveBeenPopulated(purviewEntityToBeUpdated)) { - PurviewCustomType new_entity = await Validate_Entities(entity); - //Check Entity Relatioship - // if (new_entity.is_dummy_asset) - // to_purview_Json.Add(new_entity.Properties); + PurviewCustomType new_entity = await Validate_Entities(purviewEntityToBeUpdated); - string qualifiedName = entity["attributes"]!["qualifiedName"]!.ToString(); - if (entity.ContainsKey("relationshipAttributes")) + if (purviewEntityToBeUpdated.ContainsKey("relationshipAttributes")) { - foreach (var rel in entity["relationshipAttributes"]!.Values()) + // For every relationship attribute + foreach (var rel in purviewEntityToBeUpdated["relationshipAttributes"]!.Values()) { - if (((JObject)(entity["relationshipAttributes"]![rel!.Name]!)).ContainsKey("qualifiedName")) + // If the relationship attribute has a qualified name property + if (((JObject)(purviewEntityToBeUpdated["relationshipAttributes"]![rel!.Name]!)).ContainsKey("qualifiedName")) { - if (this.entities.ContainsKey(entity["relationshipAttributes"]![rel!.Name]!["qualifiedName"]!.ToString())) + string _qualifiedNameOfRelatedAsset = purviewEntityToBeUpdated["relationshipAttributes"]![rel!.Name]!["qualifiedName"]!.ToString(); + // If the entitiesMarkedForDeletion dictionary has this related asset + // update the guid of the relationship attribute we're in to be the original one? 
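To make the guid rewrite described above concrete, here is a minimal standalone sketch of the JSON transformation: a relationship attribute that carries only a qualifiedName gets its guid replaced from a lookup of already-discovered assets. The dictionary stands in for the Purview search, and the type name, qualified name, and guid values are invented for the example.

```csharp
using System;
using System.Collections.Generic;
using Newtonsoft.Json.Linq;

public static class RelationshipGuidRewriteSketch
{
    public static void Main()
    {
        // Stand-in for assets already resolved against Purview (qualifiedName -> guid).
        // Names and guids below are invented for the example.
        var discoveredGuids = new Dictionary<string, string>
        {
            ["databricks://adb-123.4.azuredatabricks.net/jobs/42"] = "f1e2d3c4-0000-0000-0000-000000000000"
        };

        var entity = JObject.Parse(@"{
            ""typeName"": ""databricks_task"",
            ""relationshipAttributes"": {
                ""job"": {
                    ""qualifiedName"": ""databricks://adb-123.4.azuredatabricks.net/jobs/42"",
                    ""guid"": ""-1001""
                }
            }
        }");

        foreach (JProperty rel in ((JObject)entity["relationshipAttributes"]!).Properties())
        {
            string? qn = rel.Value["qualifiedName"]?.ToString();
            if (qn != null && discoveredGuids.TryGetValue(qn, out var guid))
            {
                // Swap the placeholder guid for the one found in the catalog.
                rel.Value["guid"] = guid;
            }
        }

        Console.WriteLine(entity["relationshipAttributes"]!["job"]!["guid"]);
    }
}
```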
+ if (this.entitiesMarkedForDeletion.ContainsKey(_qualifiedNameOfRelatedAsset)) { - entity["relationshipAttributes"]![rel!.Name]!["guid"] = this.entities[entity["relationshipAttributes"]![rel!.Name]!["qualifiedName"]!.ToString()].Properties["guid"]; + purviewEntityToBeUpdated["relationshipAttributes"]![rel!.Name]!["guid"] = this.entitiesMarkedForDeletion[_qualifiedNameOfRelatedAsset].Properties["guid"]; } else { - string qn = entity["relationshipAttributes"]![rel!.Name]!["qualifiedName"]!.ToString(); + // This entity is created solely to be able to search for the asset based on qualifiedName PurviewCustomType sourceEntity = new PurviewCustomType("search relationship" , "" - , qn + , _qualifiedNameOfRelatedAsset , "" , "search relationship" - , NewGuid() + , NewGuid() // This will be updated after successfully finding the asset via query in purview , _logger , _purviewClient); - + // TODO This should use the qualifiedNamePrefix filter + // Currently fqn may change here QueryValeuModel sourceJson = await sourceEntity.QueryInPurview(); - if (!this.entities.ContainsKey(qn)) - this.entities.Add(qn, sourceEntity); - entity["relationshipAttributes"]![rel!.Name]!["guid"] = sourceEntity.Properties["guid"]; - + // If the related asset has not been seen, add it to the list of assets to be deleted? + if (!this.entitiesMarkedForDeletion.ContainsKey(_qualifiedNameOfRelatedAsset)) + this.entitiesMarkedForDeletion.Add(_qualifiedNameOfRelatedAsset, sourceEntity); + // Update the guid of the relationship attribute with the one that was discovered () + // TODO Handle when sourceJson does not return a typed asset + purviewEntityToBeUpdated["relationshipAttributes"]![rel!.Name]!["guid"] = sourceEntity.Properties["guid"]; } } } } - to_purview_Json.Add(entity); + to_purview_Json.Add(purviewEntityToBeUpdated); } } } @@ -159,53 +163,52 @@ public async Task SendToPurview(JObject json) { if (newEntity.is_dummy_asset) { - if (!usePurviewTypes) - newEntity.Properties["attributes"]!["qualifiedName"] = newEntity.Properties["attributes"]!["qualifiedName"]!.ToString().ToLower(); + newEntity.Properties["attributes"]!["qualifiedName"] = newEntity.Properties["attributes"]!["qualifiedName"]!.ToString().ToLower(); tempEntities.Add(newEntity.Properties); } } payload = "{\"entities\": " + tempEntities.ToString() + "}"; JObject? 
Jpayload = JObject.Parse(payload); - Log("Info", $"Input/Output Entities to load: {Jpayload.ToString()}"); + _logger.LogInformation($"Input/Output Entities to load: {Jpayload.ToString()}"); results = await _purviewClient.Send_to_Purview(payload); if (results != null) { if (results.ReasonPhrase != "OK") { - Log("Error", $"Error Loading Input/Outputs to Purview: Return Code: {results.StatusCode} - Reason:{results.ReasonPhrase}"); + _logger.LogError($"Error Loading Input/Outputs to Purview: Return Code: {results.StatusCode} - Reason:{results.ReasonPhrase}"); } else { var data = await results.Content.ReadAsStringAsync(); - Log("Info", $"Purview Loaded Relationship, Input and Output Entities: Return Code: {results.StatusCode} - Reason:{results.ReasonPhrase} - Content: {data}"); + _logger.LogInformation($"Purview Loaded Relationship, Input and Output Entities: Return Code: {results.StatusCode} - Reason:{results.ReasonPhrase} - Content: {data}"); } } else { - Log("Error", $"Error Loading to Purview!"); + _logger.LogError($"Error Loading to Purview!"); } } if (to_purview_Json.Count > 0) { - Log("Debug", to_purview_Json.ToString()); + _logger.LogDebug(to_purview_Json.ToString()); payload = "{\"entities\": " + to_purview_Json.ToString() + "}"; JObject? Jpayload = JObject.Parse(payload); - Log("Info", $"To Purview Json Entities to load: {Jpayload.ToString()}"); + _logger.LogInformation($"To Purview Json Entities to load: {Jpayload.ToString()}"); results = await _purviewClient.Send_to_Purview(payload); if (results != null) { if (results.ReasonPhrase != "OK") { - Log("Error", $"Error Loading to Purview JSON Entiitesto Purview: Return Code: {results.StatusCode} - Reason:{results.ReasonPhrase}"); + _logger.LogError($"Error Loading to Purview JSON Entiitesto Purview: Return Code: {results.StatusCode} - Reason:{results.ReasonPhrase}"); } } else { - Log("Error", $"Error Loading to Purview!"); + _logger.LogError($"Error Loading to Purview!"); } - foreach (var entity in this.entities) + foreach (var deletableEntity in this.entitiesMarkedForDeletion) { - await _purviewClient.Delete_Unused_Entity(entity.Key, "purview_custom_connector_generic_entity_with_columns"); + await _purviewClient.Delete_Unused_Entity(deletableEntity.Key, "purview_custom_connector_generic_entity_with_columns"); } return true; } @@ -213,41 +216,37 @@ public async Task SendToPurview(JObject json) { if (json.Count > 0) { - Log("INFO", $"Payload: {json}"); - Log("Error", "Nothing found to load on to Purview, look if the payload is empty."); + _logger.LogInformation($"Payload: {json}"); + _logger.LogError("Nothing found to load on to Purview, look if the payload is empty."); } else { - Log("Error", "No Purview entity to load"); + _logger.LogError("No Purview entity to load"); } - foreach (var entity in this.entities) + foreach (var deletableEntity in this.entitiesMarkedForDeletion) { - await _purviewClient.Delete_Unused_Entity(entity.Key, "purview_custom_connector_generic_entity_with_columns"); + await _purviewClient.Delete_Unused_Entity(deletableEntity.Key, "purview_custom_connector_generic_entity_with_columns"); } return false; } } - Log("INFO", $"Payload already registered in Microsoft Purview: {json.ToString()}"); + _logger.LogInformation($"Payload already registered in Microsoft Purview: {json.ToString()}"); return false; } - private bool Validate_Entities_Json(JObject Process) + private bool EntityAttributesHaveBeenPopulated(JObject questionableEntity) { - if (!Process.ContainsKey("typeName")) + if 
(!questionableEntity.ContainsKey("typeName")) { return false; } - /* if (!Process.ContainsKey("guid")) - { - return false; - }*/ - if (!Process.ContainsKey("attributes")) + if (!questionableEntity.ContainsKey("attributes")) { return false; } - if (Process["attributes"]!.GetType() != typeof(JObject)) + if (questionableEntity["attributes"]!.GetType() != typeof(JObject)) return false; - if (!((JObject)Process["attributes"]!).ContainsKey("qualifiedName")) + if (!((JObject)questionableEntity["attributes"]!).ContainsKey("qualifiedName")) { return false; } @@ -271,20 +270,43 @@ private async Task Validate_Entities(JObject Process) QueryValeuModel sourceJson = await sourceEntity.QueryInPurview(); + // Capture the updated qualified name mapping in case column mapping needs it + originalFqnToDiscoveredFqn[qualifiedName] = sourceEntity.currentQualifiedName(); + Process["guid"] = sourceEntity.Properties["guid"]; + + String proctype = Process["typeName"]!.ToString(); + if (sourceEntity.Properties.ContainsKey("typeName")){ + String sourcetype = sourceEntity.Properties["typeName"]!.ToString(); + _logger.LogInformation($"PQN:{qualifiedName} Process Type name is {proctype} and sourceEntity original TypeName was {sourcetype}"); + }else{ + _logger.LogInformation($"PQN:{qualifiedName} Process Type name is {proctype} and sourceEntity original TypeName was not set"); + } + if (sourceEntity.is_dummy_asset) { + _logger.LogInformation("IN DUMMY ASSET AND ABOUT TO OVERWRITE"); sourceEntity.Properties["typeName"] = Process["typeName"]!.ToString(); - if (!entities.ContainsKey(qualifiedName)) - entities.Add(qualifiedName, sourceEntity); - Log("Info", $"Entity: {qualifiedName} Type: {typename}, Not found, Creating Dummy Entity"); + if (!entitiesMarkedForDeletion.ContainsKey(qualifiedName)) + entitiesMarkedForDeletion.Add(qualifiedName, sourceEntity); + _logger.LogInformation($"Entity: {qualifiedName} Type: {typename}, Not found, Creating Dummy Entity"); return sourceEntity; } - if (!entities.ContainsKey(qualifiedName)) - entities.Add(qualifiedName, sourceEntity); + if (!entitiesMarkedForDeletion.ContainsKey(qualifiedName)) + entitiesMarkedForDeletion.Add(qualifiedName, sourceEntity); return sourceEntity; } + + /// + /// Transform the provided JSON object (an input or output entity for a Purview process). + /// This entity will have their qualified name and type updated based on searching for + /// an existing entity in the purview instance. + /// In addition the entity is added to the inputs_outputs property of PurviewIngestion. 
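The `originalFqnToDiscoveredFqn` dictionary populated in the method below exists so that column lineage can later be rewritten against whatever qualified name Purview actually returned (for example the blob variant of a dfs path, or a resource set). A minimal sketch of that substitution in C#, using made-up qualified names and plain strings instead of the ColParser types:

```csharp
using System;
using System.Collections.Generic;

public static class ColumnMappingRemapSketch
{
    public static void Main()
    {
        // OpenLineage-derived qualified name -> qualified name discovered in Purview.
        // Both values are invented for the example.
        var originalToMatchedFqn = new Dictionary<string, string>
        {
            ["https://examplesa.dfs.core.windows.net/raw/sales.csv"] =
                "https://examplesa.blob.core.windows.net/raw/sales.csv"
        };

        // One (source, sink) pair as the column parser would emit it.
        string sink = "https://examplesa.dfs.core.windows.net/raw/sales.csv";
        string source = "https://examplesa.dfs.core.windows.net/landing/sales.csv";

        // Same substitution idea: prefer the discovered name when a match exists.
        sink = originalToMatchedFqn.TryGetValue(sink, out var mappedSink) ? mappedSink : sink;
        source = originalToMatchedFqn.TryGetValue(source, out var mappedSource) ? mappedSource : source;

        Console.WriteLine($"sink:   {sink}");   // rewritten to the blob-side qualified name
        Console.WriteLine($"source: {source}"); // unchanged, nothing discovered for it
    }
}
```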
+ /// + /// Json Object + /// Should be either 'inputs' or 'outputs' + /// A PurviewCustomType private async Task SetOutputInput(JObject outPutInput, string inorout) { @@ -306,20 +328,16 @@ private async Task SetOutputInput(JObject outPutInput, string , _purviewClient); QueryValeuModel sourceJson = await sourceEntity.QueryInPurview(); + // Capture the updated qualified name mapping in case column mapping needs it + originalFqnToDiscoveredFqn[qualifiedName] = sourceEntity.currentQualifiedName(); + if (sourceEntity.is_dummy_asset) { - if (usePurviewTypes) - { - outPutInput["typeName"] = originalTypeName; - sourceEntity.Properties["typeName"] = originalTypeName; - } - else - { - outPutInput["typeName"] = sourceEntity.Properties["typeName"]; - outPutInput["uniqueAttributes"]!["qualifiedName"] = sourceEntity.Properties!["attributes"]!["qualifiedName"]!.ToString().ToLower(); - } + outPutInput["typeName"] = sourceEntity.Properties["typeName"]; + outPutInput["uniqueAttributes"]!["qualifiedName"] = sourceEntity.Properties!["attributes"]!["qualifiedName"]!.ToString().ToLower(); + inputs_outputs.Add(sourceEntity); - Log("Info", $"{inorout} Entity: {qualifiedName} Type: {typename}, Not found, Creating Dummy Entity"); + _logger.LogInformation($"{inorout} Entity: {qualifiedName} Type: {typename}, Not found, Creating Dummy Entity"); } else { @@ -327,8 +345,8 @@ private async Task SetOutputInput(JObject outPutInput, string outPutInput["typeName"] = sourceEntity.Properties!["typeName"]!.ToString(); } - if (!entities.ContainsKey(qualifiedName)) - entities.Add(qualifiedName, sourceEntity); + if (!entitiesMarkedForDeletion.ContainsKey(qualifiedName)) + entitiesMarkedForDeletion.Add(qualifiedName, sourceEntity); return sourceEntity; } @@ -369,7 +387,7 @@ private async Task Validate_Process_Entities(JObject Process) string[] tmpName = qualifiedName.Split('/'); Name = tmpName[tmpName.Length - 1]; typename = "purview_custom_connector_generic_entity_with_columns"; - if (!entities.ContainsKey(qualifiedName)) + if (!entitiesMarkedForDeletion.ContainsKey(qualifiedName)) { PurviewCustomType sourceEntity = new PurviewCustomType(Name @@ -383,55 +401,39 @@ private async Task Validate_Process_Entities(JObject Process) var outputObj = await sourceEntity.QueryInPurview(); + // Capture the updated qualified name mapping in case column mapping needs it + originalFqnToDiscoveredFqn[qualifiedName] = sourceEntity.currentQualifiedName(); + Process["relationshipAttributes"]![rel!.Name]!["guid"] = sourceEntity.Properties["guid"]; - if (!entities.ContainsKey(qualifiedName)) - entities.Add(qualifiedName, sourceEntity); + if (!entitiesMarkedForDeletion.ContainsKey(qualifiedName)) + entitiesMarkedForDeletion.Add(qualifiedName, sourceEntity); } else { - Process["relationshipAttributes"]![rel!.Name]!["guid"] = entities[qualifiedName].Properties["guid"]; + Process["relationshipAttributes"]![rel!.Name]!["guid"] = entitiesMarkedForDeletion[qualifiedName].Properties["guid"]; } } } return Process; } - private async Task Validate_Resource_Set(string qualifiedName) - { - string[] tmpName = qualifiedName.Split('/'); - string Name = tmpName[tmpName.Length - 1]; - if (Name == "") - Name = tmpName[tmpName.Length - 2]; - string typeName = "azure_datalake_gen2_resource_set"; - PurviewCustomType sourceEntity = new PurviewCustomType(Name - , typeName - , qualifiedName - , typeName - , $"Data Assets {Name}" - , NewGuid() - , _logger - , _purviewClient); - - var outputObj = await sourceEntity.QueryInPurview(); - return sourceEntity; - } - private bool 
Validate_Process_Json(JObject Process) + private bool IsProcessEntity(JObject Process) { var _typename = get_attribute("typeName", Process); if (_typename == null) { - Log("Info", "Not found Attribute typename on " + Process.ToString()); + _logger.LogInformation("Not found Attribute typename on " + Process.ToString()); return false; } var _attributes = get_attribute("attributes", Process); if (!_attributes.HasValues) { - Log("Error", "Not found Attribute attributes on " + Process.ToString()); + _logger.LogError("Not found Attribute attributes on " + Process.ToString()); return false; } if (!((JObject)Process["attributes"]!).ContainsKey("columnMapping")) { - Log("Info", $"Not found Attribute columnMapping on {Process.ToString()} i is not a Process Entity!"); + _logger.LogInformation($"Not found Attribute columnMapping on {Process.ToString()} is not a Process Entity!"); return false; } @@ -460,31 +462,6 @@ private Int64 NewGuid() return initGuid--; } - - private void Remove_Unused_Dummy_Entitites() - { - foreach (var entity in this.entities) - { - - } - } - - private void Log(string type, string msg) - { - if (type.ToUpper() == "ERROR") - { _logger.LogError(msg); return; } - if (type.ToUpper() == "INFO") - { _logger.LogInformation(msg); return; } - if (type.ToUpper() == "DEBUG") - { _logger.LogDebug(msg); return; } - if (type.ToUpper() == "WARNING") - { _logger.LogWarning(msg); return; } - if (type.ToUpper() == "CRITICAL") - { _logger.LogCritical(msg); return; } - if (type.ToUpper() == "TRACE") - { _logger.LogInformation(msg); return; } - } - private static string CalculateHash(string payload) { var newKey = Encoding.UTF8.GetBytes(payload); @@ -502,12 +479,4 @@ private static string CalculateHash(string payload) } } - /// - /// Enumeration of the Microsoft Purview Process entity relationships - /// - public enum Relationships_Type - { - inputs, - outputs - } } \ No newline at end of file diff --git a/function-app/adb-to-purview/src/Functions/PurviewOut.cs b/function-app/adb-to-purview/src/Functions/PurviewOut.cs index cc24cdd..7a7588e 100644 --- a/function-app/adb-to-purview/src/Functions/PurviewOut.cs +++ b/function-app/adb-to-purview/src/Functions/PurviewOut.cs @@ -8,6 +8,8 @@ using Function.Domain.Services; using Newtonsoft.Json; using Newtonsoft.Json.Linq; +using Function.Domain.Helpers; +using Microsoft.Extensions.Configuration; namespace AdbToPurview.Function { @@ -18,8 +20,9 @@ public class PurviewOut private readonly IOlConsolodateEnrich _olConsolodateEnrich; private readonly IOlToPurviewParsingService _olToPurviewParsingService; private readonly IPurviewIngestion _purviewIngestion; + private readonly IConfiguration _configuration; - public PurviewOut(ILogger logger, IOlToPurviewParsingService olToPurviewParsingService, IPurviewIngestion purviewIngestion, IOlConsolodateEnrich olConsolodateEnrich, ILoggerFactory loggerFactory) + public PurviewOut(ILogger logger, IOlToPurviewParsingService olToPurviewParsingService, IPurviewIngestion purviewIngestion, IOlConsolodateEnrich olConsolodateEnrich, ILoggerFactory loggerFactory, IConfiguration configuration) { logger.LogInformation("Enter PurviewOut"); _logger = logger; @@ -27,6 +30,7 @@ public PurviewOut(ILogger logger, IOlToPurviewParsingService olToPur _olConsolodateEnrich = olConsolodateEnrich; _olToPurviewParsingService = olToPurviewParsingService; _purviewIngestion = purviewIngestion; + _configuration = configuration; } [Function("PurviewOut")] @@ -42,7 +46,9 @@ public async Task Run( _logger.LogInformation($"Start event, duplicate 
event, or no context found - eventData: {input}"); return ""; } - var purviewEvent = _olToPurviewParsingService.GetPurviewFromOlEvent(enrichedEvent); + + IDatabricksToPurviewParser parser = new DatabricksToPurviewParser(_loggerFactory, _configuration, enrichedEvent); + var purviewEvent = _olToPurviewParsingService.GetPurviewFromOlEvent(enrichedEvent, parser); if (purviewEvent == null) { _logger.LogWarning("No Purview Event found"); @@ -52,7 +58,7 @@ public async Task Run( _logger.LogInformation($"PurviewOut-ParserService: {purviewEvent}"); var jObjectPurviewEvent = JsonConvert.DeserializeObject(purviewEvent) ?? new JObject(); _logger.LogInformation("Calling SendToPurview"); - await _purviewIngestion.SendToPurview(jObjectPurviewEvent); + await _purviewIngestion.SendToPurview(jObjectPurviewEvent, parser.GetColumnParser()); return $"Output message created at {DateTime.Now}"; } diff --git a/function-app/adb-to-purview/src/Program.cs b/function-app/adb-to-purview/src/Program.cs index e79b571..08943b6 100644 --- a/function-app/adb-to-purview/src/Program.cs +++ b/function-app/adb-to-purview/src/Program.cs @@ -25,7 +25,7 @@ public static void Main() workerApplication.UseMiddleware(); }) .ConfigureServices(s => - { + { s.AddScoped(); s.AddScoped(); s.AddScoped(); diff --git a/function-app/adb-to-purview/src/adb-to-purview.csproj b/function-app/adb-to-purview/src/adb-to-purview.csproj index 489e48f..88a44a2 100644 --- a/function-app/adb-to-purview/src/adb-to-purview.csproj +++ b/function-app/adb-to-purview/src/adb-to-purview.csproj @@ -14,10 +14,10 @@ - + - + diff --git a/function-app/adb-to-purview/tests/tools/QualifiedNameConfigTester/QualifiedNameConfigTester.csproj b/function-app/adb-to-purview/tests/tools/QualifiedNameConfigTester/QualifiedNameConfigTester.csproj index fbe03c6..2673c90 100644 --- a/function-app/adb-to-purview/tests/tools/QualifiedNameConfigTester/QualifiedNameConfigTester.csproj +++ b/function-app/adb-to-purview/tests/tools/QualifiedNameConfigTester/QualifiedNameConfigTester.csproj @@ -7,8 +7,8 @@ enable - - + + diff --git a/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Helpers/Parser/QnParserTests.cs b/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Helpers/Parser/QnParserTests.cs index aa48fcc..aa0e9f1 100644 --- a/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Helpers/Parser/QnParserTests.cs +++ b/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Helpers/Parser/QnParserTests.cs @@ -91,8 +91,28 @@ public QnParserTests() // DBFS mount trailing slash in def [InlineData("dbfs", "/mnt/purview2", - "https://purviewexamplessa.dfs.core.windows.net/purview2")] - // Azure SQL Non DBO Schema - + "https://purviewexamplessa.dfs.core.windows.net/purview2")] + // DBFS mount with mountpoint containing a sub-directory + [InlineData("dbfs", + "/mnt/x2/foo", + "https://ysa.dfs.core.windows.net/myx2/subdir/foo")] + // DBFS mount with mountpoint containing a sub-directory for blob + [InlineData("dbfs", + "/mnt/blobx2/foo", + "https://ysa.blob.core.windows.net/myx2/subdir/foo")] + // DBFS mount containing a sub-directory + [InlineData("dbfs", + "/mnt/x2/retail", + "https://ysa.dfs.core.windows.net/myx2/subdir/retail")] + // DBFS mount - Shortest String Match containing a sub-directory + [InlineData("dbfs", + "/mnt/x2/abc", + "https://ysa.dfs.core.windows.net/myx2/subdir/abc")] + // DBFS mount - Longest String Match containing a sub-directory + [InlineData("dbfs", + "/mnt/x2/y/abc", + "https://ysa.dfs.core.windows.net/myx2/subdir/y/abc")] + 
//Azure SQL Non DBO Schema - [InlineData("sqlserver://purview-to-adb-sql.database.windows.net;database=purview-to-adb-sqldb;", "[mytest].[tablename.will.mark]", "mssql://purview-to-adb-sql.database.windows.net/purview-to-adb-sqldb/mytest/tablename.will.mark")] @@ -104,6 +124,34 @@ public QnParserTests() [InlineData("sqlserver://purviewadbsynapsews.sql.azuresynapse.net:1433;database=SQLPool1;", "sales.region", "mssql://purviewadbsynapsews.sql.azuresynapse.net/SQLPool1/sales/region")] + // Azure MySQL + [InlineData("mysql://fikz4nmpfka4s.mysql.database.azure.com:3306/mydatabase", + "fruits", + "mysql://fikz4nmpfka4s.mysql.database.azure.com/mydatabase/fruits")] + // Azure Postgres Public + [InlineData("postgresql://gqhfuzgnrmpzw.postgres.database.azure.com:5432/postgres", + "people", + "postgresql://gqhfuzgnrmpzw.postgres.database.azure.com/postgres/public/people")] + // Azure Postgres Non Public + [InlineData("postgresql://gqhfuzgnrmpzw.postgres.database.azure.com:5432/postgres", + "myschema.people", + "postgresql://gqhfuzgnrmpzw.postgres.database.azure.com/postgres/myschema/people")] + // Postgres Public + [InlineData("postgresql://10.2.0.4:5432/postgres", + "table01", + "postgresql://servers/10.2.0.4:5432/dbs/postgres/schemas/public/tables/table01")] + // Postgres Non Public + [InlineData("postgresql://10.2.0.4:5432/postgres", + "myschema.table01", + "postgresql://servers/10.2.0.4:5432/dbs/postgres/schemas/myschema/tables/table01")] + // Azure Data Explorer (Kusto) + [InlineData("azurekusto://qpll4l5hchczm.eastus2.kusto.windows.net/database01", + "table01", + "https://qpll4l5hchczm.eastus2.kusto.windows.net/database01/table01")] + // Cosmos + [InlineData("azurecosmos://6ch4pkm5tpniq.documents.azure.com/dbs/mydatabase", + "/colls/yourcontainer", + "https://6ch4pkm5tpniq.documents.azure.com/dbs/mydatabase/colls/yourcontainer")] public void GetIdentifiers_OlSource_ReturnsPurviewIdentifier(string nameSpace, string name, string expectedResult) { diff --git a/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Helpers/Parser/UnitTestData.cs b/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Helpers/Parser/UnitTestData.cs index 7b36a26..7e436b9 100644 --- a/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Helpers/Parser/UnitTestData.cs +++ b/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Helpers/Parser/UnitTestData.cs @@ -25,7 +25,10 @@ public struct QnParserTestData new MountPoint(){MountPointName="/databricks-results",Source="databricks-results"}, new MountPoint(){MountPointName="/mnt/purview2/",Source="abfss://purview2@purviewexamplessa.dfs.core.windows.net/"}, new MountPoint(){MountPointName="/mnt/x/",Source="abfss://x@xsa.dfs.core.windows.net/"}, - new MountPoint(){MountPointName="/mnt/x/y",Source="abfss://y@ysa.dfs.core.windows.net/"} + new MountPoint(){MountPointName="/mnt/x/y",Source="abfss://y@ysa.dfs.core.windows.net/"}, + new MountPoint(){MountPointName="/mnt/x2/",Source="abfss://myx2@ysa.dfs.core.windows.net/subdir/"}, + new MountPoint(){MountPointName="/mnt/blobx2/",Source="wasbs://myx2@ysa.blob.core.windows.net/subdir/"}, + new MountPoint(){MountPointName="/mnt/adlg1/",Source="adl://gen1.azuredatalakestore.net/subdir/"} }; } } diff --git a/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Services/UnitTestData.cs b/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Services/UnitTestData.cs index 21416b3..0248993 100644 --- a/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Services/UnitTestData.cs +++ 
b/function-app/adb-to-purview/tests/unit-tests/Function.Domain/Services/UnitTestData.cs @@ -17,7 +17,7 @@ public IEnumerator GetEnumerator() , false}; // CompleteNoOutputsInputsFullMessage yield return new object[] {"CompleteNoOutputsInputsFullMessage: 2022-01-12T00:05:56.318 [Information] OpenLineageIn:{\"eventType\":\"COMPLETE\",\"eventTime\":\"2022-01-25T17:52:53.363Z\",\"inputs\":[],\"outputs\":[{\"namespace\":\"dbfs\",\"name\":\"/mnt/raw/DimProduct.parquet\"}],\"producer\":\"https://github.com/OpenLineage/OpenLineage/tree/0.5.0-SNAPSHOT/integration/spark\",\"schemaURL\":\"https://openlineage.io/spec/1-0-2/OpenLineage.json#/$defs/RunEvent\"}" - , false}; + , true}; // CompleteOutputsAndInputsFullMessage yield return new object[] {"CompleteOutputsAndInputsFullMessage: 2022-01-12T00:19:41.550 [Information] OpenLineageIn:{\"eventType\":\"COMPLETE\",\"eventTime\":\"2022-01-25T17:52:53.363Z\",\"inputs\":[{\"namespace\":\"dbfs\",\"name\":\"/mnt/raw/DimProduct.parquet\"}],\"outputs\":[{\"namespace\":\"dbfs\",\"name\":\"/mnt/destination/DimProduct.parquet\"}],\"producer\":\"https://github.com/OpenLineage/OpenLineage/tree/0.5.0-SNAPSHOT/integration/spark\",\"schemaURL\":\"https://openlineage.io/spec/1-0-2/OpenLineage.json#/$defs/RunEvent\"}" , true}; diff --git a/tests/deployment/compare-app-settings.py b/tests/deployment/compare-app-settings.py index 88fc6e0..35c2f8f 100644 --- a/tests/deployment/compare-app-settings.py +++ b/tests/deployment/compare-app-settings.py @@ -56,7 +56,10 @@ app_settings_in_template = None for resource in arm_template["resources"]: if resource["type"] == "Microsoft.Web/sites": - app_settings_in_template = resource["properties"]["siteConfig"]["appSettings"] + for child_resource in resource.get("resources", []): + if child_resource.get("type") == "config" and child_resource.get("name") == "web": + app_settings_in_template = child_resource.get("properties", {}).get("appSettings", []) + break if app_settings_in_template is None: raise ValueError("Unable to extract the Microsoft.web/sites resources") diff --git a/tests/deployment/test_arm_mapping_matches_json.py b/tests/deployment/test_arm_mapping_matches_json.py index 274dd02..9d56c36 100644 --- a/tests/deployment/test_arm_mapping_matches_json.py +++ b/tests/deployment/test_arm_mapping_matches_json.py @@ -22,7 +22,13 @@ if resource["name"] != "[variables('functionAppName')]": continue - app_settings = resource.get("properties", {}).get("siteConfig", {}).get("appSettings", []) + web_config={} + for child_resource in resource.get("resources", []): + if child_resource.get("type") == "config" and child_resource.get("name") == "web": + web_config = child_resource + break + + app_settings = web_config.get("properties", {}).get("appSettings", []) for setting in app_settings: if setting["name"] != "OlToPurviewMappings": continue diff --git a/tests/environment/README.md b/tests/environment/README.md new file mode 100644 index 0000000..4f3714a --- /dev/null +++ b/tests/environment/README.md @@ -0,0 +1,143 @@ +# Deploying the Test Environment + +## Deploying the Connector + +## Deploying the Data Sources + +``` +az deployment group create \ +--template-file ./tests/environment/sources/adlsg2.bicep \ +--resource-group db2pvsasources + +``` + +## Manual Steps + +Create a config.ini file: + +```ini +databricks_workspace_host_id = adb-workspace.id +databricks_personal_access_token = PERSONAL_ACCESS_TOKEN +databricks_spark3_cluster = CLUSTER_ID +databricks_spark2_cluster = CLUSTER_ID +``` + +Assign Service Principal Storage Blob Data 
Contributor to the main ADLS G2 instance + +Add Service Principal as user in Databricks. + +Enable mount points with `./tests/environment/dbfs/mounts.py` + +Install mysql:mysql-connector-java:8.0.30 (version may vary based on cluster config) on the cluster. +Create/reuse a Service Principal for Azure Data Explorer Authentication. Create and save a secret locally. + +Add Key Vault Secrets + * `tenant-id` + * `storage-service-key` + * `azuresql-username` + * `azuresql-password` + * `azuresql-jdbc-conn-str` should be of the form `jdbc:sqlserver://SERVER_NAME.database.windows.net:1433;database=DATABASE_NAME;encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.database.windows.net;loginTimeout=30;` + * `synapse-storage-key` + * `synapse-query-username` + * `synapse-query-password` + * `mysql-username` of the form `username@servername` + * `mysql-password` + * `mysql-hostname` the server name of the Azure MySQL resource + * `postgres-admin-user` should be of the form `username@servername` + * `postgres-admin-password` + * `postgres-host` - the server name of the deployed postgres server + * `azurekusto-appid` + * `azurekusto-appsecret` + * `azurekusto-uri` + + * `azurecosmos-endpoint` + * `azurecosmos-key` +* Update SQL Db and Synapse Server with AAD Admin +* Add Service Principal for Databricks to connect to SQL sources +* Assign the Service Principal admin role on the ADX cluster. [Guide](https://learn.microsoft.com/en-us/azure/data-explorer/provision-azure-ad-app#grant-the-service-principal-access-to-an-azure-data-explorer-database) + +Set the following system environments: + +* `SYNAPSE_SERVICE_NAME` +* `STORAGE_SERVICE_NAME` +* `SYNAPSE_STORAGE_SERVICE_NAME` + +Install the version of the [kusto spark connector](https://github.com/Azure/azure-kusto-spark) that matches the cluster Scala and Spark versions from Maven Central. + +Upload notebooks in `./tests/integration/spark-apps/notebooks/` to dbfs' `/Shared/examples/` +* Manually for now. TODO: Automate this in Python + +Install the following libraries on the compute cluster (versions to match the Spark and Scala versions of the cluster) (TODO: Automate): +* Cosmos spark connector + + +Compile the following apps and upload them to `/dbfs/FileStore/testcases/` + +* `./tests/integration/spark-apps/jarjobs/abfssInAbfssOut/` with `./gradlew build` +* `./tests/integration/spark-apps/pythonscript/pythonscript.py` by just uploading. 
+* `./tests/integration/spark-apps/wheeljobs/abfssintest/` with `python -m build` + +Upload the job definitions using the python script `python .\tests\environment\dbfs\create-job.py` + +## Github Actions + +* AZURE_CLIENT_ID +* AZURE_CLIENT_SECRET +* AZURE_TENANT_ID +* INT_AZ_CLI_CREDENTIALS + ```json + { + "clientId": "xxxx", + "clientSecret": "yyyy", + "subscriptionId": "zzzz", + "tenantId": "μμμμ", + "activeDirectoryEndpointUrl": "https://login.microsoftonline.com", + "resourceManagerEndpointUrl": "https://management.azure.com/", + "activeDirectoryGraphResourceId": "https://graph.windows.net/", + "sqlManagementEndpointUrl": "https://management.core.windows.net:8443/", + "galleryEndpointUrl": "https://gallery.azure.com/", + "managementEndpointUrl": "https://management.core.windows.net/" + } + ``` +* INT_DATABRICKS_ACCESS_TOKEN +* INT_DATABRICKS_WKSP_ID: adb-xxxx.y +* INT_FUNC_NAME +* INT_PUBLISH_PROFILE from the Azure Function's publish profile XML +* INT_PURVIEW_NAME +* INT_RG_NAME +* INT_SUBSCRIPTION_ID +* INT_SYNAPSE_SQLPOOL_NAME +* INT_SYNAPSE_WKSP_NAME +* INT_SYNAPSE_WKSP_NAME + +## config.json + +```json +{ + "datasets":{ + "datasetName": { + "schema": [ + "field1", + "field2" + ], + "data": [ + [ + "val1", + "val2" + ] + ] + } + }, + "jobs": { + "job-name": [ + [ + ("storage"|"sql"|"noop"), + ("csv"|"delta"|"azuresql"|"synapse"), + "rawdata/testcase/one/", + "exampleInputA" + ] + ] + } +} + +``` diff --git a/tests/environment/config.json b/tests/environment/config.json new file mode 100644 index 0000000..79e4721 --- /dev/null +++ b/tests/environment/config.json @@ -0,0 +1,366 @@ +{ + "dataset": { + "exampleInputA": { + "schema": [ + "id", + "postalCode", + "street" + ], + "data": [ + [ + 1, + "555", + "742 Evergreen Terrace" + ] + ] + }, + "exampleInputB": { + "schema": [ + "id", + "city", + "stateAbbreviation" + ], + "data": [ + [ + 1, + "Springfield", + "??" 
+ ] + ] + } + }, + "jobs": { + "jarjobs-abfssInAbfssOut": [ + [ + "storage", + "csv", + "rawdata/testcase/eighteen/", + "exampleInputA" + ] + ], + "pythonscript-pythonscript.py": [ + [ + "storage", + "csv", + "rawdata/testcase/twenty/", + "exampleInputA" + ] + ], + "wheeljobs-abfssintest": [ + [ + "storage", + "csv", + "rawdata/testcase/seventeen/", + "exampleInputA" + ], + [ + "storage", + "csv", + "rawdata/testcase/seventeen/", + "exampleInputB" + ] + ], + "abfss-in-abfss-out-oauth.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/two/", + "exampleInputA" + ], + [ + "storage", + "csv", + "rawdata/testcase/two/", + "exampleInputB" + ] + ], + "abfss-in-abfss-out-root.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/three/", + "exampleInputA" + ], + [ + "storage", + "csv", + "rawdata/testcase/three/", + "exampleInputB" + ] + ], + "abfss-in-abfss-out.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/one/", + "exampleInputA" + ], + [ + "storage", + "csv", + "rawdata/testcase/one/", + "exampleInputB" + ] + ], + "abfss-in-hive+notmgd+saveAsTable-out.scala": [ + [ + "storage", + "delta", + "rawdata/testcase/abfss-in-hive+notmgd+saveAsTable-out/", + "exampleInputA" + ] + ], + "abfss-in-hive+saveAsTable-out.scala": [ + [ + "storage", + "delta", + "rawdata/testcase/abfss-in-hive+saveAsTable-out/", + "exampleInputA" + ] + ], + "azuresql-in-azuresql-out.scala": [ + [ + "azuresql", + "table", + "dbo", + "exampleInputA" + ], + [ + "azuresql", + "table", + "dbo", + "exampleInputB" + ], + [ + "azuresql", + "table", + "dbo.exampleInputC" + ], + [ + "azuresql", + "table", + "dbo.exampleOutput" + ] + ], + "call-via-adf-spark2.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/thirteen/", + "exampleInputA" + ], + [ + "storage", + "csv", + "rawdata/testcase/thirteen/", + "exampleInputB" + ] + ], + "call-via-adf-spark3.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/fourteen/", + "exampleInputA" + ], + [ + "storage", + "csv", + "rawdata/testcase/fourteen/", + "exampleInputB" + ] + ], + "delta-in-delta-merge.scala": [ + [ + "storage", + "delta", + "rawdata/testcase/sixteen/", + "exampleInputA" + ], + [ + "storage", + "delta", + "rawdata/testcase/sixteen/", + "exampleInputB" + ] + ], + "delta-in-delta-out-abfss.scala": [ + [ + "storage", + "delta", + "rawdata/testcase/four/", + "exampleInputA" + ], + [ + "storage", + "delta", + "rawdata/testcase/four/", + "exampleInputB" + ] + ], + "delta-in-delta-out-fs.scala": [ + [ + "storage", + "delta", + "rawdata/testcase/five/", + "exampleInputA" + ], + [ + "storage", + "delta", + "rawdata/testcase/five/", + "exampleInputB" + ] + ], + "delta-in-delta-out-mnt.scala": [ + [ + "storage", + "delta", + "rawdata/testcase/six/", + "exampleInputA" + ], + [ + "storage", + "delta", + "rawdata/testcase/six/", + "exampleInputB" + ] + ], + "hive-in-hive-out-insert.py": [ + [ + "noop" + ] + ], + "hive+abfss-in-hive+abfss-out-insert.py": [ + [ + "storage", + "delta", + "rawdata/testcase/twentyone/", + "exampleInputA" + ] + ], + "hive+mgd+not+default-in-hive+mgd+not+default-out-insert.py": [ + [ + "noop" + ] + ], + "hive+mnt-in-hive+mnt-out-insert.py": [ + [ + "noop" + ] + ], + "intermix-languages.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/fifteen/", + "exampleInputA" + ], + [ + "storage", + "csv", + "rawdata/testcase/fifteen/", + "exampleInputB" + ] + ], + "mnt-in-mnt-out.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/seven/", + "exampleInputA" + ], + [ + "storage", + "csv", + "rawdata/testcase/seven/", + "exampleInputB" + ] + ], + 
"name-with-periods.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/namewithperiods/", + "exampleInputA" + ] + ], + "nested-child.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/eight/", + "exampleInputA" + ] + ], + "nested-parent.scala": [ + [ + "noop" + ] + ], + "spark-sql-table-in-abfss-out.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/nine/", + "exampleInputB" + ] + ], + "synapse-in-synapse-out.scala": [ + [ + "synapse", + "table", + "dbo", + "exampleInputA" + ], + [ + "synapse", + "table", + "Sales", + "Region" + ] + ], + "synapse-in-wasbs-out.scala": [ + [ + "synapse", + "table", + "dbo", + "exampleInputA" + ], + [ + "synapse", + "table", + "dbo", + "exampleInputB" + ] + ], + "synapse-wasbs-in-synapse-out.scala": [ + [ + "synapse", + "table", + "dbo", + "exampleInputA" + ], + [ + "storage", + "csv", + "rawdata/testcase/eleven/", + "exampleInputA" + ] + ], + "wasbs-in-wasbs-out.scala": [ + [ + "storage", + "csv", + "rawdata/testcase/wasinwasout/", + "exampleInputA" + ], + [ + "storage", + "csv", + "rawdata/testcase/wasinwasout/", + "exampleInputB" + ] + ] + } +} \ No newline at end of file diff --git a/tests/environment/datasets/azsql.sql b/tests/environment/datasets/azsql.sql new file mode 100644 index 0000000..8eeba28 --- /dev/null +++ b/tests/environment/datasets/azsql.sql @@ -0,0 +1,27 @@ +CREATE SCHEMA nondbo + +CREATE TABLE nondbo.exampleInputC ( +id int +,cityPopulation int +) + +CREATE TABLE dbo.exampleInputB ( +id int +,city varchar(30) +,stateAbbreviation varchar(2) +) + +CREATE TABLE dbo.exampleInputA ( +id int +,postalcode varchar(5) +,street varchar(50) +) + +INSERT INTO nondbo.exampleInputC(id, cityPopulation) +VALUES(1, 1000) + +INSERT INTO dbo.exampleInputB(id, city, stateAbbreviation) +VALUES(1, 'Springfield', '??') + +INSERT INTO dbo.exampleInputA(id, postalcode, street) +VALUES(1, '55555', '742 Evergreen Terrace') diff --git a/tests/environment/datasets/make-data.py b/tests/environment/datasets/make-data.py new file mode 100644 index 0000000..21fb1a0 --- /dev/null +++ b/tests/environment/datasets/make-data.py @@ -0,0 +1,112 @@ +import argparse +import configparser +from io import BytesIO +import json +import pathlib +import re + + +from azure.identity import DefaultAzureCredential +from azure.storage.blob import BlobServiceClient, BlobClient + +def make_or_get_connection_client(connection_string, cached_connections, **kwargs): + if connection_string in cached_connections: + return cached_connections[connection_string] + + elif re.search(r'EndpointSuffix=', connection_string): # Is Blob + _client = BlobServiceClient.from_connection_string(connection_string) + cached_connections[connection_string] = _client + return _client + else: + raise NotImplementedError("Connection String not supported") + + +def make_and_upload_data(client, storage_path, dataset_name, storage_format, data): + if isinstance(client, BlobServiceClient): + blob_full_path = pathlib.Path(storage_path) + container = blob_full_path.parts[0] + blob_relative_path = '/'.join(list(blob_full_path.parts[1:])+[dataset_name, dataset_name+"."+storage_format]) + + _blob_client = client.get_blob_client(container, blob_relative_path) + blob_stream = BytesIO() + with blob_stream as fp: + for row in data["data"]: + fp.write(bytes(','.join(str(r) for r in row), encoding="utf-8")) + fp.seek(0) + _blob_client.upload_blob(blob_stream.read(), blob_type="BlockBlob", overwrite=True) + else: + raise NotImplementedError(f"{type(client)} not supported") + + +if __name__ == "__main__": + parser = 
argparse.ArgumentParser() + parser.add_argument("--test_case", "-t", type=str, action="append", help="Name of the test case(s) to be deployed. If not specified, upload all datasets") + parser.add_argument("--config", type=str, help="Path to the json config file", default="./tests/environment/config.json") + parser.add_argument("--ini", type=str, help="Path to the ini config file", default="./tests/environment/config.ini") + args = parser.parse_args() + + # Datasets + ## CSV + ## Parquet + ## Delta + ## SQL + ## COSMOS + ## Kusto + + # Load Test Cases + ## jobs and dataset + _connections = configparser.ConfigParser() + _connections.read(args.ini) + + with open(args.config, 'r') as fp: + _config = json.load(fp) + TEST_JOBS = _config["jobs"] + TEST_DATASET = _config["dataset"] + + # Filter based on test cases provided + if args.test_case: + print(args.test_case) + jobs_to_build_data = {k:v for k,v in TEST_JOBS.items() if k in args.test_case} + else: + jobs_to_build_data = TEST_JOBS + + + # Make the data only one time + cached_data = {} + # Make the connections only one time + cached_connections = {} + # Iterate over every job and build the dataset + for job_name, dataset_def in jobs_to_build_data.items(): + if len(dataset_def) == 0 or dataset_def[0] == ["noop"]: + print(f"{job_name}: skipped") + continue + + for dataset in dataset_def: + _connection_name = dataset[0] + _storage_format = dataset[1] + _storage_path = dataset[2] + _dataset_name = dataset[3] + + print(f"{job_name}: {_storage_path}") + + _connection_string = _connections["DEFAULT"][_connection_name+"_connection_string"] + + _client = make_or_get_connection_client(_connection_string, cached_connections) + + _data = TEST_DATASET[_dataset_name] + + make_and_upload_data( + _client, + _storage_path, + _dataset_name, + _storage_format, + _data + ) + + + + # Check which storage engine is necessary + # Check what format the data will be stored in + # Check the pat + + \ No newline at end of file diff --git a/tests/environment/datasets/sqlpool.sql b/tests/environment/datasets/sqlpool.sql new file mode 100644 index 0000000..3b357c2 --- /dev/null +++ b/tests/environment/datasets/sqlpool.sql @@ -0,0 +1,30 @@ +CREATE MASTER KEY ENCRYPTION BY PASSWORD = 'xxxx' ; /* Necessary for Synapse External tables */ +CREATE SCHEMA Sales + +CREATE TABLE Sales.Region ( +id int +,regionId int +) + +CREATE TABLE dbo.exampleInputB ( +id int +,city varchar(30) +,stateAbbreviation varchar(2) +) + +CREATE TABLE dbo.exampleInputA ( +id int +,postalcode varchar(5) +,street varchar(50) +) + + + +INSERT INTO Sales.Region(id, regionId) +VALUES(1, 1000) + +INSERT INTO dbo.exampleInputB(id, city, stateAbbreviation) +VALUES(1, 'Springfield', '??') + +INSERT INTO dbo.exampleInputA(id, postalcode, street) +VALUES(1, '55555', '742 Evergreen Terrace') diff --git a/tests/environment/dbfs/create-job.py b/tests/environment/dbfs/create-job.py new file mode 100644 index 0000000..3e7f7be --- /dev/null +++ b/tests/environment/dbfs/create-job.py @@ -0,0 +1,48 @@ +# https://learn.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/workspace#--import +import argparse +import configparser +import json +import os + +import requests + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--folder", default="./tests/integration/jobdefs") + parser.add_argument("--ini", default="./tests/environment/config.ini") + args = parser.parse_args() + + cfp = configparser.ConfigParser() + + cfp.read(args.ini) + db_host_id = 
cfp["DEFAULT"]["databricks_workspace_host_id"] + db_pat = cfp["DEFAULT"]["databricks_personal_access_token"] + + JOB_URL = f"https://{db_host_id}.azuredatabricks.net/api/2.1/jobs/create" + for job_def in os.listdir(args.folder): + if not job_def.endswith("-def.json"): + continue + + print(job_def) + with open(os.path.join(args.folder, job_def), 'r') as fp: + job_json = json.load(fp) + + job_str = json.dumps(job_json) + if job_def.startswith("spark2"): + job_str = job_str.replace("", cfp["DEFAULT"]["databricks_spark2_cluster"]) + else: + job_str = job_str.replace("", cfp["DEFAULT"]["databricks_spark3_cluster"]) + + job_json_to_submit = json.loads(job_str) + + resp = requests.post( + url=JOB_URL, + json=job_json_to_submit, + headers={ + "Authorization": f"Bearer {db_pat}" + } + ) + print(resp.content) + + diff --git a/tests/environment/dbfs/mounts.py b/tests/environment/dbfs/mounts.py new file mode 100644 index 0000000..d03a942 --- /dev/null +++ b/tests/environment/dbfs/mounts.py @@ -0,0 +1,34 @@ +# Databricks notebook source +import os + +storage_acct_name = os.environ.get("STORAGE_SERVICE_NAME") +configs = {"fs.azure.account.auth.type": "OAuth", + "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider", + "fs.azure.account.oauth2.client.id": dbutils.secrets.get("purview-to-adb-kv", 'clientIdKey'), + "fs.azure.account.oauth2.client.secret": dbutils.secrets.get("purview-to-adb-kv", 'clientSecretKey'), + "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{dbutils.secrets.get('purview-to-adb-kv', 'tenant-id')}/oauth2/token"} + +# COMMAND ---------- + +# Optionally, you can add to the source URI of your mount point. +try: + dbutils.fs.mount( + source = f"abfss://rawdata@{storage_acct_name}.dfs.core.windows.net/", + mount_point = "/mnt/rawdata", + extra_configs = configs) +except Exception as e: + print(e) + +# COMMAND ---------- + +try: + dbutils.fs.mount( + source = f"abfss://outputdata@{storage_acct_name}.dfs.core.windows.net/", + mount_point = "/mnt/outputdata", + extra_configs = configs) +except Exception as e: + print(e) + +# COMMAND ---------- + + diff --git a/tests/environment/requirements.txt b/tests/environment/requirements.txt new file mode 100644 index 0000000..ecca015 --- /dev/null +++ b/tests/environment/requirements.txt @@ -0,0 +1,29 @@ +azure-core==1.26.1 +azure-identity==1.12.0 +azure-storage-blob==12.14.1 +build==0.9.0 +certifi==2022.12.7 +cffi==1.15.1 +charset-normalizer==2.1.1 +colorama==0.4.6 +cryptography==39.0.1 +idna==3.4 +importlib-metadata==5.1.0 +isodate==0.6.1 +msal==1.20.0 +msal-extensions==1.0.0 +msrest==0.7.1 +oauthlib==3.2.2 +packaging==22.0 +pep517==0.13.0 +portalocker==2.6.0 +pycparser==2.21 +PyJWT==2.6.0 +pywin32==305 +requests==2.28.1 +requests-oauthlib==1.3.1 +six==1.16.0 +tomli==2.0.1 +typing_extensions==4.4.0 +urllib3==1.26.13 +zipp==3.11.0 diff --git a/tests/environment/sources/adlsg2.bicep b/tests/environment/sources/adlsg2.bicep new file mode 100644 index 0000000..de69883 --- /dev/null +++ b/tests/environment/sources/adlsg2.bicep @@ -0,0 +1,30 @@ +@description('Location of the data factory.') +param location string = resourceGroup().location + +@description('Name of the Azure storage account that contains the input/output data.') +param storageAccountName string = 'storage${uniqueString(resourceGroup().id)}' + +resource storageAccount 'Microsoft.Storage/storageAccounts@2021-08-01' = { + name: storageAccountName + location: location + sku: { + name: 'Standard_LRS' + } + kind: 
'StorageV2' + properties:{ + isHnsEnabled: true + } + +} + +resource rawdataContainer 'Microsoft.Storage/storageAccounts/blobServices/containers@2021-08-01' = { + name: '${storageAccount.name}/default/rawdata' +} + +resource writeToRootContainer 'Microsoft.Storage/storageAccounts/blobServices/containers@2021-08-01' = { + name: '${storageAccount.name}/default/writetoroot' +} + +resource outputdataContainer 'Microsoft.Storage/storageAccounts/blobServices/containers@2021-08-01' = { + name: '${storageAccount.name}/default/outputdata' +} diff --git a/tests/environment/sources/adx.bicep b/tests/environment/sources/adx.bicep new file mode 100644 index 0000000..02a174b --- /dev/null +++ b/tests/environment/sources/adx.bicep @@ -0,0 +1,33 @@ +@description('Cluster Name for Azure Data Explorer') +param clusterName string = uniqueString('adx', resourceGroup().id) + +@description('Database Name for Azure Data Explorer Cluster') +param databaseName string = 'database01' + +@description('Location for all resources.') +param location string = resourceGroup().location + +resource symbolicname 'Microsoft.Kusto/clusters@2022-11-11' = { + name: clusterName + location: location + sku: { + capacity: 1 + name: 'Dev(No SLA)_Standard_D11_v2' + tier: 'Basic' + } + identity: { + type: 'SystemAssigned' + } + properties: { + enableAutoStop: true + engineType: 'V3' + publicIPType: 'IPv4' + publicNetworkAccess: 'Enabled' + } + resource symbolicname 'databases@2022-11-11' = { + name: databaseName + location: location + kind: 'ReadWrite' + // For remaining properties, see clusters/databases objects + } +} diff --git a/tests/environment/sources/cosmos.bicep b/tests/environment/sources/cosmos.bicep new file mode 100644 index 0000000..bf9bcba --- /dev/null +++ b/tests/environment/sources/cosmos.bicep @@ -0,0 +1,160 @@ +@description('Azure Cosmos DB account name, max length 44 characters') +param accountName string = uniqueString('cosmos', resourceGroup().id) + +@description('Location for the Azure Cosmos DB account.') +param location string = resourceGroup().location + +@allowed([ + 'Eventual' + 'ConsistentPrefix' + 'Session' + 'BoundedStaleness' + 'Strong' +]) +@description('The default consistency level of the Cosmos DB account.') +param defaultConsistencyLevel string = 'Session' + +@minValue(10) +@maxValue(2147483647) +@description('Max stale requests. Required for BoundedStaleness. Valid ranges, Single Region: 10 to 2147483647. Multi Region: 100000 to 2147483647.') +param maxStalenessPrefix int = 100000 + +@minValue(5) +@maxValue(86400) +@description('Max lag time (minutes). Required for BoundedStaleness. Valid ranges, Single Region: 5 to 84600. 
Multi Region: 300 to 86400.') +param maxIntervalInSeconds int = 300 + +@allowed([ + true + false +]) +@description('Enable system managed failover for regions') +param systemManagedFailover bool = false + +@description('The name for the database') +param databaseName string = 'myDatabase' + +@description('The name for the input container') +param inContainerName string = 'myContainer' + +@description('The name for the output container') +param outContainerName string = 'yourContainer' + +@minValue(400) +@maxValue(1000000) +@description('The throughput for the container') +param throughput int = 400 + +var consistencyPolicy = { + Eventual: { + defaultConsistencyLevel: 'Eventual' + } + ConsistentPrefix: { + defaultConsistencyLevel: 'ConsistentPrefix' + } + Session: { + defaultConsistencyLevel: 'Session' + } + BoundedStaleness: { + defaultConsistencyLevel: 'BoundedStaleness' + maxStalenessPrefix: maxStalenessPrefix + maxIntervalInSeconds: maxIntervalInSeconds + } + Strong: { + defaultConsistencyLevel: 'Strong' + } +} +var locations = [ + { + locationName: location + failoverPriority: 0 + isZoneRedundant: false + } +] + +resource account 'Microsoft.DocumentDB/databaseAccounts@2022-05-15' = { + name: toLower(accountName) + location: location + kind: 'GlobalDocumentDB' + properties: { + consistencyPolicy: consistencyPolicy[defaultConsistencyLevel] + locations: locations + databaseAccountOfferType: 'Standard' + enableAutomaticFailover: systemManagedFailover + } +} + +resource database 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases@2022-05-15' = { + name: '${account.name}/${databaseName}' + properties: { + resource: { + id: databaseName + } + } +} + +resource container 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers@2022-05-15' = { + name: '${database.name}/${inContainerName}' + properties: { + resource: { + id: inContainerName + partitionKey: { + paths: [ + '/country' + ] + kind: 'Hash' + } + indexingPolicy: { + indexingMode: 'consistent' + automatic: true + includedPaths: [ + { + path: '/*' + } + ] + excludedPaths: [ + { + path: '/\'_etag\'/?' + } + ] + } + defaultTtl: 86400 + } + options: { + throughput: throughput + } + } +} + +resource container2 'Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers@2022-05-15' = { + name: '${database.name}/${outContainerName}' + properties: { + resource: { + id: outContainerName + partitionKey: { + paths: [ + '/country' + ] + kind: 'Hash' + } + indexingPolicy: { + indexingMode: 'consistent' + automatic: true + includedPaths: [ + { + path: '/*' + } + ] + excludedPaths: [ + { + path: '/\'_etag\'/?' 
+ } + ] + } + defaultTtl: 86400 + } + options: { + throughput: throughput + } + } +} diff --git a/tests/environment/sources/mysql.bicep b/tests/environment/sources/mysql.bicep new file mode 100644 index 0000000..ec32db2 --- /dev/null +++ b/tests/environment/sources/mysql.bicep @@ -0,0 +1,89 @@ +@description('Server Name for Azure database for MySQL') +param serverName string = uniqueString('mysql', resourceGroup().id) + +@description('Database administrator login name') +@minLength(1) +param administratorLogin string + +@description('Database administrator password') +@minLength(8) +@secure() +param administratorLoginPassword string + +@description('Azure database for MySQL compute capacity in vCores (2,4,8,16,32)') +param skuCapacity int = 2 + +@description('Azure database for MySQL sku name ') +param skuName string = 'B_Gen5_2' + +@description('Azure database for MySQL Sku Size ') +param SkuSizeMB int = 5120 + +@description('Azure database for MySQL pricing tier') +@allowed([ + 'Basic' + 'GeneralPurpose' + 'MemoryOptimized' +]) +param SkuTier string = 'Basic' + +@description('Azure database for MySQL sku family') +param skuFamily string = 'Gen5' + +@description('MySQL version') +@allowed([ + '5.6' + '5.7' + '8.0' +]) +param mysqlVersion string = '8.0' + +@description('Location for all resources.') +param location string = resourceGroup().location + +@description('MySQL Server backup retention days') +param backupRetentionDays int = 7 + +@description('Geo-Redundant Backup setting') +param geoRedundantBackup string = 'Disabled' + + +var firewallrules = [ + { + Name: 'rule1' + StartIpAddress: '0.0.0.0' + EndIpAddress: '255.255.255.255' + } +] + +resource mysqlDbServer 'Microsoft.DBforMySQL/servers@2017-12-01' = { + name: serverName + location: location + sku: { + name: skuName + tier: SkuTier + capacity: skuCapacity + size: '${SkuSizeMB}' //a string is expected here but a int for the storageProfile... 
+ family: skuFamily + } + properties: { + createMode: 'Default' + version: mysqlVersion + administratorLogin: administratorLogin + administratorLoginPassword: administratorLoginPassword + storageProfile: { + storageMB: SkuSizeMB + backupRetentionDays: backupRetentionDays + geoRedundantBackup: geoRedundantBackup + } + } +} + +@batchSize(1) +resource firewallRules 'Microsoft.DBforMySQL/servers/firewallRules@2017-12-01' = [for rule in firewallrules: { + name: '${mysqlDbServer.name}/${rule.Name}' + properties: { + startIpAddress: rule.StartIpAddress + endIpAddress: rule.EndIpAddress + } +}] diff --git a/tests/environment/sources/postgres.bicep b/tests/environment/sources/postgres.bicep new file mode 100644 index 0000000..b58b7b2 --- /dev/null +++ b/tests/environment/sources/postgres.bicep @@ -0,0 +1,92 @@ +@description('Server Name for Azure database for PostgreSQL') +param serverName string = uniqueString('postgres', resourceGroup().id) + +@description('Database administrator login name') +@minLength(1) +param administratorLogin string + +@description('Database administrator password') +@minLength(8) +@secure() +param administratorLoginPassword string + +@description('Azure database for PostgreSQL compute capacity in vCores (2,4,8,16,32)') +param skuCapacity int = 2 + +@description('Azure database for PostgreSQL sku name') +param skuName string = 'B_Gen5_2' + +@description('Azure database for PostgreSQL Sku Size') +param skuSizeMB int = 5120 + +@description('Azure database for PostgreSQL pricing tier') +@allowed([ + 'Basic' + 'GeneralPurpose' + 'MemoryOptimized' +]) +param skuTier string = 'Basic' + +@description('Azure database for PostgreSQL sku family') +param skuFamily string = 'Gen5' + +@description('PostgreSQL version') +@allowed([ + '9.5' + '9.6' + '10' + '10.0' + '10.2' + '11' +]) +param postgresqlVersion string = '11' + +@description('Location for all resources.') +param location string = resourceGroup().location + +@description('PostgreSQL Server backup retention days') +param backupRetentionDays int = 7 + +@description('Geo-Redundant Backup setting') +param geoRedundantBackup string = 'Disabled' + +var firewallrules = [ + { + Name: 'rule1' + StartIpAddress: '0.0.0.0' + EndIpAddress: '255.255.255.255' + } +] + +resource server 'Microsoft.DBforPostgreSQL/servers@2017-12-01' = { + name: serverName + location: location + sku: { + name: skuName + tier: skuTier + capacity: skuCapacity + size: '${skuSizeMB}' + family: skuFamily + } + properties: { + createMode: 'Default' + version: postgresqlVersion + administratorLogin: administratorLogin + administratorLoginPassword: administratorLoginPassword + storageProfile: { + storageMB: skuSizeMB + backupRetentionDays: backupRetentionDays + geoRedundantBackup: geoRedundantBackup + } + } + +} + +@batchSize(1) +resource firewallRules 'Microsoft.DBforPostgreSQL/servers/firewallRules@2017-12-01' = [for rule in firewallrules: { + name: '${server.name}/${rule.Name}' + properties: { + startIpAddress: rule.StartIpAddress + endIpAddress: rule.EndIpAddress + } +}] diff --git a/tests/environment/sources/sql.bicep b/tests/environment/sources/sql.bicep new file mode 100644 index 0000000..af14c46 --- /dev/null +++ b/tests/environment/sources/sql.bicep @@ -0,0 +1,22 @@ +@description('The name of the SQL logical server.') +param serverName string = uniqueString('sql', resourceGroup().id) + +@description('The name of the SQL Database.') +param sqlDBName string = 'SampleDB' + +@description('Location for all resources.') +param location string = 
resourceGroup().location + +resource sqlServer 'Microsoft.Sql/servers@2022-05-01-preview' existing = { + name: serverName +} + +resource sqlDB 'Microsoft.Sql/servers/databases@2022-05-01-preview' = { + parent: sqlServer + name: sqlDBName + location: location + sku: { + name: 'Basic' + tier: 'Basic' + } +} diff --git a/tests/environment/sources/sqlserver.bicep b/tests/environment/sources/sqlserver.bicep new file mode 100644 index 0000000..6d503a1 --- /dev/null +++ b/tests/environment/sources/sqlserver.bicep @@ -0,0 +1,21 @@ +@description('The name of the SQL logical server.') +param serverName string = uniqueString('sql', resourceGroup().id) + +@description('Location for all resources.') +param location string = resourceGroup().location + +@description('The administrator username of the SQL logical server.') +param administratorLogin string + +@description('The administrator password of the SQL logical server.') +@secure() +param administratorLoginPassword string + +resource sqlServer 'Microsoft.Sql/servers@2022-05-01-preview' = { + name: serverName + location: location + properties: { + administratorLogin: administratorLogin + administratorLoginPassword: administratorLoginPassword + } +} diff --git a/tests/environment/sources/synapse.bicep b/tests/environment/sources/synapse.bicep new file mode 100644 index 0000000..916ac65 --- /dev/null +++ b/tests/environment/sources/synapse.bicep @@ -0,0 +1,75 @@ +@description('The Synapse Workspace name.') +param workspaceName string = uniqueString('synwksp', resourceGroup().id) + +@description('Location for all resources.') +param location string = resourceGroup().location + +@description('The administrator username of the SQL logical server.') +@secure() +param administratorLogin string + +@description('The administrator password of the SQL logical server.') +@secure() +param administratorLoginPassword string + +var supportingStorageName = '${workspaceName}sa' + +resource storageAccount 'Microsoft.Storage/storageAccounts@2021-08-01' = { + name: supportingStorageName + location: location + sku: { + name: 'Standard_LRS' + } + kind: 'StorageV2' + properties:{ + isHnsEnabled: true + } + +} + +resource rawdataContainer 'Microsoft.Storage/storageAccounts/blobServices/containers@2021-08-01' = { + name: '${storageAccount.name}/default/defaultcontainer' +} + +resource tempContainer 'Microsoft.Storage/storageAccounts/blobServices/containers@2021-08-01' = { + name: '${storageAccount.name}/default/temp' +} + +resource synapseWorkspace 'Microsoft.Synapse/workspaces@2021-06-01' = { + name: workspaceName + location: location + identity: { + type: 'SystemAssigned' + } + properties: { + azureADOnlyAuthentication: false + defaultDataLakeStorage: { + accountUrl: 'https://${storageAccount.name}.dfs.core.windows.net' + createManagedPrivateEndpoint: false + filesystem: 'synapsefs' + resourceId: resourceId('Microsoft.Storage/storageAccounts/', storageAccount.name) + } + managedResourceGroupName: '${workspaceName}rg' + + publicNetworkAccess: 'Enabled' + sqlAdministratorLogin: administratorLogin + sqlAdministratorLoginPassword: administratorLoginPassword + trustedServiceBypassEnabled: true + } +} + +resource symbolicname 'Microsoft.Synapse/workspaces/sqlPools@2021-06-01' = { + name: 'sqlpool1' + location: location + sku: { + name: 'DW100c' + capacity: 0 + } + parent: synapseWorkspace + properties: { + collation: 'SQL_Latin1_General_CP1_CI_AS' + createMode: 'Default' + + storageAccountType: 'LRS' + } +} diff --git a/tests/integration/README.md b/tests/integration/README.md 
index a44386b..a0704d9 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -20,6 +20,7 @@ A related `*-expectations.json` file should exist for every test definition. It |call-via-adf-spark2.scala|ABFS|ABFS|✅||Called via Azure Data Factory| |call-via-adf-spark3.scala|ABFS|ABFS||✅|Called via Azure Data Factory| |delta-in-delta-merge.scala|DELTA|DELTA|❌|❌|Uses a Merge Statement| +|delta-in-delta-merge-package.py|DELTA|DELTA|❌|❌|Uses a Merge Statement| |delta-in-delta-out-abfss.scala|DELTA|DELTA||✅|| |delta-in-delta-out-fs.scala|DELTA|DELTA||✅|| |delta-in-delta-out-mnt.scala|DELTA|DELTA||✅|Uses a Mount Point| diff --git a/tests/integration/jobdefs-inactive/spark2-tests-def.json b/tests/integration/jobdefs-inactive/spark2-tests-def.json new file mode 100644 index 0000000..e64037e --- /dev/null +++ b/tests/integration/jobdefs-inactive/spark2-tests-def.json @@ -0,0 +1,63 @@ +{ + "name": "test-examples-spark-2", + "email_notifications": { + "no_alert_for_skipped_runs": false + }, + "timeout_seconds": 0, + "max_concurrent_runs": 2, + "tasks": [ + { + "task_key": "spark2-abfss-in-abfss-out", + "notebook_task": { + "notebook_path": "/Shared/examples/abfss-in-abfss-out" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {}, + "description": "" + }, + { + "task_key": "spark2-abfss-oauth", + "depends_on": [ + { + "task_key": "spark2-abfss-in-abfss-out" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/abfss-in-abfss-out-oauth" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "spark2-mnt", + "depends_on": [ + { + "task_key": "spark2-abfss-oauth" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/mnt-in-mnt-out" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "spark2-Synapse-wasbs-synapse", + "depends_on": [ + { + "task_key": "spark2-mnt" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/synapse-wasbs-in-synapse-out" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + } + ], + "format": "MULTI_TASK" +} \ No newline at end of file diff --git a/tests/integration/jobdefs/spark2-tests-expectations.json b/tests/integration/jobdefs-inactive/spark2-tests-expectations.json similarity index 79% rename from tests/integration/jobdefs/spark2-tests-expectations.json rename to tests/integration/jobdefs-inactive/spark2-tests-expectations.json index 2e6cefc..3b31266 100644 --- a/tests/integration/jobdefs/spark2-tests-expectations.json +++ b/tests/integration/jobdefs-inactive/spark2-tests-expectations.json @@ -1,13 +1,13 @@ [ "databricks://.azuredatabricks.net/jobs/", "databricks://.azuredatabricks.net/jobs//tasks/spark2-abfss-in-abfss-out", - "databricks://.azuredatabricks.net/jobs//tasks/spark2-abfss-in-abfss-out/processes/2796E46D0CCD18971A9C936C1EB97B1E->34BBA1402F1BAE560BFEA804B83FED62", + "databricks://.azuredatabricks.net/jobs//tasks/spark2-abfss-in-abfss-out/processes/58C1F24BA6C6FF7592F786C9FA8A3451->BA6B11F82FDCE37E849D25D545E6FB7A", "databricks://.azuredatabricks.net/notebooks/Shared/examples/abfss-in-abfss-out", "databricks://.azuredatabricks.net/jobs//tasks/spark2-abfss-oauth", - "databricks://.azuredatabricks.net/jobs//tasks/spark2-abfss-oauth/processes/56EE0B098A9A3D07DC11F4C6EA9BF71C->E6B1D99B74724B48DAB2BCB79142CB65", + 
"databricks://.azuredatabricks.net/jobs//tasks/spark2-abfss-oauth/processes/BD4A7A895E605BF6C4DE003D3F6B3F39->A3B52DA733083E4642E1C3DB6B093E84", "databricks://.azuredatabricks.net/notebooks/Shared/examples/abfss-in-abfss-out-oauth", "databricks://.azuredatabricks.net/jobs//tasks/spark2-mnt", - "databricks://.azuredatabricks.net/jobs//tasks/spark2-mnt/processes/EAEEF594372A61E0E1B545C0B430E966->ADFAB39F64A04DBD087DC73F8DF4EA47", + "databricks://.azuredatabricks.net/jobs//tasks/spark2-mnt/processes/336D6FD3010382DAB8351BFF026B2CBE->C60C4BAB82567905C64B99E2DCBCA711", "databricks://.azuredatabricks.net/notebooks/Shared/examples/mnt-in-mnt-out", "databricks://.azuredatabricks.net/jobs//tasks/spark2-Synapse-wasbs-synapse", "databricks://.azuredatabricks.net/jobs//tasks/spark2-Synapse-wasbs-synapse/processes/B596CF432EE21C0349CD0770BC839867->F1AD7C08349CD0A30B47392F787D6364", diff --git a/tests/integration/jobdefs-inactive/sparksubmit-test-def.json b/tests/integration/jobdefs-inactive/sparksubmit-test-def.json index 8400a9e..508c609 100644 --- a/tests/integration/jobdefs-inactive/sparksubmit-test-def.json +++ b/tests/integration/jobdefs-inactive/sparksubmit-test-def.json @@ -19,7 +19,7 @@ "cluster_name": "", "spark_version": "9.1.x-scala2.12", "spark_conf": { - "spark.openlineage.url.param.code": "{{secrets/purview-to-adb-scope/Ol-Output-Api-Key}}", + "spark.openlineage.url.param.code": "{{secrets/purview-to-adb-kv/Ol-Output-Api-Key}}", "spark.openlineage.host": "https://.azurewebsites.net", "spark.openlineage.namespace": "#ABC123", "spark.openlineage.version": "1" diff --git a/tests/integration/jobdefs/hive3-tests-def.json b/tests/integration/jobdefs/hive3-tests-def.json index 25aea81..8fb9996 100644 --- a/tests/integration/jobdefs/hive3-tests-def.json +++ b/tests/integration/jobdefs/hive3-tests-def.json @@ -1,98 +1,96 @@ { - "settings": { - "name": "hive3-tests", - "email_notifications": { - "no_alert_for_skipped_runs": false + "name": "hive3-tests", + "email_notifications": { + "no_alert_for_skipped_runs": false + }, + "timeout_seconds": 0, + "max_concurrent_runs": 1, + "tasks": [ + { + "task_key": "hive-in-hive-out-insert", + "notebook_task": { + "notebook_path": "/Shared/examples/hive-in-hive-out-insert", + "source": "WORKSPACE" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} }, - "timeout_seconds": 0, - "max_concurrent_runs": 1, - "tasks": [ - { - "task_key": "hive-in-hive-out-insert", - "notebook_task": { - "notebook_path": "/Shared/examples/hive-in-hive-out-insert", - "source": "WORKSPACE" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} + { + "task_key": "hive_abfss-in-hive_abfss-out-insert", + "depends_on": [ + { + "task_key": "hive-in-hive-out-insert" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/hive+abfss-in-hive+abfss-out-insert", + "source": "WORKSPACE" }, - { - "task_key": "hive_abfss-in-hive_abfss-out-insert", - "depends_on": [ - { - "task_key": "hive-in-hive-out-insert" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/hive+abfss-in-hive+abfss-out-insert", - "source": "WORKSPACE" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "hive_mgd_not_default-in-hive_mgd_not_default-out-insert", + "depends_on": [ + { + "task_key": "hive_abfss-in-hive_abfss-out-insert" + } + ], + "notebook_task": { + "notebook_path": 
"/Shared/examples/hive+mgd+not+default-in-hive+mgd+not+default-out-insert", + "source": "WORKSPACE" }, - { - "task_key": "hive_mgd_not_default-in-hive_mgd_not_default-out-insert", - "depends_on": [ - { - "task_key": "hive_abfss-in-hive_abfss-out-insert" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/hive+mgd+not+default-in-hive+mgd+not+default-out-insert", - "source": "WORKSPACE" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "hive_mnt-in-hive_mnt-out-insert", + "depends_on": [ + { + "task_key": "hive_mgd_not_default-in-hive_mgd_not_default-out-insert" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/hive+mnt-in-hive+mnt-out-insert", + "source": "WORKSPACE" }, - { - "task_key": "hive_mnt-in-hive_mnt-out-insert", - "depends_on": [ - { - "task_key": "hive_mgd_not_default-in-hive_mgd_not_default-out-insert" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/hive+mnt-in-hive+mnt-out-insert", - "source": "WORKSPACE" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "abfss-in-hive_notmgd_saveAsTable-out", + "depends_on": [ + { + "task_key": "hive_mnt-in-hive_mnt-out-insert" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/abfss-in-hive+notmgd+saveAsTable-out", + "source": "WORKSPACE" }, - { - "task_key": "abfss-in-hive_notmgd_saveAsTable-out", - "depends_on": [ - { - "task_key": "hive_mnt-in-hive_mnt-out-insert" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/abfss-in-hive+notmgd+saveAsTable-out", - "source": "WORKSPACE" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "abfss-in-hive_saveAsTable-out", + "depends_on": [ + { + "task_key": "abfss-in-hive_notmgd_saveAsTable-out" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/abfss-in-hive+saveAsTable-out", + "source": "WORKSPACE" }, - { - "task_key": "abfss-in-hive_saveAsTable-out", - "depends_on": [ - { - "task_key": "abfss-in-hive_notmgd_saveAsTable-out" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/abfss-in-hive+saveAsTable-out", - "source": "WORKSPACE" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - } - ], - "format": "MULTI_TASK" - } + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + } + ], + "format": "MULTI_TASK" } \ No newline at end of file diff --git a/tests/integration/jobdefs/hive3-tests-expectations.json b/tests/integration/jobdefs/hive3-tests-expectations.json index bcd13dd..2d894ef 100644 --- a/tests/integration/jobdefs/hive3-tests-expectations.json +++ b/tests/integration/jobdefs/hive3-tests-expectations.json @@ -1,21 +1,21 @@ [ "databricks://.azuredatabricks.net/jobs/", "databricks://.azuredatabricks.net/jobs//tasks/hive-in-hive-out-insert", - "databricks://.azuredatabricks.net/jobs//tasks/hive-in-hive-out-insert/processes/9EA618584B76AF154FF5885F070A753F->8846F7679DA958CC91AEB2B6311C97D2", + "databricks://.azuredatabricks.net/jobs//tasks/hive-in-hive-out-insert/processes/2CE3088B4BAADD102F97D92B97F3AB79->E14B63BA5130659288E6B5DB7FC7F232", 
"databricks://.azuredatabricks.net/notebooks/Shared/examples/hive-in-hive-out-insert", "databricks://.azuredatabricks.net/jobs//tasks/abfss-in-hive_notmgd_saveAsTable-out", - "databricks://.azuredatabricks.net/jobs//tasks/abfss-in-hive_notmgd_saveAsTable-out/processes/DADD88BC04CD758A0D2EB06CE6F86431->C4D873C9A86F827AB135E541A4952BCD", + "databricks://.azuredatabricks.net/jobs//tasks/abfss-in-hive_notmgd_saveAsTable-out/processes/575BF7CF92625D35D6B9309C9561FE0A->43E1EB2B6E2B692F3AFDDDBD63762F41", "databricks://.azuredatabricks.net/notebooks/Shared/examples/abfss-in-hive+notmgd+saveAsTable-out", "databricks://.azuredatabricks.net/jobs//tasks/hive_abfss-in-hive_abfss-out-insert", - "databricks://.azuredatabricks.net/jobs//tasks/hive_abfss-in-hive_abfss-out-insert/processes/B5EA0788D2DFDD6724C9638A23C72530->C45E275909E82D362F516CB3DF62F01E", + "databricks://.azuredatabricks.net/jobs//tasks/hive_abfss-in-hive_abfss-out-insert/processes/0366CD2735F426A339DB69EBB00A6ABC->95F7EE6DC3AB03275F8FE27E98838D54", "databricks://.azuredatabricks.net/notebooks/Shared/examples/hive+abfss-in-hive+abfss-out-insert", "databricks://.azuredatabricks.net/jobs//tasks/hive_mgd_not_default-in-hive_mgd_not_default-out-insert", - "databricks://.azuredatabricks.net/jobs//tasks/hive_mgd_not_default-in-hive_mgd_not_default-out-insert/processes/ABC9D8E9383FFB2295BA21732E71BDE5->819BA9557FE05FAFFCD1E6C8C8B12239", + "databricks://.azuredatabricks.net/jobs//tasks/hive_mgd_not_default-in-hive_mgd_not_default-out-insert/processes/13AA3B6322616FF3E554C6A109EBAB5C->6FCA021CCAD4C906D5C29512215F86C9", "databricks://.azuredatabricks.net/notebooks/Shared/examples/hive+mgd+not+default-in-hive+mgd+not+default-out-insert", "databricks://.azuredatabricks.net/jobs//tasks/hive_mnt-in-hive_mnt-out-insert", - "databricks://.azuredatabricks.net/jobs//tasks/hive_mnt-in-hive_mnt-out-insert/processes/B5EA0788D2DFDD6724C9638A23C72530->C45E275909E82D362F516CB3DF62F01E", + "databricks://.azuredatabricks.net/jobs//tasks/hive_mnt-in-hive_mnt-out-insert/processes/0366CD2735F426A339DB69EBB00A6ABC->95F7EE6DC3AB03275F8FE27E98838D54", "databricks://.azuredatabricks.net/notebooks/Shared/examples/hive+mnt-in-hive+mnt-out-insert", "databricks://.azuredatabricks.net/jobs//tasks/abfss-in-hive_saveAsTable-out", - "databricks://.azuredatabricks.net/jobs//tasks/abfss-in-hive_saveAsTable-out/processes/B97ED17F23A32D631D1A53C1AE3A009A->7799B858F5B94A237932CDF9F987F8E0", + "databricks://.azuredatabricks.net/jobs//tasks/abfss-in-hive_saveAsTable-out/processes/D691CD0248B7A179C249AE6DA86A9A69->1073C801CC5F362F10F1CD1FFBA1972C", "databricks://.azuredatabricks.net/notebooks/Shared/examples/abfss-in-hive+saveAsTable-out" ] diff --git a/tests/integration/jobdefs/jarjob-test-def.json b/tests/integration/jobdefs/jarjob-test-def.json index 25dca17..4e66489 100644 --- a/tests/integration/jobdefs/jarjob-test-def.json +++ b/tests/integration/jobdefs/jarjob-test-def.json @@ -1,27 +1,25 @@ { - "settings": { - "name": "JarJob", - "email_notifications": { - "no_alert_for_skipped_runs": false - }, - "max_concurrent_runs": 1, - "tasks": [ - { - "task_key": "JarJob", - "spark_jar_task": { - "jar_uri": "", - "main_class_name": "SparkApp.Basic.App", - "run_as_repl": true - }, - "existing_cluster_id": "", - "libraries": [ - { - "jar": "dbfs:/FileStore/testcases/app.jar" - } - ], - "timeout_seconds": 0 - } - ], - "format": "MULTI_TASK" - } + "name": "JarJob", + "email_notifications": { + "no_alert_for_skipped_runs": false + }, + "max_concurrent_runs": 1, + "tasks": [ + { + "task_key": 
"JarJob", + "spark_jar_task": { + "jar_uri": "", + "main_class_name": "SparkApp.Basic.App", + "run_as_repl": true + }, + "existing_cluster_id": "", + "libraries": [ + { + "jar": "dbfs:/FileStore/testcases/app.jar" + } + ], + "timeout_seconds": 0 + } + ], + "format": "MULTI_TASK" } \ No newline at end of file diff --git a/tests/integration/jobdefs/jarjob-test-expectations.json b/tests/integration/jobdefs/jarjob-test-expectations.json index 304192f..06b31d8 100644 --- a/tests/integration/jobdefs/jarjob-test-expectations.json +++ b/tests/integration/jobdefs/jarjob-test-expectations.json @@ -1,5 +1,5 @@ [ "databricks://.azuredatabricks.net/jobs/", "databricks://.azuredatabricks.net/jobs//tasks/JarJob", - "databricks://.azuredatabricks.net/jobs//tasks/JarJob/processes/B4CFB465D62A3D282313EF88E9E4779C->2B1635731ED472A95FC7A53B61F02674" + "databricks://.azuredatabricks.net/jobs//tasks/JarJob/processes/CA1C8F378EABC4EF08062103C5D51CBE->560CF14B3818EF6B8FF5D0BC6AF7BCE9" ] diff --git a/tests/integration/jobdefs/pythonscript-test-def.json b/tests/integration/jobdefs/pythonscript-test-def.json index e9c1282..2358501 100644 --- a/tests/integration/jobdefs/pythonscript-test-def.json +++ b/tests/integration/jobdefs/pythonscript-test-def.json @@ -1,42 +1,40 @@ { - "settings": { - "name": "PythonScriptJob", - "email_notifications": {}, - "max_concurrent_runs": 1, - "tasks": [ - { - "task_key": "PythonScriptJob", - "spark_python_task": { - "python_file": "dbfs:/FileStore/testcases/pythonscript.py" + "name": "PythonScriptJob", + "email_notifications": {}, + "max_concurrent_runs": 1, + "tasks": [ + { + "task_key": "PythonScriptJob", + "spark_python_task": { + "python_file": "dbfs:/FileStore/testcases/pythonscript.py" + }, + "new_cluster": { + "spark_version": "9.1.x-scala2.12", + "spark_conf": { + "spark.openlineage.url.param.code": "{{secrets/purview-to-adb-kv/Ol-Output-Api-Key}}", + "spark.openlineage.host": "https://.azurewebsites.net", + "spark.openlineage.namespace": "#ABC123", + "spark.openlineage.version": "v1" }, - "new_cluster": { - "spark_version": "9.1.x-scala2.12", - "spark_conf": { - "spark.openlineage.url.param.code": "{{secrets/purview-to-adb-scope/Ol-Output-Api-Key}}", - "spark.openlineage.host": "https://.azurewebsites.net", - "spark.openlineage.namespace": "#ABC123", - "spark.openlineage.version": "1" - }, - "node_type_id": "Standard_DS3_v2", - "enable_elastic_disk": true, - "init_scripts": [ - { - "dbfs": { - "destination": "dbfs:/databricks/openlineagehardcoded/release-candidate.sh" - } + "node_type_id": "Standard_DS3_v2", + "enable_elastic_disk": true, + "init_scripts": [ + { + "dbfs": { + "destination": "dbfs:/databricks/openlineage/open-lineage-init-script.sh" } - ], - "azure_attributes": { - "availability": "ON_DEMAND_AZURE" - }, - "num_workers": 1 + } + ], + "azure_attributes": { + "availability": "ON_DEMAND_AZURE" }, - "max_retries": 1, - "min_retry_interval_millis": 0, - "retry_on_timeout": false, - "timeout_seconds": 3600 - } - ], - "format": "MULTI_TASK" - } + "num_workers": 1 + }, + "max_retries": 1, + "min_retry_interval_millis": 0, + "retry_on_timeout": false, + "timeout_seconds": 3600 + } + ], + "format": "MULTI_TASK" } \ No newline at end of file diff --git a/tests/integration/jobdefs/pythonscript-test-expectations.json b/tests/integration/jobdefs/pythonscript-test-expectations.json index 4fa1f04..077a08c 100644 --- a/tests/integration/jobdefs/pythonscript-test-expectations.json +++ b/tests/integration/jobdefs/pythonscript-test-expectations.json @@ -1,5 +1,5 @@ [ 
"databricks://.azuredatabricks.net/jobs/", "databricks://.azuredatabricks.net/jobs//tasks/PythonScriptJob", - "databricks://.azuredatabricks.net/jobs//tasks/PythonScriptJob/processes/EAEFBD6BB0CA1156256F42C7E3234487->FC65543BD0CEE9FB45BDD86AF033D876" + "databricks://.azuredatabricks.net/jobs//tasks/PythonScriptJob/processes/16D109EA9E8BC7329A7365311F917C1F->C862A921EE653ED2F3101026739FB936" ] \ No newline at end of file diff --git a/tests/integration/jobdefs/pythonwheel-test-def.json b/tests/integration/jobdefs/pythonwheel-test-def.json index adbd354..96196bc 100644 --- a/tests/integration/jobdefs/pythonwheel-test-def.json +++ b/tests/integration/jobdefs/pythonwheel-test-def.json @@ -1,26 +1,24 @@ { - "settings": { - "name": "WheelJob", - "email_notifications": { - "no_alert_for_skipped_runs": false - }, - "max_concurrent_runs": 1, - "tasks": [ - { - "task_key": "WheelJob", - "python_wheel_task": { - "package_name": "abfssintest", - "entry_point": "runapp" - }, - "existing_cluster_id": "", - "libraries": [ - { - "whl": "dbfs:/wheels/abfssintest-0.0.3-py3-none-any.whl" - } - ], - "timeout_seconds": 0 - } - ], - "format": "MULTI_TASK" - } + "name": "WheelJob", + "email_notifications": { + "no_alert_for_skipped_runs": false + }, + "max_concurrent_runs": 1, + "tasks": [ + { + "task_key": "WheelJob", + "python_wheel_task": { + "package_name": "abfssintest", + "entry_point": "runapp" + }, + "existing_cluster_id": "", + "libraries": [ + { + "whl": "dbfs:/FileStore/testcases/abfssintest-0.0.3-py3-none-any.whl" + } + ], + "timeout_seconds": 0 + } + ], + "format": "MULTI_TASK" } \ No newline at end of file diff --git a/tests/integration/jobdefs/pythonwheel-test-expectations.json b/tests/integration/jobdefs/pythonwheel-test-expectations.json index 922d6de..12ba684 100644 --- a/tests/integration/jobdefs/pythonwheel-test-expectations.json +++ b/tests/integration/jobdefs/pythonwheel-test-expectations.json @@ -1,5 +1,5 @@ [ "databricks://.azuredatabricks.net/jobs/", "databricks://.azuredatabricks.net/jobs//tasks/WheelJob", - "databricks://.azuredatabricks.net/jobs//tasks/WheelJob/processes/D18BCD0504F8604104FE4D0E7C821E13->50430216FAFCCDD3BFD497A1FA0C14D0" + "databricks://.azuredatabricks.net/jobs//tasks/WheelJob/processes/6438ED307BBA90F1285E1229E67E020B->5560AE0F6CE4403CC559ECF1821CCE47" ] \ No newline at end of file diff --git a/tests/integration/jobdefs/spark2-tests-def.json b/tests/integration/jobdefs/spark2-tests-def.json deleted file mode 100644 index b88b9be..0000000 --- a/tests/integration/jobdefs/spark2-tests-def.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - "settings": { - "name": "test-examples-spark-2", - "email_notifications": { - "no_alert_for_skipped_runs": false - }, - "timeout_seconds": 0, - "max_concurrent_runs": 2, - "tasks": [ - { - "task_key": "spark2-abfss-in-abfss-out", - "notebook_task": { - "notebook_path": "/Shared/examples/abfss-in-abfss-out" - }, - "existing_cluster_id": "0505-211804-c5x0jm8p", - "timeout_seconds": 0, - "email_notifications": {}, - "description": "" - }, - { - "task_key": "spark2-abfss-oauth", - "depends_on": [ - { - "task_key": "spark2-abfss-in-abfss-out" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/abfss-in-abfss-out-oauth" - }, - "existing_cluster_id": "0505-211804-c5x0jm8p", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "spark2-mnt", - "depends_on": [ - { - "task_key": "spark2-abfss-oauth" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/mnt-in-mnt-out" - }, - "existing_cluster_id": 
"0505-211804-c5x0jm8p", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "spark2-Synapse-wasbs-synapse", - "depends_on": [ - { - "task_key": "spark2-mnt" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/synapse-wasbs-in-synapse-out" - }, - "existing_cluster_id": "0505-211804-c5x0jm8p", - "timeout_seconds": 0, - "email_notifications": {} - } - ], - "format": "MULTI_TASK" - } -} \ No newline at end of file diff --git a/tests/integration/jobdefs/spark3-tests-def.json b/tests/integration/jobdefs/spark3-tests-def.json index e58b1c1..a1d51fc 100644 --- a/tests/integration/jobdefs/spark3-tests-def.json +++ b/tests/integration/jobdefs/spark3-tests-def.json @@ -1,205 +1,293 @@ { - "settings": { - "name": "test-examples-spark-3", - "email_notifications": { - "no_alert_for_skipped_runs": false - }, - "timeout_seconds": 0, - "max_concurrent_runs": 1, - "tasks": [ - { - "task_key": "abfss-in-abfss-out", - "notebook_task": { - "notebook_path": "/Shared/examples/abfss-in-abfss-out" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {}, - "description": "" - }, - { - "task_key": "abfss-oauth", - "depends_on": [ - { - "task_key": "abfss-in-abfss-out" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/abfss-in-abfss-out-oauth" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "azuresql-in-out", - "depends_on": [ - { - "task_key": "ab-in-ab-out-root" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/azuresql-in-azuresql-out" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "delta-abfss", - "depends_on": [ - { - "task_key": "azuresql-in-out" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/delta-in-delta-out-abfss" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "delta-fs", - "depends_on": [ - { - "task_key": "delta-abfss" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/delta-in-delta-out-fs" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "delta-mnt", - "depends_on": [ - { - "task_key": "delta-fs" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/delta-in-delta-out-mnt" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "mnt", - "depends_on": [ - { - "task_key": "intermix-languages" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/mnt-in-mnt-out" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "synapse-in-wasbs-out", - "depends_on": [ - { - "task_key": "nested-parent" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/synapse-in-wasbs-out" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "Syn-in-WB-in-Syn-Out", - "depends_on": [ - { - "task_key": "synapse-in-wasbs-out" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/synapse-wasbs-in-synapse-out" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "wasbs-in-wasbs-out", - "depends_on": [ - { - "task_key": "Syn-in-WB-in-Syn-Out" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/wasbs-in-wasbs-out" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - 
"email_notifications": {} - }, - { - "task_key": "ab-in-ab-out-root", - "depends_on": [ - { - "task_key": "abfss-oauth" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/abfss-in-abfss-out-root" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "nested-parent", - "depends_on": [ - { - "task_key": "mnt" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/nested-parent" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "intermix-languages", - "depends_on": [ - { - "task_key": "delta-mnt" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/intermix-languages" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - }, - { - "task_key": "output-with-period", - "depends_on": [ - { - "task_key": "nested-parent" - } - ], - "notebook_task": { - "notebook_path": "/Shared/examples/name-with-periods" - }, - "existing_cluster_id": "", - "timeout_seconds": 0, - "email_notifications": {} - } - ], - "format": "MULTI_TASK" - } + "name": "test-examples-spark-3", + "email_notifications": { + "no_alert_for_skipped_runs": false + }, + "timeout_seconds": 0, + "max_concurrent_runs": 1, + "tasks": [ + { + "task_key": "abfss-in-abfss-out", + "notebook_task": { + "notebook_path": "/Shared/examples/abfss-in-abfss-out" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {}, + "description": "" + }, + { + "task_key": "abfss-oauth", + "depends_on": [ + { + "task_key": "abfss-in-abfss-out" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/abfss-in-abfss-out-oauth" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "azuresql-in-out", + "depends_on": [ + { + "task_key": "ab-in-ab-out-root" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/azuresql-in-azuresql-out" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "delta-abfss", + "depends_on": [ + { + "task_key": "azuresql-in-out" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/delta-in-delta-out-abfss" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "delta-fs", + "depends_on": [ + { + "task_key": "delta-abfss" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/delta-in-delta-out-fs" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "delta-mnt", + "depends_on": [ + { + "task_key": "delta-fs" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/delta-in-delta-out-mnt" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "mnt", + "depends_on": [ + { + "task_key": "intermix-languages" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/mnt-in-mnt-out" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "synapse-in-wasbs-out", + "depends_on": [ + { + "task_key": "nested-parent" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/synapse-in-wasbs-out" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "Syn-in-WB-in-Syn-Out", + "depends_on": [ + { + "task_key": "synapse-in-wasbs-out" + } + ], + "notebook_task": { + "notebook_path": 
"/Shared/examples/synapse-wasbs-in-synapse-out" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "wasbs-in-wasbs-out", + "depends_on": [ + { + "task_key": "Syn-in-WB-in-Syn-Out" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/wasbs-in-wasbs-out" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "ab-in-ab-out-root", + "depends_on": [ + { + "task_key": "abfss-oauth" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/abfss-in-abfss-out-root" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "nested-parent", + "depends_on": [ + { + "task_key": "mnt" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/nested-parent" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "intermix-languages", + "depends_on": [ + { + "task_key": "delta-mnt" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/intermix-languages" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "output-with-period", + "depends_on": [ + { + "task_key": "nested-parent" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/name-with-periods" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "wasbs-in-kusto-out", + "depends_on": [ + { + "task_key": "output-with-period" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/wasbs-in-kusto-out", + "source": "WORKSPACE" + }, + "existing_cluster_id": "0104-045638-iaecf5ne", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "kusto-in-wasbs-out", + "depends_on": [ + { + "task_key": "wasbs-in-kusto-out" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/kusto-in-wasbs-out", + "source": "WORKSPACE" + }, + "existing_cluster_id": "0104-045638-iaecf5ne", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "postgres-in-postgres-out", + "depends_on": [ + { + "task_key": "kusto-in-wasbs-out" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/postgres-in-postgres-out", + "source": "WORKSPACE" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "mysql-in-mysql-out", + "depends_on": [ + { + "task_key": "postgres-in-postgres-out" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/mysql-in-mysql-out", + "source": "WORKSPACE" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "delta-merge-task", + "depends_on": [ + { + "task_key": "mysql-in-mysql-out" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/delta-in-delta-merge", + "source": "WORKSPACE" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + }, + { + "task_key": "delta-merge-pkg-task", + "depends_on": [ + { + "task_key": "delta-merge-task" + } + ], + "notebook_task": { + "notebook_path": "/Shared/examples/delta-in-delta-merge-package", + "source": "WORKSPACE" + }, + "existing_cluster_id": "", + "timeout_seconds": 0, + "email_notifications": {} + } + ], + "format": "MULTI_TASK" } \ No newline at end of file diff --git a/tests/integration/jobdefs/spark3-tests-expectations.json b/tests/integration/jobdefs/spark3-tests-expectations.json index 2dca462..15960bb 
100644 --- a/tests/integration/jobdefs/spark3-tests-expectations.json +++ b/tests/integration/jobdefs/spark3-tests-expectations.json @@ -2,31 +2,31 @@ "databricks://.azuredatabricks.net/jobs/", "databricks://.azuredatabricks.net/jobs//tasks/Syn-in-WB-in-Syn-Out", "databricks://.azuredatabricks.net/jobs//tasks/ab-in-ab-out-root", - "databricks://.azuredatabricks.net/jobs//tasks/ab-in-ab-out-root/processes/E74B887E65E059D38DAB51D31A5432D8->F6D2E2554E30B80D0E6901908AADEDBB", + "databricks://.azuredatabricks.net/jobs//tasks/ab-in-ab-out-root/processes/0AFD1EDAC25ECE8253F387E74F28629A->1BB1B95C33EDB21B2D3903B9A8103087", "databricks://.azuredatabricks.net/jobs//tasks/abfss-in-abfss-out", - "databricks://.azuredatabricks.net/jobs//tasks/abfss-in-abfss-out/processes/2796E46D0CCD18971A9C936C1EB97B1E->34BBA1402F1BAE560BFEA804B83FED62", + "databricks://.azuredatabricks.net/jobs//tasks/abfss-in-abfss-out/processes/58C1F24BA6C6FF7592F786C9FA8A3451->BA6B11F82FDCE37E849D25D545E6FB7A", "databricks://.azuredatabricks.net/jobs//tasks/abfss-oauth", - "databricks://.azuredatabricks.net/jobs//tasks/abfss-oauth/processes/56EE0B098A9A3D07DC11F4C6EA9BF71C->E6B1D99B74724B48DAB2BCB79142CB65", + "databricks://.azuredatabricks.net/jobs//tasks/abfss-oauth/processes/BD4A7A895E605BF6C4DE003D3F6B3F39->A3B52DA733083E4642E1C3DB6B093E84", "databricks://.azuredatabricks.net/jobs//tasks/azuresql-in-out", - "databricks://.azuredatabricks.net/jobs//tasks/azuresql-in-out/processes/03CC0799BCA86B4A823AD9B6C9A772A1->1A1EF10BC89D10CA52B3559833DAC1F3", + "databricks://.azuredatabricks.net/jobs//tasks/azuresql-in-out/processes/B95334DF8F53EB63EDBA24AF88CFC7AA->80FC7C28AF3F669752CE8F2DA1987526", "databricks://.azuredatabricks.net/jobs//tasks/delta-abfss", - "databricks://.azuredatabricks.net/jobs//tasks/delta-abfss/processes/EEDA606783A7DD68C6A6C60221608209->0533EFDC2210DD1546DACD3291D14EE9", + "databricks://.azuredatabricks.net/jobs//tasks/delta-abfss/processes/CE0291670068E208B1A9621C1721730D->FD3D635F915390056518ECC38AB07DCC", "databricks://.azuredatabricks.net/jobs//tasks/delta-fs", - "databricks://.azuredatabricks.net/jobs//tasks/delta-fs/processes/C8A21C9CC03564B883DD6A2E4174F9AE->FC6DC149F25D86CE472C77290596DD9F", + "databricks://.azuredatabricks.net/jobs//tasks/delta-fs/processes/F0F4F25C04BAFB0FBFF90BE92709E7E4->9557C9A65FE7A9A7A89B6D9061C55B5A", "databricks://.azuredatabricks.net/jobs//tasks/delta-mnt", - "databricks://.azuredatabricks.net/jobs//tasks/delta-mnt/processes/F33E9424B73DFD1C3B8D0259EB772F87->E443883D4B66E0DBEE76C5331401E533", + "databricks://.azuredatabricks.net/jobs//tasks/delta-mnt/processes/A191E946F919A0717BB4FF2A79221996->3718CE24F8FCB01C633CF37CED45B3FC", "databricks://.azuredatabricks.net/jobs//tasks/intermix-languages", - "databricks://.azuredatabricks.net/jobs//tasks/intermix-languages/processes/837D6375622EA0C277BB0275C5B2E4BE->A950ACA0CBDF8EABD0C758E01B8893B3", + "databricks://.azuredatabricks.net/jobs//tasks/intermix-languages/processes/7D3D5D44FDC1DC865806712E633C5E56->B3CF5624F08EEEDF819869D074FA7774", "databricks://.azuredatabricks.net/jobs//tasks/mnt", - "databricks://.azuredatabricks.net/jobs//tasks/mnt/processes/EAEEF594372A61E0E1B545C0B430E966->ADFAB39F64A04DBD087DC73F8DF4EA47", - "databricks://.azuredatabricks.net/jobs//tasks/output-with-period", - "databricks://.1.azuredatabricks.net/jobs//tasks/output-with-period/processes/4AF18D6C70DDBCA092FC53396B2C908F->F0460570010BB248E2256F0F932A82B8", + 
"databricks://.azuredatabricks.net/jobs//tasks/mnt/processes/336D6FD3010382DAB8351BFF026B2CBE->C60C4BAB82567905C64B99E2DCBCA711", "databricks://.azuredatabricks.net/jobs//tasks/nested-parent", - "databricks://.azuredatabricks.net/jobs//tasks/nested-parent/processes/1611F7AEE100534D05476B8D8D8096A2->8F13AE1A6297C7B53E82AD0862A258C5", + "databricks://.azuredatabricks.net/jobs//tasks/nested-parent/processes/8514E8FCB25E967BC6DA61D1A48E2CD4->7C40325C08313ADDF8F653CACEAAA8C1", + "databricks://.azuredatabricks.net/jobs//tasks/output-with-period", + "databricks://.azuredatabricks.net/jobs//tasks/output-with-period/processes/8530DB90732944CA2C3C02E4FEE633E2->054707838715BECB4629ECF6B398BF1A", "databricks://.azuredatabricks.net/jobs//tasks/synapse-in-wasbs-out", - "databricks://.azuredatabricks.net/jobs//tasks/synapse-in-wasbs-out/processes/F8D4C5D21F4175BBF75031FA6F5C3C81->367B7F5964BD2C529BD8BF705A32802A", + "databricks://.azuredatabricks.net/jobs//tasks/synapse-in-wasbs-out/processes/FC4F9610428CB3C9FBCB97DF6D2B939D->76AAA8ADC61434F8BFB7C92E6ABF8C85", "databricks://.azuredatabricks.net/jobs//tasks/wasbs-in-wasbs-out", - "databricks://.azuredatabricks.net/jobs//tasks/wasbs-in-wasbs-out/processes/A85A7E9B093C1A818F2C4276C5A9A871->C903E6160BE06DD452AFE1AAD278162C", + "databricks://.azuredatabricks.net/jobs//tasks/wasbs-in-wasbs-out/processes/34DA3FD40AC2F55C125A86039355D6ED->4A56EEA94A2A249B6FA359EC03F43FF7", "databricks://.azuredatabricks.net/notebooks/Shared/examples/abfss-in-abfss-out", "databricks://.azuredatabricks.net/notebooks/Shared/examples/abfss-in-abfss-out-oauth", "databricks://.azuredatabricks.net/notebooks/Shared/examples/abfss-in-abfss-out-root", @@ -40,5 +40,23 @@ "databricks://.azuredatabricks.net/notebooks/Shared/examples/nested-parent", "databricks://.azuredatabricks.net/notebooks/Shared/examples/synapse-in-wasbs-out", "databricks://.azuredatabricks.net/notebooks/Shared/examples/synapse-wasbs-in-synapse-out", - "databricks://.azuredatabricks.net/notebooks/Shared/examples/wasbs-in-wasbs-out" + "databricks://.azuredatabricks.net/notebooks/Shared/examples/wasbs-in-wasbs-out", + "databricks://.azuredatabricks.net/notebooks/Shared/examples/mysql-in-mysql-out", + "databricks://.azuredatabricks.net/jobs//tasks/mysql-in-mysql-out/processes/1F6965315A6049825A37C4AD085BD605->A08160B244AF828E1FDB80AC8D14FA96", + "databricks://.azuredatabricks.net/jobs//tasks/mysql-in-mysql-out", + "databricks://.azuredatabricks.net/notebooks/Shared/examples/postgres-in-postgres-out", + "databricks://.azuredatabricks.net/jobs//tasks/postgres-in-postgres-out/processes/7E6CEF8EC093F119A11618169A8C4EAE->DB99105F739F05449E1ECD20A652DEE1", + "databricks://.azuredatabricks.net/jobs//tasks/postgres-in-postgres-out", + "databricks://.azuredatabricks.net/notebooks/Shared/examples/wasbs-in-kusto-out", + "databricks://.azuredatabricks.net/jobs//tasks/wasbs-in-kusto-out", + "databricks://.azuredatabricks.net/jobs//tasks/wasbs-in-kusto-out/processes/D3E8F45D1D4150D809A56423DD2F2CFD->9D42C0F01CD2D2C2F014263C2D812602", + "databricks://.azuredatabricks.net/notebooks/Shared/examples/kusto-in-wasbs-out", + "databricks://.azuredatabricks.net/jobs//tasks/kusto-in-wasbs-out", + "databricks://.azuredatabricks.net/jobs//tasks/kusto-in-wasbs-out/processes/9D42C0F01CD2D2C2F014263C2D812602->55517236A8804C9548A4CB81814AEA6B", + "databricks://.azuredatabricks.net/jobs//tasks/delta-merge-task", + "databricks://.azuredatabricks.net/jobs//tasks/delta-merge-task/processes/XXX->XXX", + 
"databricks://.azuredatabricks.net/notebooks/Shared/examples/delta-in-delta-merge", + "databricks://.azuredatabricks.net/jobs//tasks/delta-merge-pkg-task", + "databricks://.azuredatabricks.net/jobs//tasks/delta-merge-pkg-task/processes/XXX->XXX", + "databricks://.azuredatabricks.net/notebooks/Shared/examples/delta-in-delta-merge-package" ] \ No newline at end of file diff --git a/tests/integration/jobdefs/wasbs-in-wasbs-out-with-param-def.json b/tests/integration/jobdefs/wasbs-in-wasbs-out-with-param-def.json deleted file mode 100644 index e3c32df..0000000 --- a/tests/integration/jobdefs/wasbs-in-wasbs-out-with-param-def.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "settings": { - "name": "wasbs-in-wasbs-out-with-param", - "email_notifications": { - "no_alert_for_skipped_runs": false - }, - "timeout_seconds": 0, - "max_concurrent_runs": 1, - "tasks": [ - { - "task_key": "wasbs-in-wasbs-out-with-param", - "notebook_task": { - "notebook_path": "/Shared/examples/wasbs-in-wasbs-out-with-param", - "base_parameters": { - "myval": "10" - } - }, - "existing_cluster_id": "0326-140927-mc4qzaj5", - "timeout_seconds": 0, - "email_notifications": {} - } - ], - "format": "MULTI_TASK" - } -} \ No newline at end of file diff --git a/tests/integration/jobdefs/wasbs-in-wasbs-out-with-param-expectations.json b/tests/integration/jobdefs/wasbs-in-wasbs-out-with-param-expectations.json deleted file mode 100644 index a850eb6..0000000 --- a/tests/integration/jobdefs/wasbs-in-wasbs-out-with-param-expectations.json +++ /dev/null @@ -1,6 +0,0 @@ -[ - "databricks://.azuredatabricks.net/jobs//tasks/wasbs-in-wasbs-out-with-param", - "databricks://.azuredatabricks.net/notebooks/Shared/examples/wasbs-in-wasbs-out-with-param", - "databricks://.azuredatabricks.net/jobs//tasks/wasbs-in-wasbs-out-with-param/processes/9E51CA344228BD2C592091F34BCF81B8->D4051C5A34E3E2812E191B59E82CB1B1", - "databricks://.azuredatabricks.net/jobs//tasks/wasbs-in-wasbs-out-with-param/processes/D4051C5A34E3E2812E191B59E82CB1B1->8031DA4E838D99236E94A4CE72C951BC" -] \ No newline at end of file diff --git a/tests/integration/run-test.sh b/tests/integration/run-test.sh index e31bc8e..002f5cd 100644 --- a/tests/integration/run-test.sh +++ b/tests/integration/run-test.sh @@ -43,8 +43,8 @@ for fn in `ls ./tests/integration/jobdefs`; do continue fi - # For each file, get the settings.name - job_name=$(cat "$TESTS_DIRECTORY/$fn" | jq -r '.settings.name') + # For each file, get the .name + job_name=$(cat "$TESTS_DIRECTORY/$fn" | jq -r '.name') echo "Preparing to run JobDef:$fn JobName:$job_name JobId:${jobnametoid[$job_name]}" temp_job_id=${jobnametoid[$job_name]} # Get the expectation file diff --git a/tests/integration/spark-apps/jarjobs/abfssInAbfssOut/app/src/main/java/SparkApp/Basic/App.java b/tests/integration/spark-apps/jarjobs/abfssInAbfssOut/app/src/main/java/SparkApp/Basic/App.java index 87b4bec..68da198 100644 --- a/tests/integration/spark-apps/jarjobs/abfssInAbfssOut/app/src/main/java/SparkApp/Basic/App.java +++ b/tests/integration/spark-apps/jarjobs/abfssInAbfssOut/app/src/main/java/SparkApp/Basic/App.java @@ -32,7 +32,7 @@ public static void main(String[] args) { System.out.println(new App().getGreeting()); - String storageKey = dbutils.secrets().get("purview-to-adb-scope", "storage-service-key"); + String storageKey = dbutils.secrets().get("purview-to-adb-kv", "storage-service-key"); spark.conf().set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey); diff --git 
a/tests/integration/spark-apps/notebooks/abfss-in-abfss-out-oauth.scala b/tests/integration/spark-apps/notebooks/abfss-in-abfss-out-oauth.scala index 05702ab..59d3b7c 100644 --- a/tests/integration/spark-apps/notebooks/abfss-in-abfss-out-oauth.scala +++ b/tests/integration/spark-apps/notebooks/abfss-in-abfss-out-oauth.scala @@ -11,9 +11,9 @@ val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs spark.conf.set("fs.azure.account.auth.type."+storageServiceName+".dfs.core.windows.net", "OAuth") spark.conf.set("fs.azure.account.oauth.provider.type."+storageServiceName+".dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") -spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) -spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", "https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-scope", "tenant-id")+"/oauth2/token") +spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientSecretKey")) +spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", "https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-kv", "tenant-id")+"/oauth2/token") // COMMAND ---------- diff --git a/tests/integration/spark-apps/notebooks/abfss-in-abfss-out-root.scala b/tests/integration/spark-apps/notebooks/abfss-in-abfss-out-root.scala index 7ed5f0a..6177e48 100644 --- a/tests/integration/spark-apps/notebooks/abfss-in-abfss-out-root.scala +++ b/tests/integration/spark-apps/notebooks/abfss-in-abfss-out-root.scala @@ -9,7 +9,7 @@ val ouptutContainerName = "writetoroot" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputAbfssRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net/root" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) diff --git a/tests/integration/spark-apps/notebooks/abfss-in-abfss-out.scala b/tests/integration/spark-apps/notebooks/abfss-in-abfss-out.scala index 82e3738..04c49a2 100644 --- a/tests/integration/spark-apps/notebooks/abfss-in-abfss-out.scala +++ b/tests/integration/spark-apps/notebooks/abfss-in-abfss-out.scala @@ -9,7 +9,7 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) diff --git 
a/tests/integration/spark-apps/notebooks/abfss-in-hive+notmgd+saveAsTable-out.scala b/tests/integration/spark-apps/notebooks/abfss-in-hive+notmgd+saveAsTable-out.scala index 648fc45..d7ccd8d 100644 --- a/tests/integration/spark-apps/notebooks/abfss-in-hive+notmgd+saveAsTable-out.scala +++ b/tests/integration/spark-apps/notebooks/abfss-in-hive+notmgd+saveAsTable-out.scala @@ -18,7 +18,7 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) diff --git a/tests/integration/spark-apps/notebooks/abfss-in-hive+saveAsTable-out.scala b/tests/integration/spark-apps/notebooks/abfss-in-hive+saveAsTable-out.scala index c6d6565..2922297 100644 --- a/tests/integration/spark-apps/notebooks/abfss-in-hive+saveAsTable-out.scala +++ b/tests/integration/spark-apps/notebooks/abfss-in-hive+saveAsTable-out.scala @@ -22,7 +22,7 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) diff --git a/tests/integration/spark-apps/notebooks/azuresql-in-azuresql-out.scala b/tests/integration/spark-apps/notebooks/azuresql-in-azuresql-out.scala index a412c0f..c6bf66f 100644 --- a/tests/integration/spark-apps/notebooks/azuresql-in-azuresql-out.scala +++ b/tests/integration/spark-apps/notebooks/azuresql-in-azuresql-out.scala @@ -10,12 +10,10 @@ import java.lang.{ClassNotFoundException} // COMMAND ---------- -val server_name = "jdbc:sqlserver://FILL-IN-CONNECTION-STRING" -val database_name = "purview-to-adb-sqldb" -val url = server_name + ";" + "database=" + database_name + ";" +val url = dbutils.secrets.get("purview-to-adb-kv", "azuresql-jdbc-conn-str") -val username = dbutils.secrets.get("purview-to-adb-scope", "azuresql-username") -val password = dbutils.secrets.get("purview-to-adb-scope", "azuresql-password") +val username = dbutils.secrets.get("purview-to-adb-kv", "azuresql-username") +val password = dbutils.secrets.get("purview-to-adb-kv", "azuresql-password") // COMMAND ---------- diff --git a/tests/integration/spark-apps/notebooks/call-via-adf-spark2.scala b/tests/integration/spark-apps/notebooks/call-via-adf-spark2.scala index f2e147c..58c4039 100644 --- a/tests/integration/spark-apps/notebooks/call-via-adf-spark2.scala +++ b/tests/integration/spark-apps/notebooks/call-via-adf-spark2.scala @@ -9,7 +9,7 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") 
spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) diff --git a/tests/integration/spark-apps/notebooks/call-via-adf-spark3.scala b/tests/integration/spark-apps/notebooks/call-via-adf-spark3.scala index f4be397..c283939 100644 --- a/tests/integration/spark-apps/notebooks/call-via-adf-spark3.scala +++ b/tests/integration/spark-apps/notebooks/call-via-adf-spark3.scala @@ -9,7 +9,7 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) diff --git a/tests/integration/spark-apps/notebooks/delta-in-delta-merge-package.py b/tests/integration/spark-apps/notebooks/delta-in-delta-merge-package.py new file mode 100644 index 0000000..5acbb31 --- /dev/null +++ b/tests/integration/spark-apps/notebooks/delta-in-delta-merge-package.py @@ -0,0 +1,94 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Delta Table using Python Package rather than SQL + +# COMMAND ---------- + +import os +storageServiceName = os.environ.get("STORAGE_SERVICE_NAME") +storageContainerName = "rawdata" +abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" + +storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") + +spark.conf.set("fs.azure.account.auth.type."+storageServiceName+".dfs.core.windows.net", "OAuth") +spark.conf.set("fs.azure.account.oauth.provider.type."+storageServiceName+".dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") +spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientSecretKey")) +spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", "https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-kv", "tenant-id")+"/oauth2/token") + +# COMMAND ---------- + +from delta.tables import * + +exampleInputA = DeltaTable.forPath(spark, abfssRootPath+"/testcase/delta-merge-using-delta-package/subfolder-a/productA/") +exampleInputB = DeltaTable.forPath(spark, abfssRootPath+"/testcase/delta-merge-using-delta-package/subfolder-b/productB/") + +dfUpdates = exampleInputB.toDF() + +# COMMAND ---------- + +( + exampleInputA.alias('a') + .merge( + dfUpdates.alias('updates'), + 'a.id = updates.id' + ) + .whenMatchedUpdate( + set = { + "id": "updates.id", + "postalCode": "updates.postalCode", + "streetAddress": "updates.streetAddress" + } + ) + .whenNotMatchedInsert( + values = { + "id": "updates.id", + "postalCode": "updates.postalCode", + "streetAddress": "updates.streetAddress" + } + ) + .execute() +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # For Experimenting + +# COMMAND ---------- + +# %scala +# val exampleA = ( +# spark.read.format("delta") +# .load(abfssRootPath+"/testcase/sixteen/exampleInputA") +# ) + +# val exampleB = ( +# spark.read.format("delta") +# .load(abfssRootPath+"/testcase/sixteen/exampleInputB") +# ) +# val 
outputDf = exampleA.join(exampleB, exampleA("id") === exampleB("id"), "inner").drop(exampleB("id")) +# outputDf.createOrReplaceTempView("outputDf") + +# COMMAND ---------- + +# dfUpdates.createOrReplaceTempView("updates") + +# COMMAND ---------- + +# %sql +# MERGE INTO deltadestination_tbl +# USING updates +# ON deltadestination_tbl.id = updates.id +# WHEN MATCHED THEN +# UPDATE SET +# id = updates.id, +# postalcode = updates.postalcode, +# streetAddress = updates.streetAddress +# WHEN NOT MATCHED +# THEN INSERT * + +# COMMAND ---------- + + diff --git a/tests/integration/spark-apps/notebooks/delta-in-delta-merge.scala b/tests/integration/spark-apps/notebooks/delta-in-delta-merge.scala index 81d88ff..c74bc0e 100644 --- a/tests/integration/spark-apps/notebooks/delta-in-delta-merge.scala +++ b/tests/integration/spark-apps/notebooks/delta-in-delta-merge.scala @@ -11,13 +11,13 @@ val storageServiceName = sys.env("STORAGE_SERVICE_NAME") val storageContainerName = "rawdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "example-sa-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.auth.type."+storageServiceName+".dfs.core.windows.net", "OAuth") spark.conf.set("fs.azure.account.oauth.provider.type."+storageServiceName+".dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") -spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) -spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", "https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-scope", "tenant-id")+"/oauth2/token") +spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientSecretKey")) +spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", "https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-kv", "tenant-id")+"/oauth2/token") // COMMAND ---------- @@ -35,7 +35,7 @@ outputDf.createOrReplaceTempView("outputDf") // COMMAND ---------- -// This is ran only once +// This is ran only once and screwing up our tests for delta // %sql // CREATE TABLE testcasesixteen // USING DELTA diff --git a/tests/integration/spark-apps/notebooks/delta-in-delta-out-abfss.scala b/tests/integration/spark-apps/notebooks/delta-in-delta-out-abfss.scala index 03e31de..6989356 100644 --- a/tests/integration/spark-apps/notebooks/delta-in-delta-out-abfss.scala +++ b/tests/integration/spark-apps/notebooks/delta-in-delta-out-abfss.scala @@ -5,13 +5,13 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "example-sa-key") +val storageKey = 
dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.auth.type."+storageServiceName+".dfs.core.windows.net", "OAuth") spark.conf.set("fs.azure.account.oauth.provider.type."+storageServiceName+".dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") -spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) -spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", "https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-scope", "tenant-id")+"/oauth2/token") +spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientSecretKey")) +spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", "https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-kv", "tenant-id")+"/oauth2/token") // COMMAND ---------- diff --git a/tests/integration/spark-apps/notebooks/delta-in-delta-out-fs.scala b/tests/integration/spark-apps/notebooks/delta-in-delta-out-fs.scala index d247e4f..3d3131a 100644 --- a/tests/integration/spark-apps/notebooks/delta-in-delta-out-fs.scala +++ b/tests/integration/spark-apps/notebooks/delta-in-delta-out-fs.scala @@ -5,14 +5,14 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "example-sa-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") //spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) spark.conf.set("fs.azure.account.auth.type."+storageServiceName+".dfs.core.windows.net", "OAuth") spark.conf.set("fs.azure.account.oauth.provider.type."+storageServiceName+".dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") -spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) -spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", "https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-scope", "tenant-id")+"/oauth2/token") +spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientSecretKey")) +spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", 
"https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-kv", "tenant-id")+"/oauth2/token") // COMMAND ---------- diff --git a/tests/integration/spark-apps/notebooks/delta-in-delta-out-mnt.scala b/tests/integration/spark-apps/notebooks/delta-in-delta-out-mnt.scala index 487c2e0..1d2f8c2 100644 --- a/tests/integration/spark-apps/notebooks/delta-in-delta-out-mnt.scala +++ b/tests/integration/spark-apps/notebooks/delta-in-delta-out-mnt.scala @@ -5,14 +5,14 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "example-sa-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") //spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) spark.conf.set("fs.azure.account.auth.type."+storageServiceName+".dfs.core.windows.net", "OAuth") spark.conf.set("fs.azure.account.oauth.provider.type."+storageServiceName+".dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") -spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) -spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", "https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-scope", "tenant-id")+"/oauth2/token") +spark.conf.set("fs.azure.account.oauth2.client.id."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("fs.azure.account.oauth2.client.secret."+storageServiceName+".dfs.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "clientSecretKey")) +spark.conf.set("fs.azure.account.oauth2.client.endpoint."+storageServiceName+".dfs.core.windows.net", "https://login.microsoftonline.com/"+dbutils.secrets.get("purview-to-adb-kv", "tenant-id")+"/oauth2/token") // COMMAND ---------- diff --git a/tests/integration/spark-apps/notebooks/hive+abfss-in-hive+abfss-out-insert.py b/tests/integration/spark-apps/notebooks/hive+abfss-in-hive+abfss-out-insert.py index d0ea40d..d95ced1 100644 --- a/tests/integration/spark-apps/notebooks/hive+abfss-in-hive+abfss-out-insert.py +++ b/tests/integration/spark-apps/notebooks/hive+abfss-in-hive+abfss-out-insert.py @@ -7,7 +7,7 @@ abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) spark.conf.set('spark.query.rootPath',abfssRootPath) @@ -15,25 +15,27 @@ # COMMAND ---------- -# MAGIC %sql -# MAGIC CREATE TABLE IF NOT EXISTS default.hiveExampleA001 ( -# MAGIC tableId INT, -# MAGIC x INT -# MAGIC ) -# MAGIC LOCATION 'abfss://rawdata@.dfs.core.windows.net/testcase/twentyone/exampleInputA/' -# MAGIC ; -# MAGIC -# MAGIC CREATE TABLE IF NOT EXISTS 
default.hiveExampleOutput001( -# MAGIC tableId INT, -# MAGIC x INT -# MAGIC ) -# MAGIC LOCATION 'abfss://rawdata@.dfs.core.windows.net/testcase/twentyone/exampleOutput/' -# MAGIC ; +spark.sql(f""" +CREATE TABLE IF NOT EXISTS default.testSample ( +tableId INT, +x INT +) +LOCATION 'abfss://rawdata@{storageServiceName}.dfs.core.windows.net/testcase/twentyone/exampleInputA/' +; +""" +) # COMMAND ---------- -# %sql -# INSERT INTO default.hiveExampleA001 (tableId, x) VALUES(1,2) +spark.sql(f""" +CREATE TABLE IF NOT EXISTS default.hiveExampleOutput001 ( +tableId INT, +x INT +) +LOCATION 'abfss://rawdata@{storageServiceName}.dfs.core.windows.net/testcase/twentyone/exampleOutput/' +; +""" +) # COMMAND ---------- @@ -44,12 +46,4 @@ # COMMAND ---------- -spark.read.table("default.hiveExampleOutput001").inputFiles() - -# COMMAND ---------- - -dbutils.fs.ls("abfss://rawdata@.dfs.core.windows.net/testcase/twentyone/exampleInputA/") - -# COMMAND ---------- - diff --git a/tests/integration/spark-apps/notebooks/hive+mgd+not+default-in-hive+mgd+not+default-out-insert.py b/tests/integration/spark-apps/notebooks/hive+mgd+not+default-in-hive+mgd+not+default-out-insert.py index 3bd5be9..18a99c8 100644 --- a/tests/integration/spark-apps/notebooks/hive+mgd+not+default-in-hive+mgd+not+default-out-insert.py +++ b/tests/integration/spark-apps/notebooks/hive+mgd+not+default-in-hive+mgd+not+default-out-insert.py @@ -1,28 +1,24 @@ # Databricks notebook source -# %sql -# CREATE DATABASE IF NOT EXISTS notdefault; +# MAGIC %sql +# MAGIC CREATE DATABASE IF NOT EXISTS notdefault; # COMMAND ---------- -# %sql -# CREATE TABLE IF NOT EXISTS notdefault.hiveExampleA ( -# tableId INT, -# x INT -# ); - -# CREATE TABLE notdefault.hiveExampleOutput( -# tableId INT, -# x INT -# ) - -# COMMAND ---------- +# MAGIC %sql +# MAGIC CREATE TABLE IF NOT EXISTS notdefault.hiveExampleA ( +# MAGIC tableId INT, +# MAGIC x INT +# MAGIC ); -# %sql -# INSERT INTO notdefault.hiveExampleA (tableId, x) VALUES(1,2) +# MAGIC CREATE TABLE IF NOT EXISTS notdefault.hiveExampleOutput( +# MAGIC tableId INT, +# MAGIC x INT +# MAGIC ) # COMMAND ---------- -spark.sparkContext.setLogLevel("DEBUG") +# MAGIC %sql +# MAGIC INSERT INTO notdefault.hiveExampleA (tableId, x) VALUES(1,2) # COMMAND ---------- @@ -32,10 +28,3 @@ # MAGIC FROM notdefault.hiveExampleA # COMMAND ---------- - -# MAGIC %md -# MAGIC # Exploring the File Path - -# COMMAND ---------- - -# dbutils.fs.ls("/user/hive/warehouse/notdefault.db/hiveexamplea") diff --git a/tests/integration/spark-apps/notebooks/hive-in-hive-out-insert.py b/tests/integration/spark-apps/notebooks/hive-in-hive-out-insert.py index 914a4c0..e68d33b 100644 --- a/tests/integration/spark-apps/notebooks/hive-in-hive-out-insert.py +++ b/tests/integration/spark-apps/notebooks/hive-in-hive-out-insert.py @@ -1,19 +1,14 @@ -# Datricks notebook source -# %sql -# CREATE TABLE IF NOT EXISTS default.hiveExampleA000 ( -# tableId INT, -# x INT -# ); - -# CREATE TABLE default.hiveExampleOutput000( -# tableId INT, -# x INT -# ) - -# COMMAND ---------- - -# %sql -# INSERT INTO default.hiveExampleA000 (tableId, x) VALUES(1,2) +# Databricks notebook source +# MAGIC %sql +# MAGIC CREATE TABLE IF NOT EXISTS default.hiveExampleA000 ( +# MAGIC tableId INT, +# MAGIC x INT +# MAGIC ); +# MAGIC +# MAGIC CREATE TABLE IF NOT EXISTS default.hiveExampleOutput000( +# MAGIC tableId INT, +# MAGIC x INT +# MAGIC ) # COMMAND ---------- diff --git a/tests/integration/spark-apps/notebooks/intermix-languages.scala 
b/tests/integration/spark-apps/notebooks/intermix-languages.scala index dee2728..83d51cf 100644 --- a/tests/integration/spark-apps/notebooks/intermix-languages.scala +++ b/tests/integration/spark-apps/notebooks/intermix-languages.scala @@ -9,14 +9,15 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) // COMMAND ---------- // MAGIC %python -// MAGIC storageServiceName = sys.env("STORAGE_SERVICE_NAME") +// MAGIC import os +// MAGIC storageServiceName = os.environ.get("STORAGE_SERVICE_NAME") // MAGIC storageContainerName = "rawdata" // MAGIC ouptutContainerName = "outputdata" // MAGIC abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" diff --git a/tests/integration/spark-apps/notebooks/kusto-in-wasbs-out.scala b/tests/integration/spark-apps/notebooks/kusto-in-wasbs-out.scala new file mode 100644 index 0000000..5cb20d6 --- /dev/null +++ b/tests/integration/spark-apps/notebooks/kusto-in-wasbs-out.scala @@ -0,0 +1,60 @@ +// Databricks notebook source +spark.sparkContext.setLogLevel("ALL") + +// COMMAND ---------- + +import org.apache.commons.lang3.reflect.FieldUtils +import org.apache.commons.lang3.reflect.MethodUtils +import org.apache.spark.sql.execution.datasources.LogicalRelation +import com.microsoft.kusto.spark.datasink.KustoSinkOptions +import org.apache.spark.sql.{SaveMode, SparkSession} +import com.microsoft.kusto.spark.datasource.KustoSourceOptions +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import com.microsoft.azure.kusto.data.ClientRequestProperties +import com.microsoft.kusto.spark.sql.extension.SparkExtension._ +import com.microsoft.azure.kusto.data.ClientRequestProperties + +// COMMAND ---------- + +val appId = dbutils.secrets.get("purview-to-adb-kv", "azurekusto-appid") +val appKey = dbutils.secrets.get("purview-to-adb-kv", "azurekusto-appsecret") +val uri = dbutils.secrets.get("purview-to-adb-kv", "azurekusto-uri") +val authorityId = dbutils.secrets.get("purview-to-adb-kv", "tenant-id") +val cluster = uri.replaceAll(".kusto.windows.net", "").replaceAll("https://", "") +val database = "database01" // this is hardcoded - so if changed in the bicep template, also needs to be changed here. 
+val table = "table01" + +// COMMAND ---------- + +val conf: Map[String, String] = Map( + KustoSourceOptions.KUSTO_AAD_APP_ID -> appId, + KustoSourceOptions.KUSTO_AAD_APP_SECRET -> appKey, + KustoSourceOptions.KUSTO_AAD_AUTHORITY_ID -> authorityId + ) + +val df = spark.read.kusto(cluster, database, table, conf) + +// COMMAND ---------- + +val storageServiceName = sys.env("STORAGE_SERVICE_NAME") +val ouptutContainerName = "outputdata" + +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") + +spark.conf.set("fs.azure.account.key."+storageServiceName+".blob.core.windows.net", storageKey) + +// COMMAND ---------- + +val wasbsRootPath = "wasbs://"+ouptutContainerName+"@"+storageServiceName+".blob.core.windows.net" + +val file_location = wasbsRootPath+"/kusto/wasbs_out.csv" +val file_type = "csv" + +// COMMAND ---------- + +df.write.mode("overwrite").option("header","true").csv(file_location) + +// COMMAND ---------- + + diff --git a/tests/integration/spark-apps/notebooks/mnt-in-mnt-out.scala b/tests/integration/spark-apps/notebooks/mnt-in-mnt-out.scala index 2485288..92ed5d8 100644 --- a/tests/integration/spark-apps/notebooks/mnt-in-mnt-out.scala +++ b/tests/integration/spark-apps/notebooks/mnt-in-mnt-out.scala @@ -7,7 +7,7 @@ val storageServiceName = sys.env("STORAGE_SERVICE_NAME") val storageContainerName = "rawdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "example-sa-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) diff --git a/tests/integration/spark-apps/notebooks/mysql-in-mysql-out.py b/tests/integration/spark-apps/notebooks/mysql-in-mysql-out.py new file mode 100644 index 0000000..593f3a2 --- /dev/null +++ b/tests/integration/spark-apps/notebooks/mysql-in-mysql-out.py @@ -0,0 +1,57 @@ +# Databricks notebook source +# MAGIC %scala +# MAGIC Class.forName("com.mysql.cj.jdbc.Driver") + +# COMMAND ---------- + +host = dbutils.secrets.get("purview-to-adb-kv", "mysql-hostname") +user = dbutils.secrets.get("purview-to-adb-kv", "mysql-user") +password = dbutils.secrets.get("purview-to-adb-kv", "mysql-password") +database = "mydatabase" # hardcoded based on populate-data-mysql notebook. +table = "people" # hardcoded based on populate-data-mysql notebook. 
+port = "3306" # update if you use a non-default port +driver = "com.mysql.cj.jdbc.Driver" + +# COMMAND ---------- + +url = f"jdbc:mysql://{host}:{port}/{database}" + +df = (spark.read + .format("jdbc") + .option("driver", driver) + .option("url", url) + .option("dbtable", table) + .option("user", user) + .option("ssl", False) + .option("password", password) + .load() +) + +# COMMAND ---------- + +df.show() + +# COMMAND ---------- + +df=df.withColumn("age", df.age-100) + +# COMMAND ---------- + +df.show() + +# COMMAND ---------- + +df.write \ + .format("jdbc") \ + .option("driver", driver) \ + .option("url", url) \ + .option("dbtable", "fruits") \ + .option("user", user) \ + .option("ssl", False) \ + .mode("overwrite") \ + .option("password", password) \ + .save() + +# COMMAND ---------- + + diff --git a/tests/integration/spark-apps/notebooks/name-with-periods.scala b/tests/integration/spark-apps/notebooks/name-with-periods.scala index f26aced..151b3d0 100644 --- a/tests/integration/spark-apps/notebooks/name-with-periods.scala +++ b/tests/integration/spark-apps/notebooks/name-with-periods.scala @@ -9,7 +9,7 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) diff --git a/tests/integration/spark-apps/notebooks/nested-child.scala b/tests/integration/spark-apps/notebooks/nested-child.scala index 5b98bbe..3cd2b5f 100644 --- a/tests/integration/spark-apps/notebooks/nested-child.scala +++ b/tests/integration/spark-apps/notebooks/nested-child.scala @@ -9,7 +9,7 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) diff --git a/tests/integration/spark-apps/notebooks/populate-data-kusto.scala b/tests/integration/spark-apps/notebooks/populate-data-kusto.scala new file mode 100644 index 0000000..ff04fc5 --- /dev/null +++ b/tests/integration/spark-apps/notebooks/populate-data-kusto.scala @@ -0,0 +1,50 @@ +// Databricks notebook source +spark.sparkContext.setLogLevel("ALL") + +// COMMAND ---------- + +import org.apache.commons.lang3.reflect.FieldUtils +import org.apache.commons.lang3.reflect.MethodUtils +import org.apache.spark.sql.execution.datasources.LogicalRelation +import com.microsoft.kusto.spark.datasink.KustoSinkOptions +import org.apache.spark.sql.{SaveMode, SparkSession} +import com.microsoft.kusto.spark.datasource.KustoSourceOptions +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import com.microsoft.azure.kusto.data.ClientRequestProperties +import com.microsoft.kusto.spark.sql.extension.SparkExtension._ +import com.microsoft.azure.kusto.data.ClientRequestProperties + +// COMMAND ---------- + +val appId = dbutils.secrets.get("purview-to-adb-kv", "azurekusto-appid") +val appKey = 
dbutils.secrets.get("purview-to-adb-kv", "azurekusto-appsecret") +val uri = dbutils.secrets.get("purview-to-adb-kv", "azurekusto-uri") +val authorityId = dbutils.secrets.get("purview-to-adb-kv", "tenant-id") +val cluster = uri.replaceAll(".kusto.windows.net", "").replaceAll("https://", "") +val database = "database01" // this is hardcoded - so if changed in the bicep template, also needs to be changed here. +val table = "table01" + +// COMMAND ---------- + +case class City(id: String, name: String, country: String) + +val df = Seq(new City("1", "Milwaukee", "USA"), new City("2", "Cairo", "Egypt"), new City("3", "Doha", "Qatar"), new City("4", "Kabul", "Afghanistan")).toDF + +// COMMAND ---------- + +df.write + .format("com.microsoft.kusto.spark.datasource") + .option(KustoSinkOptions.KUSTO_CLUSTER, cluster) + .option(KustoSinkOptions.KUSTO_DATABASE, database) + .option(KustoSinkOptions.KUSTO_TABLE, table) + .option(KustoSinkOptions.KUSTO_AAD_APP_ID, appId) + .option(KustoSinkOptions.KUSTO_AAD_APP_SECRET, appKey) + .option(KustoSinkOptions.KUSTO_AAD_AUTHORITY_ID, authorityId) + .option(KustoSinkOptions.KUSTO_TABLE_CREATE_OPTIONS, "CreateIfNotExist") + .mode(SaveMode.Append) + .save() + +// COMMAND ---------- + + diff --git a/tests/integration/spark-apps/notebooks/postgres-in-postgres-out.py b/tests/integration/spark-apps/notebooks/postgres-in-postgres-out.py new file mode 100644 index 0000000..e02a81c --- /dev/null +++ b/tests/integration/spark-apps/notebooks/postgres-in-postgres-out.py @@ -0,0 +1,50 @@ +# Databricks notebook source +host = dbutils.secrets.get("purview-to-adb-kv", "postgres-host") +port = "5432" +dbname = "postgres" +user = dbutils.secrets.get("purview-to-adb-kv", "postgres-admin-user") +password = dbutils.secrets.get("purview-to-adb-kv", "postgres-admin-password") +table_in = "people" # hardcoded based on populate-data-postgres. 
+table_out = "fruits" +sslmode = "require" + +# COMMAND ---------- + +df = spark.read \ + .format("jdbc") \ + .option("url", f"jdbc:postgresql://{host}:{port}/{dbname}") \ + .option("dbtable", table_in) \ + .option("user", user) \ + .option("password", password) \ + .option("driver", "org.postgresql.Driver") \ + .option("ssl", False) \ + .load() + +# COMMAND ---------- + +df.show() + +# COMMAND ---------- + +df=df.withColumn("age", df.age-100) + +# COMMAND ---------- + +df.show() + +# COMMAND ---------- + +df.write \ + .format("jdbc") \ + .option("url", f"jdbc:postgresql://{host}:{port}/{dbname}") \ + .option("dbtable", table_out) \ + .option("user", user) \ + .option("password", password) \ + .option("driver", "org.postgresql.Driver") \ + .mode("overwrite") \ + .option("ssl", False) \ + .save() + +# COMMAND ---------- + + diff --git a/tests/integration/spark-apps/notebooks/spark-sql-table-in-abfss-out.scala b/tests/integration/spark-apps/notebooks/spark-sql-table-in-abfss-out.scala index 74f2f8f..aa4fc63 100644 --- a/tests/integration/spark-apps/notebooks/spark-sql-table-in-abfss-out.scala +++ b/tests/integration/spark-apps/notebooks/spark-sql-table-in-abfss-out.scala @@ -46,7 +46,7 @@ val ouptutContainerName = "outputdata" val abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net" val outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "example-sa-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey) diff --git a/tests/integration/spark-apps/notebooks/synapse-in-synapse-out.scala b/tests/integration/spark-apps/notebooks/synapse-in-synapse-out.scala index cfc10dc..c5e9892 100644 --- a/tests/integration/spark-apps/notebooks/synapse-in-synapse-out.scala +++ b/tests/integration/spark-apps/notebooks/synapse-in-synapse-out.scala @@ -1,33 +1,33 @@ // Databricks notebook source //Defining the service principal credentials for the Azure storage account -val tenantid = dbutils.secrets.get("purview-to-adb-scope", "tenant-id") +val tenantid = dbutils.secrets.get("purview-to-adb-kv", "tenant-id") val synapseStorageAccount = sys.env("SYNAPSE_STORAGE_SERVICE_NAME") spark.conf.set("fs.azure.account.auth.type", "OAuth") spark.conf.set("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") -spark.conf.set("fs.azure.account.oauth2.client.id", dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("fs.azure.account.oauth2.client.secret", dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) +spark.conf.set("fs.azure.account.oauth2.client.id", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("fs.azure.account.oauth2.client.secret", dbutils.secrets.get("purview-to-adb-kv", "project-spn-secret")) spark.conf.set("fs.azure.account.oauth2.client.endpoint", "https://login.microsoftonline.com/" + tenantid + "/oauth2/token") //# Defining a separate set of service principal credentials for Azure Synapse Analytics (If not defined, the connector will use the Azure storage account credentials) -spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.id", dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.secret", 
dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) -spark.conf.set("fs.azure.account.key."+synapseStorageAccount+".blob.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "synapse-storage-key")) +spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.id", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.secret", dbutils.secrets.get("purview-to-adb-kv", "project-spn-secret")) +spark.conf.set("fs.azure.account.key."+synapseStorageAccount+".blob.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "synapse-storage-key")) // COMMAND ---------- //Azure Synapse related settings -val dwDatabase = "SQLPool1" +val dwDatabase = "sqlpool1" val dwServer = sys.env("SYNAPSE_SERVICE_NAME")+".sql.azuresynapse.net" -val dwUser = dbutils.secrets.get("purview-to-adb-scope", "synapse-query-username") -val dwPass = dbutils.secrets.get("purview-to-adb-scope", "synapse-query-password") +val dwUser = dbutils.secrets.get("purview-to-adb-kv", "synapse-query-username") +val dwPass = dbutils.secrets.get("purview-to-adb-kv", "synapse-query-password") val dwJdbcPort = "1433" val dwJdbcExtraOptions = "encrypt=true;trustServerCertificate=true;hostNameInCertificate=*.database.windows.net;loginTimeout=30;" val sqlDwUrl = "jdbc:sqlserver://" + dwServer + ":" + dwJdbcPort + ";database=" + dwDatabase + ";user=" + dwUser+";password=" + dwPass + ";" + dwJdbcExtraOptions val blobStorage = synapseStorageAccount+".blob.core.windows.net" val blobContainer = "temp" -val blobAccessKey = dbutils.secrets.get("purview-to-adb-scope", "synapse-storage-key") +val blobAccessKey = dbutils.secrets.get("purview-to-adb-kv", "synapse-storage-key") val tempDir = "wasbs://" + blobContainer + "@" + blobStorage +"/tempfolder" // COMMAND ---------- diff --git a/tests/integration/spark-apps/notebooks/synapse-in-wasbs-out.scala b/tests/integration/spark-apps/notebooks/synapse-in-wasbs-out.scala index 90e0dc4..34ced24 100644 --- a/tests/integration/spark-apps/notebooks/synapse-in-wasbs-out.scala +++ b/tests/integration/spark-apps/notebooks/synapse-in-wasbs-out.scala @@ -1,40 +1,40 @@ // Databricks notebook source //Defining the service principal credentials for the Azure storage account -val tenantid = dbutils.secrets.get("purview-to-adb-scope", "tenant-id") +val tenantid = dbutils.secrets.get("purview-to-adb-kv", "tenant-id") val synapseStorageAccount = sys.env("SYNAPSE_STORAGE_SERVICE_NAME") spark.conf.set("fs.azure.account.auth.type", "OAuth") spark.conf.set("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") -spark.conf.set("fs.azure.account.oauth2.client.id", dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("fs.azure.account.oauth2.client.secret", dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) +spark.conf.set("fs.azure.account.oauth2.client.id", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("fs.azure.account.oauth2.client.secret", dbutils.secrets.get("purview-to-adb-kv", "clientSecretKey")) spark.conf.set("fs.azure.account.oauth2.client.endpoint", "https://login.microsoftonline.com/" + tenantid + "/oauth2/token") //# Defining a separate set of service principal credentials for Azure Synapse Analytics (If not defined, the connector will use the Azure storage account credentials) -spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.id", 
dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.secret", dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) -spark.conf.set("fs.azure.account.key."+synapseStorageAccount+".blob.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "synapse-storage-key")) +spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.id", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.secret", dbutils.secrets.get("purview-to-adb-kv", "clientSecretKey")) +spark.conf.set("fs.azure.account.key."+synapseStorageAccount+".blob.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "synapse-storage-key")) // COMMAND ---------- //Azure Synapse related settings -val dwDatabase = "SQLPool1" +val dwDatabase = "sqlpool1" val dwServer = sys.env("SYNAPSE_SERVICE_NAME")+".sql.azuresynapse.net" -val dwUser = dbutils.secrets.get("purview-to-adb-scope", "synapse-query-username") -val dwPass = dbutils.secrets.get("purview-to-adb-scope", "synapse-query-password") +val dwUser = dbutils.secrets.get("purview-to-adb-kv", "synapse-query-username") +val dwPass = dbutils.secrets.get("purview-to-adb-kv", "synapse-query-password") val dwJdbcPort = "1433" val dwJdbcExtraOptions = "encrypt=true;trustServerCertificate=true;hostNameInCertificate=*.database.windows.net;loginTimeout=30;" val sqlDwUrl = "jdbc:sqlserver://" + dwServer + ":" + dwJdbcPort + ";database=" + dwDatabase + ";user=" + dwUser+";password=" + dwPass + ";" + dwJdbcExtraOptions val blobStorage = synapseStorageAccount+".blob.core.windows.net" val blobContainer = "temp" -val blobAccessKey = dbutils.secrets.get("purview-to-adb-scope", "synapse-storage-key") +val blobAccessKey = dbutils.secrets.get("purview-to-adb-kv", "synapse-storage-key") val tempDir = "wasbs://" + blobContainer + "@" + blobStorage +"/tempfolder" val storageServiceName = sys.env("STORAGE_SERVICE_NAME") val storageContainerName = "outputdata" val wasbsRootPath = "wasbs://"+storageContainerName+"@"+storageServiceName+".blob.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "example-sa-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".blob.core.windows.net", storageKey) diff --git a/tests/integration/spark-apps/notebooks/synapse-wasbs-in-synapse-out.scala b/tests/integration/spark-apps/notebooks/synapse-wasbs-in-synapse-out.scala index 4c7205e..643b5bf 100644 --- a/tests/integration/spark-apps/notebooks/synapse-wasbs-in-synapse-out.scala +++ b/tests/integration/spark-apps/notebooks/synapse-wasbs-in-synapse-out.scala @@ -2,42 +2,42 @@ import org.apache.spark.sql.types.{StructType, StructField, IntegerType, StringType} //Defining the service principal credentials for the Azure storage account -val tenantid = dbutils.secrets.get("purview-to-adb-scope", "tenant-id") +val tenantid = dbutils.secrets.get("purview-to-adb-kv", "tenant-id") val synapseStorageAccount = sys.env("SYNAPSE_STORAGE_SERVICE_NAME") spark.conf.set("fs.azure.account.auth.type", "OAuth") spark.conf.set("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") -spark.conf.set("fs.azure.account.oauth2.client.id", dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("fs.azure.account.oauth2.client.secret", 
dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) +spark.conf.set("fs.azure.account.oauth2.client.id", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("fs.azure.account.oauth2.client.secret", dbutils.secrets.get("purview-to-adb-kv", "clientSecretKey")) spark.conf.set("fs.azure.account.oauth2.client.endpoint", "https://login.microsoftonline.com/" + tenantid + "/oauth2/token") //# Defining a separate set of service principal credentials for Azure Synapse Analytics (If not defined, the connector will use the Azure storage account credentials) -spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.id", dbutils.secrets.get("purview-to-adb-scope", "project-spn-client-id")) -spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.secret", dbutils.secrets.get("purview-to-adb-scope", "project-spn-secret")) -spark.conf.set("fs.azure.account.key."+synapseStorageAccount+".blob.core.windows.net", dbutils.secrets.get("purview-to-adb-scope", "synapse-storage-key")) +spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.id", dbutils.secrets.get("purview-to-adb-kv", "clientIdKey")) +spark.conf.set("spark.databricks.sqldw.jdbc.service.principal.client.secret", dbutils.secrets.get("purview-to-adb-kv", "clientSecretKey")) +spark.conf.set("fs.azure.account.key."+synapseStorageAccount+".blob.core.windows.net", dbutils.secrets.get("purview-to-adb-kv", "synapse-storage-key")) val storageServiceName = sys.env("STORAGE_SERVICE_NAME") val storageContainerName = "rawdata" val wasbsRootPath = "wasbs://"+storageContainerName+"@"+storageServiceName+".blob.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "example-sa-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".blob.core.windows.net", storageKey) // COMMAND ---------- //Azure Synapse related settings -val dwDatabase = "SQLPool1" +val dwDatabase = "sqlpool1" val dwServer = sys.env("SYNAPSE_SERVICE_NAME")+".sql.azuresynapse.net" -val dwUser = dbutils.secrets.get("purview-to-adb-scope", "synapse-query-username") -val dwPass = dbutils.secrets.get("purview-to-adb-scope", "synapse-query-password") +val dwUser = dbutils.secrets.get("purview-to-adb-kv", "synapse-query-username") +val dwPass = dbutils.secrets.get("purview-to-adb-kv", "synapse-query-password") val dwJdbcPort = "1433" val dwJdbcExtraOptions = "encrypt=true;trustServerCertificate=true;hostNameInCertificate=*.database.windows.net;loginTimeout=30;" val sqlDwUrl = "jdbc:sqlserver://" + dwServer + ":" + dwJdbcPort + ";database=" + dwDatabase + ";user=" + dwUser+";password=" + dwPass + ";" + dwJdbcExtraOptions val blobStorage = synapseStorageAccount+".blob.core.windows.net" val blobContainer = "temp" -val blobAccessKey = dbutils.secrets.get("purview-to-adb-scope", "synapse-storage-key") +val blobAccessKey = dbutils.secrets.get("purview-to-adb-kv", "synapse-storage-key") val tempDir = "wasbs://" + blobContainer + "@" + blobStorage +"/tempfolder" // COMMAND ---------- diff --git a/tests/integration/spark-apps/notebooks/wasbs-in-kusto-out.scala b/tests/integration/spark-apps/notebooks/wasbs-in-kusto-out.scala new file mode 100644 index 0000000..ad6bf01 --- /dev/null +++ b/tests/integration/spark-apps/notebooks/wasbs-in-kusto-out.scala @@ -0,0 +1,66 @@ +// Databricks notebook source +spark.sparkContext.setLogLevel("DEBUG") + +// COMMAND ---------- + +import org.apache.commons.lang3.reflect.FieldUtils +import 
org.apache.commons.lang3.reflect.MethodUtils +import org.apache.spark.sql.execution.datasources.LogicalRelation +import com.microsoft.kusto.spark.datasink.KustoSinkOptions +import org.apache.spark.sql.{SaveMode, SparkSession} +import com.microsoft.kusto.spark.datasource.KustoSourceOptions +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import com.microsoft.azure.kusto.data.ClientRequestProperties +import com.microsoft.kusto.spark.sql.extension.SparkExtension._ +import com.microsoft.azure.kusto.data.ClientRequestProperties + +// COMMAND ---------- + +// MAGIC %md +// MAGIC ### Write + +// COMMAND ---------- + +val appId = dbutils.secrets.get("purview-to-adb-kv", "azurekusto-appid") +val appKey = dbutils.secrets.get("purview-to-adb-kv", "azurekusto-appsecret") +val uri = dbutils.secrets.get("purview-to-adb-kv", "azurekusto-uri") +val authorityId = dbutils.secrets.get("purview-to-adb-kv", "tenant-id") +val cluster = uri.replaceAll(".kusto.windows.net", "").replaceAll("https://", "") +val database = "database01" // this is hardcoded - so if changed in the bicep template, also needs to be changed here. +val table = "table01" + +// COMMAND ---------- + +val storageServiceName = sys.env("STORAGE_SERVICE_NAME") +val storageContainerName = "rawdata" +val wasbsRootPath = "wasbs://"+storageContainerName+"@"+storageServiceName+".blob.core.windows.net" + +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") + +spark.conf.set("fs.azure.account.key."+storageServiceName+".blob.core.windows.net", storageKey) + +val file_location = wasbsRootPath + "/testcase/one/exampleInputA/exampleInputA.csv" + +// COMMAND ---------- + +val df = spark.read.option("header","true").csv(file_location) + + +// COMMAND ---------- + +df.write + .format("com.microsoft.kusto.spark.datasource") + .option(KustoSinkOptions.KUSTO_CLUSTER, cluster) + .option(KustoSinkOptions.KUSTO_DATABASE, database) + .option(KustoSinkOptions.KUSTO_TABLE, table) + .option(KustoSinkOptions.KUSTO_AAD_APP_ID, appId) + .option(KustoSinkOptions.KUSTO_AAD_APP_SECRET, appKey) + .option(KustoSinkOptions.KUSTO_AAD_AUTHORITY_ID, authorityId) + .option(KustoSinkOptions.KUSTO_TABLE_CREATE_OPTIONS, "CreateIfNotExist") + .mode(SaveMode.Append) + .save() + +// COMMAND ---------- + + diff --git a/tests/integration/spark-apps/notebooks/wasbs-in-wasbs-out-with-param.py b/tests/integration/spark-apps/notebooks/wasbs-in-wasbs-out-with-param.py deleted file mode 100644 index 8dcb9a7..0000000 --- a/tests/integration/spark-apps/notebooks/wasbs-in-wasbs-out-with-param.py +++ /dev/null @@ -1,39 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC # Sample Databricks Lineage Extraction witrh param - -# COMMAND ---------- - -myval = dbutils.widgets.text('mayval','') -print(myval) - -# COMMAND ---------- - -key = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") - -spark.conf.set( - "fs.azure.account.key..blob.core.windows.net", - key) - -# COMMAND ---------- - -retail = ( - spark.read.csv("wasbs://rawdata@.blob.core.windows.net/retail/", inferSchema=True, header=True) - .withColumnRenamed('Customer ID', 'CustomerId' ) - .drop("Invoice") -) -retail.write.mode("overwrite").parquet("wasbs://outputdata@.blob.core.windows.net/retail/wasbdemo") - -# COMMAND ---------- - -display(retail.take(2)) - -# COMMAND ---------- - -retail2 = spark.read.parquet("wasbs://outputdata@.blob.core.windows.net/retail/wasbdemo") -retail2 = retail2.withColumnRenamed('Quantity', 'QuantitySold').drop('Country') 
-retail2.write.mode("overwrite").parquet("wasbs://outputdata@.blob.core.windows.net/retail/wasbdemo_updated") - -# COMMAND ---------- - -# display(retail2.take(2)) diff --git a/tests/integration/spark-apps/notebooks/wasbs-in-wasbs-out.scala b/tests/integration/spark-apps/notebooks/wasbs-in-wasbs-out.scala index f369f99..4368da8 100644 --- a/tests/integration/spark-apps/notebooks/wasbs-in-wasbs-out.scala +++ b/tests/integration/spark-apps/notebooks/wasbs-in-wasbs-out.scala @@ -7,7 +7,7 @@ val storageServiceName = sys.env("STORAGE_SERVICE_NAME") val storageContainerName = "rawdata" val wasbsRootPath = "wasbs://"+storageContainerName+"@"+storageServiceName+".blob.core.windows.net" -val storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key") +val storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key") spark.conf.set("fs.azure.account.key."+storageServiceName+".blob.core.windows.net", storageKey) @@ -22,7 +22,7 @@ val exampleA = ( spark.read.format("csv") .schema(exampleASchema) .option("header", true) - .load(wasbsRootPath+"/examples/data/csv/exampleInputA/exampleInputA.csv") + .load(wasbsRootPath+"/testcase/wasinwasout/exampleInputA/") ) @@ -35,14 +35,14 @@ val exampleB = ( spark.read.format("csv") .schema(exampleBSchema) .option("header", true) - .load(wasbsRootPath+"/examples/data/csv/exampleInputB/exampleInputB.csv") + .load(wasbsRootPath+"/testcase/wasinwasout/exampleInputB/") ) // COMMAND ---------- val outputDf = exampleA.join(exampleB, exampleA("id") === exampleB("id"), "inner").drop(exampleB("id")) -outputDf.repartition(1).write.mode("overwrite").format("csv").save(wasbsRootPath+"/examples/data/csv/exampleOutputWASBS/") +outputDf.repartition(1).write.mode("overwrite").format("csv").save(wasbsRootPath+"/testcase/wasinwasout/exampleOutputWASBS/") // COMMAND ---------- diff --git a/tests/integration/spark-apps/pythonscript/pythonscript.json b/tests/integration/spark-apps/pythonscript/pythonscript.json index 3732532..a58a07b 100644 --- a/tests/integration/spark-apps/pythonscript/pythonscript.json +++ b/tests/integration/spark-apps/pythonscript/pythonscript.json @@ -4,7 +4,7 @@ "num_workers": 1, "spark_version": "9.1.x-scala2.12", "spark_conf": { - "spark.openlineage.url.param.code": "{{secrets/purview-to-adb-scope/Ol-Output-Api-Key}}", + "spark.openlineage.url.param.code": "{{secrets/purview-to-adb-kv/Ol-Output-Api-Key}}", "spark.openlineage.host": "https://YOURFUNCTION.azurewebsites.net", "spark.openlineage.namespace": "adb-123.1#ABC123", "spark.openlineage.version": "1" diff --git a/tests/integration/spark-apps/sparksubmit/sparksubmit.json b/tests/integration/spark-apps/sparksubmit/sparksubmit.json index 3fd25b5..d62c1b6 100644 --- a/tests/integration/spark-apps/sparksubmit/sparksubmit.json +++ b/tests/integration/spark-apps/sparksubmit/sparksubmit.json @@ -4,7 +4,7 @@ "num_workers": 1, "spark_version": "9.1.x-scala2.12", "spark_conf": { - "spark.openlineage.url.param.code": "{{secrets/purview-to-adb-scope/Ol-Output-Api-Key}}", + "spark.openlineage.url.param.code": "{{secrets/purview-to-adb-kv/Ol-Output-Api-Key}}", "spark.openlineage.host": "https://YOURFUNCTION.azurewebsites.net", "spark.openlineage.namespace": "YOURNAMESPACE#JOBNAME", "spark.openlineage.version": "1" diff --git a/tests/integration/spark-apps/wheeljobs/abfssInAbfssOut/abfssintest/main.py b/tests/integration/spark-apps/wheeljobs/abfssInAbfssOut/abfssintest/main.py index e36cf25..32486a4 100644 --- a/tests/integration/spark-apps/wheeljobs/abfssInAbfssOut/abfssintest/main.py 
+++ b/tests/integration/spark-apps/wheeljobs/abfssInAbfssOut/abfssintest/main.py
@@ -17,7 +17,7 @@ def runapp():
     abfssRootPath = "abfss://"+storageContainerName+"@"+storageServiceName+".dfs.core.windows.net"
     outputRootPath = "abfss://"+ouptutContainerName+"@"+storageServiceName+".dfs.core.windows.net"
 
-    storageKey = dbutils.secrets.get("purview-to-adb-scope", "storage-service-key")
+    storageKey = dbutils.secrets.get("purview-to-adb-kv", "storage-service-key")
 
     spark.conf.set("fs.azure.account.key."+storageServiceName+".dfs.core.windows.net", storageKey)
 
diff --git a/tests/integration/spark-apps/wheeljobs/abfssInAbfssOut/db-job-def.json b/tests/integration/spark-apps/wheeljobs/abfssInAbfssOut/db-job-def.json
deleted file mode 100644
index e89c7c1..0000000
--- a/tests/integration/spark-apps/wheeljobs/abfssInAbfssOut/db-job-def.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-    "settings": {
-        "existing_cluster_id": "TEST-CLUSTER-ID",
-        "libraries": [
-            {
-                "whl": "dbfs:/wheels/abfssintest-0.0.3-py3-none-any.whl"
-            }
-        ],
-        "python_wheel_task": {
-            "packageName": "abfssintest",
-            "entryPoint": "runapp"
-        },
-        "timeout_seconds": 0,
-        "email_notifications": {},
-        "name": "WheelJob",
-        "max_concurrent_runs": 1
-    }
-}
\ No newline at end of file