From 0cbcca41ac6655ecb25305f591d55ea8608fce8f Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 29 May 2024 12:34:42 -0600 Subject: [PATCH 1/6] updates gx --- src/agoradatatools/gx.py | 6 ++++-- tests/test_gx.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/agoradatatools/gx.py b/src/agoradatatools/gx.py index 565bcb98..a6247bf2 100644 --- a/src/agoradatatools/gx.py +++ b/src/agoradatatools/gx.py @@ -25,7 +25,7 @@ def __init__( syn: Synapse, dataset_path: str, dataset_name: str, - upload_folder: str, + upload_folder: str = None, nested_columns: typing.List[str] = None, ): """Initialize the class""" @@ -180,7 +180,9 @@ def run(self) -> None: f"Data validation complete for {self.expectation_suite_name}. Uploading results to Synapse." ) latest_reults_path = self._get_results_path(checkpoint_result) - self._upload_results_file_to_synapse(latest_reults_path) + + if self.upload_folder: + self._upload_results_file_to_synapse(latest_reults_path) if not checkpoint_result.success: fail_message = self.get_failed_expectations(checkpoint_result) diff --git a/tests/test_gx.py b/tests/test_gx.py index c9a8b176..d66b7572 100644 --- a/tests/test_gx.py +++ b/tests/test_gx.py @@ -276,3 +276,34 @@ def test_run_raises_error_when_validation_fails( patch_get_failed_expectations.assert_called_once_with( patch_upload_results_file_to_synapse.return_value ) + + def test_that_that_files_are_not_uploaded_when_upload_folder_is_none( + self, + ): + with patch.object( + self.good_runner, "_check_if_expectation_suite_exists", return_value=True + ), patch.object( + pd, "read_json", return_value=pd.DataFrame() + ) as patch_read_json, patch.object( + self.good_runner, + "convert_nested_columns_to_json", + return_value=pd.DataFrame(), + ) as patch_convert_nested_columns_to_json, patch.object( + self.good_runner, "_get_results_path", return_value="test_path" + ) as patch_get_results_path, patch.object( + self.good_runner, "_upload_results_file_to_synapse", return_value=None + ) as patch_upload_results_file_to_synapse, patch.object( + Checkpoint, + "run", + return_value=self.passed_checkpoint_result, + ) as patch_checkpoint_run, patch.object( + self.good_runner, "get_failed_expectations", return_value="test" + ) as patch_get_failed_expectations: + self.good_runner.upload_folder = None + self.good_runner.run() + patch_read_json.assert_called_once_with(self.good_runner.dataset_path) + patch_convert_nested_columns_to_json.assert_not_called() + patch_get_results_path.assert_called_once() + patch_upload_results_file_to_synapse.assert_not_called() + patch_checkpoint_run.assert_called_once() + patch_get_failed_expectations.assert_not_called() From 7c216104bac88a3b1ce45111b593fd4b00007c3b Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 29 May 2024 15:13:55 -0600 Subject: [PATCH 2/6] updates process --- src/agoradatatools/process.py | 82 ++++++++++++++------ tests/test_process.py | 137 +++++++++++++++++++++++++++++++++- 2 files changed, 194 insertions(+), 25 deletions(-) diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py index 29c256d5..d2c855de 100644 --- a/src/agoradatatools/process.py +++ b/src/agoradatatools/process.py @@ -1,4 +1,6 @@ import logging +import typing +from enum import Enum import synapseclient from pandas import DataFrame @@ -12,6 +14,12 @@ logger = logging.getLogger(__name__) +class Platform(Enum): + LOCAL = "LOCAL" + GITHUB = "GITHUB" + NEXTFLOW = "NEXTFLOW" + + # TODO refactor to avoid so many if's - maybe some sort of mapping to 
callables def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj: dict): if not isinstance(datasets, dict) or not isinstance(dataset_name, str): @@ -62,6 +70,7 @@ def process_dataset( staging_path: str, gx_folder: str, syn: synapseclient.Synapse, + upload: bool = True, ) -> tuple: """Takes in a dataset from the configuration file and passes it through the ETL process @@ -70,6 +79,7 @@ def process_dataset( staging_path (str): Staging path gx_folder (str): Synapse ID of the folder where Great Expectations reports should be uploaded syn (synapseclient.Synapse): synapseclient.Synapse session. + upload (bool, optional): Whether or not to upload the data to Synapse. Defaults to True. Returns: syn_obj (tuple): Tuple containing the id and version number of the uploaded file. @@ -128,7 +138,7 @@ def process_dataset( syn=syn, dataset_path=json_path, dataset_name=dataset_name, - upload_folder=gx_folder, + upload_folder=gx_folder if upload else None, nested_columns=( dataset_obj[dataset_name]["gx_nested_columns"] if "gx_nested_columns" in dataset_obj[dataset_name].keys() @@ -137,19 +147,18 @@ def process_dataset( ) gx_runner.run() - syn_obj = load.load( - file_path=json_path, - provenance=dataset_obj[dataset_name]["provenance"], - destination=dataset_obj[dataset_name]["destination"], - syn=syn, - ) - - return syn_obj + if upload: + load.load( + file_path=json_path, + provenance=dataset_obj[dataset_name]["provenance"], + destination=dataset_obj[dataset_name]["destination"], + syn=syn, + ) def create_data_manifest( syn: synapseclient.Synapse, parent: synapseclient.Folder = None -) -> DataFrame: +) -> typing.Union[DataFrame, None]: """Creates data manifest (dataframe) that has the IDs and version numbers of child synapse folders Args: @@ -157,14 +166,13 @@ def create_data_manifest( parent (synapseclient.Folder/str, optional): synapse folder or synapse id pointing to parent synapse folder. Defaults to None. Returns: - DataFrame: Dataframe containing IDs and version numbers of folders within the parent directory + Dataframe containing IDs and version numbers of folders within the parent directory, or None if parent is None """ if not parent: return None folders = syn.getChildren(parent) - folder = [folders] folder = [ {"id": folder["id"], "version": folder["versionNumber"]} for folder in folders ] @@ -176,17 +184,28 @@ def create_data_manifest( def process_all_files( syn: synapseclient.Synapse, config_path: str = None, + platform: Platform = Platform.LOCAL, + upload: bool = True, ): """This function will read through the entire configuration and process each file listed. Args: syn (synapseclient.Session): Synapse client session config_path (str, optional): path to configuration file. Defaults to None. + platform (Platform, optional): Platform where the process is being run. One of LOCAL, GITHUB, NEXTFLOW. Defaults to LOCAL. + upload (bool, optional): Whether or not to upload the data to Synapse. Defaults to True. """ + if platform == Platform.LOCAL and upload is True: + logger.warning( + """Data will be uploaded to Synapse despite the platform being set to `LOCAL`. + Make sure you have provided a configuration file with alternative upload `destination` and `gx_folder`. 
+ See the contributing guide for more information.""" + ) config = utils._get_config(config_path=config_path) datasets = config["datasets"] + destination = config["destination"] # create staging location staging_path = config.get("staging_path", None) @@ -204,14 +223,13 @@ def process_all_files( staging_path=staging_path, gx_folder=config["gx_folder"], syn=syn, + upload=upload, ) except Exception as e: error_list.append( f"{list(dataset.keys())[0]}: " + str(e).replace("\n", "") ) - destination = config["destination"] - if error_list: raise ADTDataProcessingError( "\nData Processing has failed for one or more data sources. Refer to the list of errors below to address issues:\n" @@ -223,18 +241,33 @@ def process_all_files( df=manifest_df, staging_path=staging_path, filename="data_manifest.csv" ) - load.load( - file_path=manifest_path, - provenance=manifest_df["id"].to_list(), - destination=destination, - syn=syn, - ) + if upload: + load.load( + file_path=manifest_path, + provenance=manifest_df["id"].to_list(), + destination=destination, + syn=syn, + ) app = Typer() - input_path_arg = Argument(..., help="Path to configuration file for processing run") + +platform_arg = Option( + "LOCAL", + "--platform", + "-p", + help="Platform that is running the process. Must be one of LOCAL, GITHUB, or NEXTFLOW.", + show_default=True, +) +upload_opt = Option( + False, + "--upload", + "-u", + help="Toggles whether or not files will be uploaded to Synapse.", + show_default=True, +) synapse_auth_opt = Option( None, "--token", @@ -247,10 +280,15 @@ def process_all_files( @app.command() def process( config_path: str = input_path_arg, + platform: str = platform_arg, + upload: bool = upload_opt, auth_token: str = synapse_auth_opt, ): syn = utils._login_to_synapse(token=auth_token) - process_all_files(syn=syn, config_path=config_path) + platform_enum = Platform(platform) + process_all_files( + syn=syn, config_path=config_path, platform=platform_enum, upload=upload + ) if __name__ == "__main__": diff --git a/tests/test_process.py b/tests/test_process.py index 82da3c9c..ca68ba0f 100644 --- a/tests/test_process.py +++ b/tests/test_process.py @@ -8,6 +8,7 @@ from agoradatatools import process from agoradatatools.errors import ADTDataProcessingError from agoradatatools.etl import extract, load, utils +from agoradatatools.gx import GreatExpectationsRunner STAGING_PATH = "./staging" GX_FOLDER = "test_folder" @@ -53,6 +54,16 @@ class TestProcessDataset: } } + dataset_object_gx_enabled = { + "neuropath_corr": { + "files": [{"name": "test_file_1", "id": "syn1111111", "format": "csv"}], + "final_format": "json", + "provenance": ["syn1111111"], + "destination": "syn1111113", + "gx_enabled": True, + } + } + def setup_method(self): self.patch_get_entity_as_df = patch.object( extract, "get_entity_as_df", return_value=pd.DataFrame @@ -76,6 +87,10 @@ def setup_method(self): self.patch_dict_to_json = patch.object( load, "dict_to_json", return_value="path/to/json" ).start() + self.patch_gx_runner_run = patch.object( + GreatExpectationsRunner, + "run", + ).start() def teardown_method(self): self.patch_get_entity_as_df.stop() @@ -86,6 +101,7 @@ def teardown_method(self): self.patch_load.stop() self.patch_custom_transform.stop() self.patch_dict_to_json.stop() + self.patch_gx_runner_run.stop() def test_process_dataset_with_column_rename(self, syn: Any): process.process_dataset( @@ -93,12 +109,20 @@ def test_process_dataset_with_column_rename(self, syn: Any): staging_path=STAGING_PATH, gx_folder=GX_FOLDER, syn=syn, + upload=True, ) 
self.patch_rename_columns.assert_called_once_with( df=pd.DataFrame, column_map={"col_1": "new_col_1", "col_2": "new_col_2"} ) self.patch_custom_transform.assert_not_called() self.patch_dict_to_json.assert_not_called() + self.patch_gx_runner_run.assert_not_called() + self.patch_load.assert_called_once_with( + file_path=self.patch_dict_to_json.return_value, + provenance=self.dataset_object_col_rename["neuropath_corr"]["provenance"], + destination=self.dataset_object_col_rename["neuropath_corr"]["destination"], + syn=syn, + ) def test_process_dataset_custom_transformations(self, syn: Any): process.process_dataset( @@ -106,6 +130,7 @@ def test_process_dataset_custom_transformations(self, syn: Any): staging_path=STAGING_PATH, gx_folder=GX_FOLDER, syn=syn, + upload=True, ) self.patch_custom_transform.assert_called_once_with( datasets={"test_file_1": pd.DataFrame}, @@ -120,6 +145,17 @@ def test_process_dataset_custom_transformations(self, syn: Any): ) self.patch_rename_columns.assert_not_called() self.patch_dict_to_json.assert_not_called() + self.patch_gx_runner_run.assert_not_called() + self.patch_load.assert_called_once_with( + file_path=self.patch_dict_to_json.return_value, + provenance=self.dataset_object_custom_transform["neuropath_corr"][ + "provenance" + ], + destination=self.dataset_object_custom_transform["neuropath_corr"][ + "destination" + ], + syn=syn, + ) def test_process_dataset_with_agora_rename(self, syn: Any): process.process_dataset( @@ -127,22 +163,31 @@ def test_process_dataset_with_agora_rename(self, syn: Any): staging_path=STAGING_PATH, gx_folder=GX_FOLDER, syn=syn, + upload=True, ) self.patch_rename_columns.assert_called_once_with( df=pd.DataFrame, column_map={"col_1": "new_col_1", "col_2": "new_col_2"} ) self.patch_custom_transform.assert_not_called() self.patch_dict_to_json.assert_not_called() + self.patch_gx_runner_run.assert_not_called() + self.patch_load.assert_called_once_with( + file_path=self.patch_dict_to_json.return_value, + provenance=self.dataset_object_agora_rename["neuropath_corr"]["provenance"], + destination=self.dataset_object_agora_rename["neuropath_corr"][ + "destination" + ], + syn=syn, + ) def test_process_dataset_type_dict(self, syn: Any): - self.patch_standardize_values.return_value = ( - dict() - ) # test if it is a dictionary later + self.patch_standardize_values.return_value = dict() process.process_dataset( dataset_obj=self.dataset_object, staging_path=STAGING_PATH, gx_folder=GX_FOLDER, syn=syn, + upload=True, ) self.patch_dict_to_json.assert_called_once_with( df={}, staging_path=STAGING_PATH, filename="neuropath_corr.json" @@ -150,6 +195,50 @@ def test_process_dataset_type_dict(self, syn: Any): self.patch_rename_columns.assert_not_called() self.patch_custom_transform.assert_not_called() self.patch_df_to_json.assert_not_called() + self.patch_gx_runner_run.assert_not_called() + self.patch_load.assert_called_once_with( + file_path=self.patch_dict_to_json.return_value, + provenance=self.dataset_object["neuropath_corr"]["provenance"], + destination=self.dataset_object["neuropath_corr"]["destination"], + syn=syn, + ) + + def test_process_when_upload_false(self, syn: Any): + process.process_dataset( + dataset_obj=self.dataset_object, + staging_path=STAGING_PATH, + gx_folder=GX_FOLDER, + syn=syn, + upload=False, + ) + self.patch_rename_columns.assert_not_called() + self.patch_custom_transform.assert_not_called() + self.patch_df_to_json.assert_called_once_with( + df=pd.DataFrame, staging_path=STAGING_PATH, filename="neuropath_corr.json" + ) + 
self.patch_gx_runner_run.assert_not_called() + self.patch_load.assert_not_called() + + def test_process_when_gx_is_enabled(self, syn: Any): + process.process_dataset( + dataset_obj=self.dataset_object_gx_enabled, + staging_path=STAGING_PATH, + gx_folder=GX_FOLDER, + syn=syn, + upload=True, + ) + self.patch_rename_columns.assert_not_called() + self.patch_custom_transform.assert_not_called() + self.patch_df_to_json.assert_called_once_with( + df=pd.DataFrame, staging_path=STAGING_PATH, filename="neuropath_corr.json" + ) + self.patch_gx_runner_run.assert_called_once() + self.patch_load.assert_called_once_with( + file_path=self.patch_dict_to_json.return_value, + provenance=self.dataset_object["neuropath_corr"]["provenance"], + destination=self.dataset_object["neuropath_corr"]["destination"], + syn=syn, + ) class TestCreateDataManifest: @@ -230,18 +319,59 @@ def test_process_all_files_full(self, syn: Any): staging_path=STAGING_PATH, gx_folder=GX_FOLDER, syn=syn, + upload=True, + ) + self.patch_process_dataset.assert_any_call( + dataset_obj={"d": {"e": "f"}}, + staging_path=STAGING_PATH, + gx_folder=GX_FOLDER, + syn=syn, + upload=True, + ) + self.patch_process_dataset.assert_any_call( + dataset_obj={"g": {"h": "i"}}, + staging_path=STAGING_PATH, + gx_folder=GX_FOLDER, + syn=syn, + upload=True, + ) + self.patch_create_data_manifest.assert_called_once_with( + parent="destination", syn=syn + ) + self.patch_df_to_csv.assert_called_once_with( + df=self.patch_create_data_manifest.return_value, + staging_path=STAGING_PATH, + filename="data_manifest.csv", + ) + self.patch_load.assert_called_once_with( + file_path=self.patch_df_to_csv.return_value, + provenance=self.patch_create_data_manifest.return_value["id"].tolist(), + destination=self.patch_get_config.return_value["destination"], + syn=syn, + ) + + def test_process_all_files_no_upload(self, syn: Any): + process.process_all_files(syn=syn, config_path=self.CONFIG_PATH, upload=False) + self.patch_process_dataset.assert_any_call( + dataset_obj={"a": {"b": "c"}}, + staging_path=STAGING_PATH, + gx_folder=GX_FOLDER, + syn=syn, + upload=False, ) self.patch_process_dataset.assert_any_call( dataset_obj={"d": {"e": "f"}}, staging_path=STAGING_PATH, gx_folder=GX_FOLDER, syn=syn, + upload=False, ) self.patch_process_dataset.assert_any_call( dataset_obj={"g": {"h": "i"}}, staging_path=STAGING_PATH, gx_folder=GX_FOLDER, syn=syn, + upload=False, ) self.patch_create_data_manifest.assert_called_once_with( parent="destination", syn=syn @@ -251,3 +381,4 @@ def test_process_all_files_full(self, syn: Any): staging_path=STAGING_PATH, filename="data_manifest.csv", ) + self.patch_load.assert_not_called() From e238b0859c130a77adc199999174b28d5e97d9d6 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 29 May 2024 15:28:05 -0600 Subject: [PATCH 3/6] updates CI pipeline --- .github/workflows/dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index ef4dd275..84521d5c 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -68,7 +68,7 @@ jobs: python-version: "3.9" - run: pip install -U setuptools - run: pip install . 
- - run: adt test_config.yaml -t ${{secrets.SYNAPSE_PAT}} + - run: adt test_config.yaml --upload --platform GITHUB --token ${{secrets.SYNAPSE_PAT}} ghcr-publish: needs: [build, test] From 8f8100d6ab3d5bd1fce68affe74fb18645efa632 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 29 May 2024 16:19:06 -0600 Subject: [PATCH 4/6] updates contributor guide --- CONTRIBUTING.md | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 814a7e7e..675ab6d0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,3 @@ - ## Contributing We welcome all contributions! That said, this is a Sage Bionetworks owned project, and we use JIRA ([AG](https://sagebionetworks.jira.com/jira/software/c/projects/AG/boards/91)/[IBCDPE](https://sagebionetworks.jira.com/jira/software/c/projects/IBCDPE/boards/189)) to track any bug/feature requests. This guide will be more focussed on a Sage Bio employee's development workflow. If you are a Sage Bio employee, make sure to assign yourself the JIRA ticket if you decide to work on it. @@ -71,9 +70,21 @@ The agora-data-tools project follows the standard [trunk based development](http 1. Make sure to run the auto python code formatter, black. - ```shell - black ./ - ``` + ```shell + black ./ + ``` + +1. Test your changes by running `agora-data-tools` locally. + +``` +adt test_config.yaml +``` + +If your changes have to do with the way that files are uploaded to Synapse, create a new configuration file by copying `test_config.yaml` and changing the `destination` and `gx_folder` fields to testing locations that you own. The command will change to be: + +``` +adt my_dev_config.yaml --upload +``` 1. Once you have completed all the steps above, create a pull request from the feature branch to the `dev` branch of the Sage-Bionetworks/agora-data-tools repo. From 6efabf4e93af84f3828ec87ce25c17839721ca45 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 29 May 2024 16:24:45 -0600 Subject: [PATCH 5/6] adds gx_enabled note --- CONTRIBUTING.md | 74 +++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 675ab6d0..375910be 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ ## Contributing -We welcome all contributions! That said, this is a Sage Bionetworks owned project, and we use JIRA ([AG](https://sagebionetworks.jira.com/jira/software/c/projects/AG/boards/91)/[IBCDPE](https://sagebionetworks.jira.com/jira/software/c/projects/IBCDPE/boards/189)) to track any bug/feature requests. This guide will be more focussed on a Sage Bio employee's development workflow. If you are a Sage Bio employee, make sure to assign yourself the JIRA ticket if you decide to work on it. +We welcome all contributions! That said, this is a Sage Bionetworks owned project, and we use JIRA ([AG](https://sagebionetworks.jira.com/jira/software/c/projects/AG/boards/91)/[IBCDPE](https://sagebionetworks.jira.com/jira/software/c/projects/IBCDPE/boards/189)) to track any bug/feature requests. This guide will be more focussed on a Sage Bio employee's development workflow. If you are a Sage Bio employee, make sure to assign yourself the JIRA ticket if you decide to work on it. ## Coding Style @@ -23,7 +23,7 @@ The code in this package is also automatically formatted by `black` for consiste ### Install development dependencies -Please follow the [README.md](README.md) to install the package for development purposes. 
Be sure you run this: +Please follow the [README.md](README.md) to install the package for development purposes. Be sure you run this: ``` pipenv install --dev @@ -32,41 +32,42 @@ pipenv install --dev ### Developing at Sage Bio The agora-data-tools project follows the standard [trunk based development](https://trunkbaseddevelopment.com/) development strategy. + > To ensure the most fluid development, do not push to `dev`! 1. Please ask for write permissions to contribute directly to this repository. 1. Make sure you are always creating feature branches from the `dev` branch. We use branches instead of forks, because CI/CD cannot access secrets across Github forks. - ```shell - git checkout dev - git pull - ``` + ```shell + git checkout dev + git pull + ``` 1. Create a feature branch from the `dev` branch. Use the Id of the JIRA issue that you are addressing and name the branch after the issue with some more detail like `{user}/{JIRA}-123/add-some-new-feature`. - ```shell - git checkout dev - git checkout -b tyu/JIRA-123/some-new-feature - ``` + ```shell + git checkout dev + git checkout -b tyu/JIRA-123/some-new-feature + ``` 1. At this point, you have only created the branch locally, you need to push this to your fork on GitHub. - ```shell - git push --set-upstream origin tyu/JIRA-123/some-new-feature - ``` + ```shell + git push --set-upstream origin tyu/JIRA-123/some-new-feature + ``` - You should now be able to see the branch on GitHub. Make commits as you deem necessary. It helps to provide useful commit messages - a commit message saying 'Update' is a lot less helpful than saying 'Remove X parameter because it was unused'. + You should now be able to see the branch on GitHub. Make commits as you deem necessary. It helps to provide useful commit messages - a commit message saying 'Update' is a lot less helpful than saying 'Remove X parameter because it was unused'. - ```shell - git commit changed_file.txt -m "Remove X parameter because it was unused" - git push - ``` + ```shell + git commit changed_file.txt -m "Remove X parameter because it was unused" + git push + ``` -1. Once you have made your additions or changes, make sure you write tests and run the test suite. More information on testing below. +1. Once you have made your additions or changes, make sure you write tests and run the test suite. More information on testing below. - ```shell - pytest -vs tests/ - ``` + ```shell + pytest -vs tests/ + ``` 1. Make sure to run the auto python code formatter, black. @@ -88,9 +89,9 @@ adt my_dev_config.yaml --upload 1. Once you have completed all the steps above, create a pull request from the feature branch to the `dev` branch of the Sage-Bionetworks/agora-data-tools repo. -> *A code maintainer must review and accept your pull request.* Most code reviews can be done asyncronously. For more complex code updates, an "in-person" or zoom code review can happen between the reviewer(s) and contributor. +> _A code maintainer must review and accept your pull request._ Most code reviews can be done asyncronously. For more complex code updates, an "in-person" or zoom code review can happen between the reviewer(s) and contributor. -This package uses [semantic versioning](https://semver.org/) for releasing new versions. A github release should occur at least once a quarter to capture the changes between releases. Currently releases are minted by admins of this repo, but there is no formal process of when releases are minted except for more freqeunt releases leading to smaller changelogs. 
+This package uses [semantic versioning](https://semver.org/) for releasing new versions. A github release should occur at least once a quarter to capture the changes between releases. Currently releases are minted by admins of this repo, but there is no formal process of when releases are minted except for more freqeunt releases leading to smaller changelogs. @@ -145,28 +146,29 @@ Follow gitflow best practices as linked above. ### Transforms -This package has a `src/agoradatatools/etl/transform` submodule. This folder houses all the individual transform modules required for the package. Here are the steps to add more transforms: +This package has a `src/agoradatatools/etl/transform` submodule. This folder houses all the individual transform modules required for the package. Here are the steps to add more transforms: -1. Create new script in the transform submodule that matches the dataset name and name the function `transform_...`. For example, if you have a dataset named `genome_variants`, your new script would be `src/agoradatatools/etl/transform/transform_genome_variants.py`. +1. Create new script in the transform submodule that matches the dataset name and name the function `transform_...`. For example, if you have a dataset named `genome_variants`, your new script would be `src/agoradatatools/etl/transform/transform_genome_variants.py`. 1. Register the new transform function in `src/agoradatatools/etl/transform/__init__.py`. Look in that file for examples. 1. Modify the `apply_custom_transformations` in `src/agoradatatools/process.py` to include your new transform. 1. Write a test for the transform: - - For transform tests, we are using a [Data-Driven Testing](https://www.develer.com/en/blog/data-driven-testing-with-python/) strategy - - To contribute new tests, assets in the form of input and output data files are needed. - - The input file is loaded to serve as the data fed into the transform function, while the output file is loaded in to check the function output against. - - These tests should include multiple ways of evaluating the transform function, including one test that should pass (good input data) and at least one that should fail (bad input data). - - For some functions, it may be appropriate to include multiple passing datasets (e.g. for functions that are written to handle imperfect data) and/or multiple failing datasets (e.g. for transforms operating on datasets that can be unclean in multiple distinct ways). - - Each transform function should have its own folder in `test_assets` to hold its input and output data files. Inputs should be in CSV form and outputs in JSON form. - - Use `pytest.mark.parameterize` to loop through multiple datasets in a single test. - - The class `TestTransformGenesBiodomains` can be used as an example for future tests contibuted. + - For transform tests, we are using a [Data-Driven Testing](https://www.develer.com/en/blog/data-driven-testing-with-python/) strategy + - To contribute new tests, assets in the form of input and output data files are needed. + - The input file is loaded to serve as the data fed into the transform function, while the output file is loaded in to check the function output against. + - These tests should include multiple ways of evaluating the transform function, including one test that should pass (good input data) and at least one that should fail (bad input data). + - For some functions, it may be appropriate to include multiple passing datasets (e.g. 
for functions that are written to handle imperfect data) and/or multiple failing datasets (e.g. for transforms operating on datasets that can be unclean in multiple distinct ways). + - Each transform function should have its own folder in `test_assets` to hold its input and output data files. Inputs should be in CSV form and outputs in JSON form. + - Use `pytest.mark.parameterize` to loop through multiple datasets in a single test. + - The class `TestTransformGenesBiodomains` can be used as an example for future tests contibuted. ### Great Expectations -This package uses [Great Expectations](https://greatexpectations.io/) to validate output data. The `src/agoradatatools/great_expectations` folder houses our file system data context and Great Expectations-specific configuration files. Eventually, our goal is for each `agora-data-tools` dataset to be convered by an expectation suite. To add data validation for more datasets, follow these steps: +This package uses [Great Expectations](https://greatexpectations.io/) to validate output data. The `src/agoradatatools/great_expectations` folder houses our file system data context and Great Expectations-specific configuration files. Eventually, our goal is for each `agora-data-tools` dataset to be convered by an expectation suite. To add data validation for more datasets, follow these steps: 1. Create a new expectation suite by defining the expectations for the dataset in a Jupyter Notebook inside the `gx_suite_definitions` folder. Use `metabolomics.ipynb` as an example. You can find a catalog of existing expectations [here](https://greatexpectations.io/expectations/). 1. Run the notebook to generate the new expectation suite. It should populate as a JSON file in the `/great_expectations/expectations` folder. -1. Add support for running Great Expectations on a dataset by adding `gx_enabled: true` to the configuration for the datatset in both `test_config.yaml` and `config.yaml`. After updating the config files reports should be uploaded in the proper locations ([Prod](https://www.synapse.org/#!Synapse:syn52948668), [Testing](https://www.synapse.org/#!Synapse:syn52948670)) when data processing is complete. +1. Add support for running Great Expectations on a dataset by adding `gx_enabled: true` to the configuration for the datatset in both `test_config.yaml` and `config.yaml`. After updating the config files reports should be uploaded in the proper locations ([Prod](https://www.synapse.org/#!Synapse:syn52948668), [Testing](https://www.synapse.org/#!Synapse:syn52948670)) when data processing is complete. + - You can prevent Great Expectations from running for a dataset by removing the `gx_enabled: true` from the configuration for the dataset. 1. Test data processing by running `adt test_config.yaml` and ensure that HTML reports with all expectations are generated and uploaded to the proper folder in Synapse. 
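For orientation, this is roughly what a `gx_enabled` dataset entry looks like by the time it reaches `process_dataset` — a minimal sketch mirroring the `dataset_object_gx_enabled` fixture added to `tests/test_process.py` in this series, where the file names and Synapse IDs are test placeholders rather than real datasets:

```python
# Sketch of a dataset entry with Great Expectations validation enabled.
# All names and Synapse IDs below are placeholders taken from the test fixture.
dataset_obj = {
    "neuropath_corr": {
        "files": [{"name": "test_file_1", "id": "syn1111111", "format": "csv"}],
        "final_format": "json",
        "provenance": ["syn1111111"],
        "destination": "syn1111113",
        # Per the new tests, GreatExpectationsRunner.run() is only invoked
        # when this flag is present and set to True.
        "gx_enabled": True,
    }
}
```

When processing runs without `--upload`, `process_dataset` passes `upload_folder=None` to the runner, so validation still executes but the HTML report is not pushed to Synapse.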
#### Custom Expectations

From 9cd4bc4cc4b23e894b45428451687a84456be5a8 Mon Sep 17 00:00:00 2001
From: bwmac
Date: Wed, 29 May 2024 17:24:59 -0600
Subject: [PATCH 6/6] updates docstring

---
 src/agoradatatools/process.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py
index d2c855de..c7dd49dd 100644
--- a/src/agoradatatools/process.py
+++ b/src/agoradatatools/process.py
@@ -71,7 +71,7 @@ def process_dataset(
     gx_folder: str,
     syn: synapseclient.Synapse,
     upload: bool = True,
-) -> tuple:
+) -> None:
     """Takes in a dataset from the configuration file and passes it through the ETL process

     Args:
@@ -80,9 +80,6 @@ def process_dataset(
         gx_folder (str): Synapse ID of the folder where Great Expectations reports should be uploaded
         syn (synapseclient.Synapse): synapseclient.Synapse session.
         upload (bool, optional): Whether or not to upload the data to Synapse. Defaults to True.
-
-    Returns:
-        syn_obj (tuple): Tuple containing the id and version number of the uploaded file.
     """

     dataset_name = list(dataset_obj.keys())[0]
@@ -254,7 +251,7 @@ def process_all_files(

 input_path_arg = Argument(..., help="Path to configuration file for processing run")

-platform_arg = Option(
+platform_opt = Option(
     "LOCAL",
     "--platform",
     "-p",
     help="Platform that is running the process. Must be one of LOCAL, GITHUB, or NEXTFLOW.",
     show_default=True,
 )
@@ -280,7 +277,7 @@ def process_all_files(
 @app.command()
 def process(
     config_path: str = input_path_arg,
-    platform: str = platform_arg,
+    platform: str = platform_opt,
     upload: bool = upload_opt,
     auth_token: str = synapse_auth_opt,
 ):
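Taken together, the series is exercised in CI through the new flags (`adt test_config.yaml --upload --platform GITHUB --token …`, per the updated workflow). As a rough sketch of the equivalent programmatic call for a local dry run — the token and config path below are placeholders, and the snippet assumes the package is installed as described in the contributing guide:

```python
from agoradatatools import process
from agoradatatools.etl import utils

# Placeholder credentials/config -- substitute a real Synapse PAT and config file.
syn = utils._login_to_synapse(token="<synapse-pat>")

# Platform.LOCAL with upload=False avoids writing anything back to Synapse:
# process_dataset passes upload_folder=None to the Great Expectations runner
# and skips the final load.load() calls, so transformed files and the data
# manifest remain in the local staging folder.
process.process_all_files(
    syn=syn,
    config_path="test_config.yaml",
    platform=process.Platform.LOCAL,
    upload=False,
)
```

This mirrors what the `process` CLI command now does internally: it converts the `--platform` string into the `Platform` enum before delegating to `process_all_files`.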