diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1bd8d7c9..814a7e7e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -151,11 +151,11 @@ This package has a `src/agoradatatools/etl/transform` submodule. This folder ho
 
 ### Great Expectations
 
-This package uses [Great Expectations](https://greatexpectations.io/) to validate output data. The `src/agoradatatools/great_expectations` folder houses our file system data context and Great Expectations-specific configuration files. Eventually, our goal is for each `agora-data-tools` output dataset to be convered by an expectation suite. To add data validation for more datasets, follow these steps:
+This package uses [Great Expectations](https://greatexpectations.io/) to validate output data. The `src/agoradatatools/great_expectations` folder houses our file system data context and Great Expectations-specific configuration files. Eventually, our goal is for each `agora-data-tools` dataset to be covered by an expectation suite. To add data validation for more datasets, follow these steps:
 
-1. Create a new expectation suite by defining the expectations for the new dataset in a Jupyter Notebook inside the `gx_suite_definitions` folder. Use `metabolomics.ipynb` as an example. You can find a catalog of existing expectations [here](https://greatexpectations.io/expectations/).
+1. Create a new expectation suite by defining the expectations for the dataset in a Jupyter Notebook inside the `gx_suite_definitions` folder. Use `metabolomics.ipynb` as an example. You can find a catalog of existing expectations [here](https://greatexpectations.io/expectations/).
 1. Run the notebook to generate the new expectation suite. It should populate as a JSON file in the `/great_expectations/expectations` folder.
-1. Add support for running Great Expectations on a dataset by adding the `gx_folder` key to the configuration for the datatset in both `test_config.yaml` and `config.yaml`. The `gx_folder` should be the Synapse ID pointing of a folder where generated HTML reports from Great Expectations for that dataset should be uploaded. If a folder specific to your dataset does not yet exist in the proper locations ([Prod](https://www.synapse.org/#!Synapse:syn52948668), [Testing](https://www.synapse.org/#!Synapse:syn52948670)), create folders with the same name as the dataset itself and copy the new folders' Synapse IDs to the config files.
+1. Add support for running Great Expectations on a dataset by adding `gx_enabled: true` to the configuration for the dataset in both `test_config.yaml` and `config.yaml`. After updating the config files, reports should be uploaded to the proper locations ([Prod](https://www.synapse.org/#!Synapse:syn52948668), [Testing](https://www.synapse.org/#!Synapse:syn52948670)) when data processing is complete.
 1. Test data processing by running `adt test_config.yaml` and ensure that HTML reports with all expectations are generated and uploaded to the proper folder in Synapse.
 
 #### Custom Expectations
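With this PR, turning on validation for a dataset becomes a one-line config change rather than a per-dataset folder setup. A minimal sketch of a dataset entry under the new scheme, modeled on the `metabolomics` entry in `config.yaml` below (the `files` block is elided here and the exact indentation is illustrative):

```yaml
- metabolomics:
    files:
      # ... source file definitions, unchanged by this PR ...
    provenance:
      - syn26064497.1
    destination: *dest
    gx_enabled: true  # replaces the old per-dataset `gx_folder: <Synapse ID>` key
```

Reports for every dataset now land in the single folder named by the top-level `gx_folder` key.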
diff --git a/README.md b/README.md
index a5321fc6..ad34b762 100644
--- a/README.md
+++ b/README.md
@@ -122,6 +122,8 @@ python -m pytest
 
 ## Config Parameters:
 - `destination`: Defines the default target location (folder) that the generated json files are written to; this value can be overridden on a per-dataset basis
+- `staging_path`: Defines the location of the staging folder that the generated json files are written to
+- `gx_folder`: Defines the Synapse ID of the folder that generated GX reports are written to
 - `datasets/<dataset_name>`: Each generated json file is named `<dataset_name>.json`
 - `datasets/<dataset_name>/files`: A list of source files for the dataset
   - `name`: The name of the source file (this name is the reference the code will use to retrieve a file from the configuration)
diff --git a/config.yaml b/config.yaml
index 05e7b15c..72943eb2 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,5 +1,6 @@
 destination: &dest syn12177492
 staging_path: ./staging
+gx_folder: syn52948668
 sources:
   - genes_biodomains:
       genes_biodomains_files: &genes_biodomains_files
@@ -62,7 +63,7 @@ datasets:
         ensembl_id: ensembl_gene_id
         goterm_name: go_terms
       destination: *dest
-      gx_folder: syn53127958
+      gx_enabled: true
       gx_nested_columns:
         - gene_biodomains
 
@@ -81,7 +82,7 @@
         ensembl_gene_id: ensg
         hgnc_gene_id: gname
       destination: *dest
-      gx_folder: syn53461513
+      gx_enabled: true
 
   - proteomics:
       files: *agora_proteomics_files
@@ -100,7 +101,7 @@
         genename: hgnc_symbol
         ensg: ensembl_gene_id
       destination: *dest
-      gx_folder: syn53469660
+      gx_enabled: true
 
   - proteomics_srm:
       files: *agora_proteomics_srm_files
@@ -120,7 +121,7 @@
       provenance:
         - syn24184512.9
       destination: *dest
-      gx_folder: syn53710839
+      gx_enabled: true
 
   - metabolomics:
       files:
@@ -131,7 +132,7 @@
       provenance:
         - syn26064497.1
       destination: *dest
-      gx_folder: syn52948669
+      gx_enabled: true
 
   - gene_info:
       files:
@@ -209,7 +210,7 @@
         - syn12615624.18
         - syn12615633.18
       destination: *dest
-      gx_folder: syn53616579
+      gx_enabled: true
       gx_nested_columns:
         - members
 
@@ -227,7 +228,7 @@
         overall: target_risk_score
         omicsscore: multi_omics_score
       destination: *dest
-      gx_folder: syn53453229
+      gx_enabled: true
 
   - network:
       files:
@@ -286,4 +287,4 @@
         - *agora_proteomics_tmt_provenance
         - *agora_proteomics_srm_provenance
       destination: *dest
-      gx_folder: syn53463345
+      gx_enabled: true
diff --git a/src/agoradatatools/gx.py b/src/agoradatatools/gx.py
index 925ebc58..565bcb98 100644
--- a/src/agoradatatools/gx.py
+++ b/src/agoradatatools/gx.py
@@ -85,9 +85,9 @@ def _get_results_path(self, checkpoint_result: CheckpointResult) -> str:
             *original_results_path_items,
         )
-        timestamp_file_name = original_results_path_items[-2] + ".html"
+        expectation_suite_name = self.expectation_suite_name + ".html"
         new_results_path_items = original_results_path_items
-        new_results_path_items[-1] = timestamp_file_name
+        new_results_path_items[-1] = expectation_suite_name
         new_results_path = os.path.join(
             self.validations_path,
             *new_results_path_items,
         )
@@ -107,6 +107,7 @@ def _upload_results_file_to_synapse(self, results_path: str) -> None:
                 name=f"Great Expectations {self.expectation_suite_name} results",
                 executed="https://github.com/Sage-Bionetworks/agora-data-tools",
             ),
+            forceVersion=True,
         )
 
     @staticmethod
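The `_get_results_path` change above stops naming the uploaded report after the timestamped run directory (`original_results_path_items[-2]`) and names it after the expectation suite instead, so every run targets one stable file per dataset; the accompanying `forceVersion=True` asks Synapse to bump the file's version on each upload. A standalone sketch of the rename, with hypothetical path segments standing in for a real `CheckpointResult`:

```python
import os

# Hypothetical stand-ins for values GreatExpectationsRunner derives from a
# CheckpointResult: a validations root plus the nested result path segments.
validations_path = "great_expectations/uncommitted/data_docs/validations"
original_results_path_items = ["metabolomics", "20240101-120000", "run.html"]
expectation_suite_name = "metabolomics"

# Old behavior: filename derived from the timestamped directory name.
old_file_name = original_results_path_items[-2] + ".html"
print(old_file_name)  # 20240101-120000.html

# New behavior: filename derived from the suite name, a stable upload target.
new_results_path_items = list(original_results_path_items)
new_results_path_items[-1] = expectation_suite_name + ".html"
print(os.path.join(validations_path, *new_results_path_items))
# .../validations/metabolomics/20240101-120000/metabolomics.html
```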
diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py
index ef0dca71..29c256d5 100644
--- a/src/agoradatatools/process.py
+++ b/src/agoradatatools/process.py
@@ -60,6 +60,7 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj:
 def process_dataset(
     dataset_obj: dict,
     staging_path: str,
+    gx_folder: str,
     syn: synapseclient.Synapse,
 ) -> tuple:
     """Takes in a dataset from the configuration file and passes it through the ETL process
@@ -67,6 +68,7 @@ def process_dataset(
     Args:
         dataset_obj (dict): A dataset defined in the configuration file
         staging_path (str): Staging path
+        gx_folder (str): Synapse ID of the folder where Great Expectations reports should be uploaded
         syn (synapseclient.Synapse): synapseclient.Synapse session.
 
     Returns:
@@ -121,12 +123,12 @@ def process_dataset(
     )
     # run great expectations on dataset if expectation suite exists
-    if "gx_folder" in dataset_obj[dataset_name].keys():
+    if "gx_enabled" in dataset_obj[dataset_name].keys():
         gx_runner = GreatExpectationsRunner(
             syn=syn,
             dataset_path=json_path,
             dataset_name=dataset_name,
-            upload_folder=dataset_obj[dataset_name]["gx_folder"],
+            upload_folder=gx_folder,
             nested_columns=(
                 dataset_obj[dataset_name]["gx_nested_columns"]
                 if "gx_nested_columns" in dataset_obj[dataset_name].keys()
                 else None
@@ -200,6 +202,7 @@
             process_dataset(
                 dataset_obj=dataset,
                 staging_path=staging_path,
+                gx_folder=config["gx_folder"],
                 syn=syn,
             )
         except Exception as e:
diff --git a/test_config.yaml b/test_config.yaml
index 8ee423bf..05eac300 100644
--- a/test_config.yaml
+++ b/test_config.yaml
@@ -1,5 +1,6 @@
 destination: &dest syn17015333
 staging_path: ./staging
+gx_folder: syn52948670
 sources:
   - genes_biodomains:
       genes_biodomains_files: &genes_biodomains_files
@@ -62,7 +63,7 @@ datasets:
         ensembl_id: ensembl_gene_id
         goterm_name: go_terms
       destination: *dest
-      gx_folder: syn53127956
+      gx_enabled: true
       gx_nested_columns:
         - gene_biodomains
 
@@ -81,7 +82,7 @@
         ensembl_gene_id: ensg
         hgnc_gene_id: gname
       destination: *dest
-      gx_folder: syn53461511
+      gx_enabled: true
 
   - proteomics:
       files: *agora_proteomics_files
@@ -100,7 +101,7 @@
         genename: hgnc_symbol
         ensg: ensembl_gene_id
       destination: *dest
-      gx_folder: syn53469659
+      gx_enabled: true
 
   - proteomics_srm:
       files: *agora_proteomics_srm_files
@@ -120,7 +121,7 @@
       provenance:
         - syn24184512.9
       destination: *dest
-      gx_folder: syn53710838
+      gx_enabled: true
 
   - metabolomics:
       files:
@@ -131,7 +132,7 @@
       provenance:
         - syn26064497.1
       destination: *dest
-      gx_folder: syn52948671
+      gx_enabled: true
 
   - gene_info:
       files:
@@ -209,7 +210,7 @@
         - syn12615624.18
         - syn12615633.18
       destination: *dest
-      gx_folder: syn53616774
+      gx_enabled: true
       gx_nested_columns:
         - members
 
@@ -227,7 +228,7 @@
         overall: target_risk_score
         omicsscore: multi_omics_score
       destination: *dest
-      gx_folder: syn53453225
+      gx_enabled: true
 
   - network:
       files:
@@ -286,4 +287,4 @@
         - *agora_proteomics_tmt_provenance
         - *agora_proteomics_srm_provenance
       destination: *dest
-      gx_folder: syn53463344
+      gx_enabled: true
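The `gx_folder` and `gx_enabled` changes meet in `process.py` above: the upload folder now arrives as an argument read once from the top-level config, and validation is gated on the presence of the `gx_enabled` key. A condensed, runnable sketch of that flow (the real code constructs a `GreatExpectationsRunner` where this sketch just prints):

```python
# Condensed from process_dataset()/process_all_files(); key names match the
# diff, but the GreatExpectationsRunner construction is reduced to a print.
def run_gx_if_enabled(dataset_obj: dict, dataset_name: str, gx_folder: str) -> None:
    dataset_config = dataset_obj[dataset_name]
    # Gate on the new boolean flag instead of the old per-dataset "gx_folder" key.
    if "gx_enabled" in dataset_config.keys():
        nested_columns = dataset_config.get("gx_nested_columns")  # still optional
        print(f"validating {dataset_name}; reports -> {gx_folder}; nested: {nested_columns}")


config = {
    "gx_folder": "syn52948668",  # single top-level key introduced by this PR
    "datasets": [
        {"metabolomics": {"gx_enabled": True}},
        {"neuropath_corr": {}},  # no flag, so validation is skipped
    ],
}
for dataset in config["datasets"]:
    run_gx_if_enabled(dataset, next(iter(dataset)), config["gx_folder"])
```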
diff --git a/tests/test_gx.py b/tests/test_gx.py
index 5c596174..c9a8b176 100644
--- a/tests/test_gx.py
+++ b/tests/test_gx.py
@@ -80,7 +80,10 @@ def test_check_if_expectation_suite_exists_returns_true_when_the_expectation_sui
         assert self.good_runner._check_if_expectation_suite_exists() is True
 
     def test_get_results_path(self):
-        expected = self.good_runner.validations_path + "/test/path/to/to.html"
+        expected = (
+            self.good_runner.validations_path
+            + f"/test/path/to/{self.good_runner.expectation_suite_name}.html"
+        )
         mocked_checkpoint_result = mock.create_autospec(CheckpointResult)
         mocked_validation_result_identifier = mock.create_autospec(
             ValidationResultIdentifier(
@@ -103,7 +106,8 @@ def test_get_results_path(self):
         patch_list_validation_result_identifiers.assert_called_once()
         patch_copy.assert_called_once_with(
             self.good_runner.validations_path + "/test/path/to/file.html",
-            self.good_runner.validations_path + "/test/path/to/to.html",
+            self.good_runner.validations_path
+            + f"/test/path/to/{self.good_runner.expectation_suite_name}.html",
         )
         assert result == expected
 
@@ -116,6 +120,7 @@ def test_upload_results_file_to_synapse(self):
                 name=f"Great Expectations {self.good_runner.expectation_suite_name} results",
                 executed="https://github.com/Sage-Bionetworks/agora-data-tools",
             ),
+            forceVersion=True,
         )
 
     def test_that_convert_nested_columns_to_json_converts_nested_columns_to_json(self):
diff --git a/tests/test_process.py b/tests/test_process.py
index b4bf4471..82da3c9c 100644
--- a/tests/test_process.py
+++ b/tests/test_process.py
@@ -9,6 +9,9 @@
 from agoradatatools.errors import ADTDataProcessingError
 from agoradatatools.etl import extract, load, utils
 
+STAGING_PATH = "./staging"
+GX_FOLDER = "test_folder"
+
 
 class TestProcessDataset:
     dataset_object = {
@@ -87,7 +90,8 @@ def teardown_method(self):
     def test_process_dataset_with_column_rename(self, syn: Any):
         process.process_dataset(
             dataset_obj=self.dataset_object_col_rename,
-            staging_path="./staging",
+            staging_path=STAGING_PATH,
+            gx_folder=GX_FOLDER,
             syn=syn,
         )
         self.patch_rename_columns.assert_called_once_with(
@@ -99,7 +103,8 @@
     def test_process_dataset_custom_transformations(self, syn: Any):
         process.process_dataset(
             dataset_obj=self.dataset_object_custom_transform,
-            staging_path="./staging",
+            staging_path=STAGING_PATH,
+            gx_folder=GX_FOLDER,
             syn=syn,
         )
         self.patch_custom_transform.assert_called_once_with(
@@ -119,7 +124,8 @@ def test_process_dataset_custom_transformations(self, syn: Any):
     def test_process_dataset_with_agora_rename(self, syn: Any):
         process.process_dataset(
             dataset_obj=self.dataset_object_col_rename,
-            staging_path="./staging",
+            staging_path=STAGING_PATH,
+            gx_folder=GX_FOLDER,
             syn=syn,
         )
         self.patch_rename_columns.assert_called_once_with(
@@ -133,10 +139,13 @@ def test_process_dataset_type_dict(self, syn: Any):
             dict()
         )  # test if it is a dictionary later
         process.process_dataset(
-            dataset_obj=self.dataset_object, staging_path="./staging", syn=syn
+            dataset_obj=self.dataset_object,
+            staging_path=STAGING_PATH,
+            gx_folder=GX_FOLDER,
+            syn=syn,
         )
         self.patch_dict_to_json.assert_called_once_with(
-            df={}, staging_path="./staging", filename="neuropath_corr.json"
+            df={}, staging_path=STAGING_PATH, filename="neuropath_corr.json"
         )
         self.patch_rename_columns.assert_not_called()
         self.patch_custom_transform.assert_not_called()
@@ -168,6 +177,8 @@ def test_create_data_manifest_no_none(self, syn: Any):
 
 
 class TestProcessAllFiles:
+    CONFIG_PATH = "./path/to/config"
+
     @pytest.fixture(scope="function", autouse=True)
     def setup_method(self):
         self.patch_get_config = patch.object(
@@ -175,6 +186,7 @@ def setup_method(self):
             "_get_config",
             return_value={
                 "destination": "destination",
+                "gx_folder": GX_FOLDER,
                 "datasets": [{"a": {"b": "c"}}, {"d": {"e": "f"}}, {"g": {"h": "i"}}],
             },
         ).start()
@@ -198,8 +210,8 @@ def teardown_method(self):
         mock.patch.stopall()
 
     def test_process_all_files_config_path(self, syn: Any):
-        process.process_all_files(syn=syn, config_path="path/to/config")
-        self.patch_get_config.assert_called_once_with(config_path="path/to/config")
+        process.process_all_files(syn=syn, config_path=self.CONFIG_PATH)
+        self.patch_get_config.assert_called_once_with(config_path=self.CONFIG_PATH)
 
     def test_process_all_files_no_config_path(self, syn: Any):
         process.process_all_files(syn=syn, config_path=None)
@@ -208,25 +220,34 @@
     def test_process_all_files_process_dataset_fails(self, syn: Any):
         with pytest.raises(ADTDataProcessingError):
             self.patch_process_dataset.side_effect = Exception
-            process.process_all_files(syn=syn, config_path="path/to/config")
+            process.process_all_files(syn=syn, config_path=self.CONFIG_PATH)
             self.patch_create_data_manifest.assert_not_called()
 
     def test_process_all_files_full(self, syn: Any):
         process.process_all_files(syn=syn, config_path=None)
         self.patch_process_dataset.assert_any_call(
-            dataset_obj={"a": {"b": "c"}}, staging_path="./staging", syn=syn
+            dataset_obj={"a": {"b": "c"}},
+            staging_path=STAGING_PATH,
+            gx_folder=GX_FOLDER,
+            syn=syn,
         )
         self.patch_process_dataset.assert_any_call(
-            dataset_obj={"d": {"e": "f"}}, staging_path="./staging", syn=syn
+            dataset_obj={"d": {"e": "f"}},
+            staging_path=STAGING_PATH,
+            gx_folder=GX_FOLDER,
+            syn=syn,
         )
         self.patch_process_dataset.assert_any_call(
-            dataset_obj={"g": {"h": "i"}}, staging_path="./staging", syn=syn
+            dataset_obj={"g": {"h": "i"}},
+            staging_path=STAGING_PATH,
+            gx_folder=GX_FOLDER,
+            syn=syn,
        )
         self.patch_create_data_manifest.assert_called_once_with(
             parent="destination", syn=syn
         )
         self.patch_df_to_csv.assert_called_once_with(
             df=self.patch_create_data_manifest.return_value,
-            staging_path="./staging",
+            staging_path=STAGING_PATH,
             filename="data_manifest.csv",
         )
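With the shared `STAGING_PATH`/`GX_FOLDER` constants and the new `gx_folder` argument threaded through both test classes, the affected suites can be exercised directly; per the README, the tests run under pytest:

```
python -m pytest tests/test_gx.py tests/test_process.py
```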