[IBCDPE-835] Revamps GX Report Uploads #130

Merged · 7 commits · Mar 18, 2024
6 changes: 3 additions & 3 deletions CONTRIBUTING.md
@@ -151,11 +151,11 @@ This package has a `src/agoradatatools/etl/transform` submodule. This folder ho

### Great Expectations

This package uses [Great Expectations](https://greatexpectations.io/) to validate output data. The `src/agoradatatools/great_expectations` folder houses our file system data context and Great Expectations-specific configuration files. Eventually, our goal is for each `agora-data-tools` output dataset to be covered by an expectation suite. To add data validation for more datasets, follow these steps:
This package uses [Great Expectations](https://greatexpectations.io/) to validate output data. The `src/agoradatatools/great_expectations` folder houses our file system data context and Great Expectations-specific configuration files. Eventually, our goal is for each `agora-data-tools` dataset to be covered by an expectation suite. To add data validation for more datasets, follow these steps:

1. Create a new expectation suite by defining the expectations for the new dataset in a Jupyter Notebook inside the `gx_suite_definitions` folder. Use `metabolomics.ipynb` as an example. You can find a catalog of existing expectations [here](https://greatexpectations.io/expectations/).
1. Create a new expectation suite by defining the expectations for the dataset in a Jupyter Notebook inside the `gx_suite_definitions` folder. Use `metabolomics.ipynb` as an example. You can find a catalog of existing expectations [here](https://greatexpectations.io/expectations/).
1. Run the notebook to generate the new expectation suite. It should populate as a JSON file in the `/great_expectations/expectations` folder.
1. Add support for running Great Expectations on a dataset by adding the `gx_folder` key to the configuration for the dataset in both `test_config.yaml` and `config.yaml`. The `gx_folder` should be the Synapse ID of the folder where generated HTML reports from Great Expectations for that dataset should be uploaded. If a folder specific to your dataset does not yet exist in the proper locations ([Prod](https://www.synapse.org/#!Synapse:syn52948668), [Testing](https://www.synapse.org/#!Synapse:syn52948670)), create folders with the same name as the dataset itself and copy the new folders' Synapse IDs to the config files.
1. Add support for running Great Expectations on a dataset by adding `gx_enabled: true` to the configuration for the dataset in both `test_config.yaml` and `config.yaml`. After updating the config files, reports will be uploaded to the proper locations ([Prod](https://www.synapse.org/#!Synapse:syn52948668), [Testing](https://www.synapse.org/#!Synapse:syn52948670)) when data processing is complete, as sketched below.
1. Test data processing by running `adt test_config.yaml` and ensure that HTML reports with all expectations are generated and uploaded to the proper folder in Synapse.
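
For illustration, here is a minimal sketch of how the `gx_enabled` and `gx_nested_columns` keys gate validation during processing. It mirrors the `process_dataset` changes in this PR, but the `build_gx_runner` helper is hypothetical (the real code constructs and runs `GreatExpectationsRunner` inline) and the import paths are assumed from the package layout.

```python
from typing import Optional

import synapseclient

from agoradatatools.gx import GreatExpectationsRunner


def build_gx_runner(
    dataset_obj: dict,
    dataset_name: str,
    json_path: str,
    gx_folder: str,
    syn: synapseclient.Synapse,
) -> Optional[GreatExpectationsRunner]:
    """Return a runner for the dataset, or None when validation is not enabled."""
    dataset_config = dataset_obj[dataset_name]
    if "gx_enabled" not in dataset_config:
        return None  # no expectation suite configured for this dataset; skip validation

    return GreatExpectationsRunner(
        syn=syn,
        dataset_path=json_path,  # staged <dataset>.json produced earlier in the ETL run
        dataset_name=dataset_name,
        upload_folder=gx_folder,  # the single top-level `gx_folder` Synapse ID from the config
        nested_columns=dataset_config.get("gx_nested_columns"),  # e.g. gene_biodomains, members
    )
```

The generated HTML report is then uploaded to that folder under the expectation suite's name, which is the renaming behavior introduced in `gx.py` below.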

#### Custom Expectations
2 changes: 2 additions & 0 deletions README.md
@@ -122,6 +122,8 @@ python -m pytest
## Config
Parameters:
- `destination`: Defines the default target location (folder) that the generated json files are written to; this value can be overridden on a per-dataset basis
- `staging_path`: Defines the location of the staging folder that the generated json files are written to
- `gx_folder`: Defines the Synapse ID of the folder that generated GX reports are uploaded to (consumed as sketched below)
- `datasets/<dataset>`: Each generated json file is named `<dataset>.json`
- `datasets/<dataset>/files`: A list of source files for the dataset
- `name`: The name of the source file (this name is the reference the code will use to retrieve a file from the configuration)
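To make these top-level keys concrete, here is a minimal sketch of how they are consumed when all datasets are processed. It paraphrases the `process_all_files` changes in this PR; the `run_all` wrapper is hypothetical, and the `staging_path` default is an assumption (the tests simply use `./staging`).

```python
import synapseclient

from agoradatatools import process
from agoradatatools.etl import utils


def run_all(syn: synapseclient.Synapse, config_path: str = "config.yaml") -> None:
    """Illustrative only; the package's real entry point is process.process_all_files."""
    config = utils._get_config(config_path=config_path)

    # One top-level `gx_folder` now receives every dataset's GX report,
    # replacing the per-dataset `gx_folder` keys removed in this PR.
    gx_folder = config["gx_folder"]
    staging_path = config.get("staging_path", "./staging")  # assumed default

    for dataset in config["datasets"]:
        process.process_dataset(
            dataset_obj=dataset,
            staging_path=staging_path,
            gx_folder=gx_folder,
            syn=syn,
        )
```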
17 changes: 9 additions & 8 deletions config.yaml
@@ -1,5 +1,6 @@
destination: &dest syn12177492
staging_path: ./staging
gx_folder: syn52948668
sources:
- genes_biodomains:
genes_biodomains_files: &genes_biodomains_files
@@ -62,7 +63,7 @@ datasets:
ensembl_id: ensembl_gene_id
goterm_name: go_terms
destination: *dest
gx_folder: syn53127958
gx_enabled: true
gx_nested_columns:
- gene_biodomains

@@ -81,7 +82,7 @@ datasets:
ensembl_gene_id: ensg
hgnc_gene_id: gname
destination: *dest
gx_folder: syn53461513
gx_enabled: true

- proteomics:
files: *agora_proteomics_files
@@ -100,7 +101,7 @@ datasets:
genename: hgnc_symbol
ensg: ensembl_gene_id
destination: *dest
gx_folder: syn53469660
gx_enabled: true

- proteomics_srm:
files: *agora_proteomics_srm_files
@@ -120,7 +121,7 @@ datasets:
provenance:
- syn24184512.9
destination: *dest
gx_folder: syn53710839
gx_enabled: true

- metabolomics:
files:
@@ -131,7 +132,7 @@ datasets:
provenance:
- syn26064497.1
destination: *dest
gx_folder: syn52948669
gx_enabled: true

- gene_info:
files:
@@ -209,7 +210,7 @@ datasets:
- syn12615624.18
- syn12615633.18
destination: *dest
gx_folder: syn53616579
gx_enabled: true
gx_nested_columns:
- members

@@ -227,7 +228,7 @@ datasets:
overall: target_risk_score
omicsscore: multi_omics_score
destination: *dest
gx_folder: syn53453229
gx_enabled: true

- network:
files:
@@ -286,4 +287,4 @@ datasets:
- *agora_proteomics_tmt_provenance
- *agora_proteomics_srm_provenance
destination: *dest
gx_folder: syn53463345
gx_enabled: true
5 changes: 3 additions & 2 deletions src/agoradatatools/gx.py
@@ -85,9 +85,9 @@ def _get_results_path(self, checkpoint_result: CheckpointResult) -> str:
*original_results_path_items,
)

timestamp_file_name = original_results_path_items[-2] + ".html"
expectation_suite_name = self.expectation_suite_name + ".html"
new_results_path_items = original_results_path_items
new_results_path_items[-1] = timestamp_file_name
new_results_path_items[-1] = expectation_suite_name
new_results_path = os.path.join(
self.validations_path,
*new_results_path_items,
@@ -107,6 +107,7 @@ def _upload_results_file_to_synapse(self, results_path: str) -> None:
name=f"Great Expectations {self.expectation_suite_name} results",
executed="https://github.com/Sage-Bionetworks/agora-data-tools",
),
forceVersion=True,
)

@staticmethod
7 changes: 5 additions & 2 deletions src/agoradatatools/process.py
@@ -60,13 +60,15 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj:
def process_dataset(
dataset_obj: dict,
staging_path: str,
gx_folder: str,
syn: synapseclient.Synapse,
) -> tuple:
"""Takes in a dataset from the configuration file and passes it through the ETL process

Args:
dataset_obj (dict): A dataset defined in the configuration file
staging_path (str): Staging path
gx_folder (str): Synapse ID of the folder where Great Expectations reports should be uploaded
syn (synapseclient.Synapse): synapseclient.Synapse session.

Returns:
@@ -121,12 +123,12 @@ def process_dataset(
)

# run great expectations on dataset if expectation suite exists
if "gx_folder" in dataset_obj[dataset_name].keys():
if "gx_enabled" in dataset_obj[dataset_name].keys():
gx_runner = GreatExpectationsRunner(
syn=syn,
dataset_path=json_path,
dataset_name=dataset_name,
upload_folder=dataset_obj[dataset_name]["gx_folder"],
upload_folder=gx_folder,
nested_columns=(
dataset_obj[dataset_name]["gx_nested_columns"]
if "gx_nested_columns" in dataset_obj[dataset_name].keys()
@@ -200,6 +202,7 @@ def process_all_files(
process_dataset(
dataset_obj=dataset,
staging_path=staging_path,
gx_folder=config["gx_folder"],
syn=syn,
)
except Exception as e:
17 changes: 9 additions & 8 deletions test_config.yaml
@@ -1,5 +1,6 @@
destination: &dest syn17015333
staging_path: ./staging
gx_folder: syn52948670
sources:
- genes_biodomains:
genes_biodomains_files: &genes_biodomains_files
@@ -62,7 +63,7 @@ datasets:
ensembl_id: ensembl_gene_id
goterm_name: go_terms
destination: *dest
gx_folder: syn53127956
gx_enabled: true
gx_nested_columns:
- gene_biodomains

@@ -81,7 +82,7 @@ datasets:
ensembl_gene_id: ensg
hgnc_gene_id: gname
destination: *dest
gx_folder: syn53461511
gx_enabled: true

- proteomics:
files: *agora_proteomics_files
@@ -100,7 +101,7 @@ datasets:
genename: hgnc_symbol
ensg: ensembl_gene_id
destination: *dest
gx_folder: syn53469659
gx_enabled: true

- proteomics_srm:
files: *agora_proteomics_srm_files
@@ -120,7 +121,7 @@ datasets:
provenance:
- syn24184512.9
destination: *dest
gx_folder: syn53710838
gx_enabled: true

- metabolomics:
files:
@@ -131,7 +132,7 @@ datasets:
provenance:
- syn26064497.1
destination: *dest
gx_folder: syn52948671
gx_enabled: true

- gene_info:
files:
@@ -209,7 +210,7 @@ datasets:
- syn12615624.18
- syn12615633.18
destination: *dest
gx_folder: syn53616774
gx_enabled: true
gx_nested_columns:
- members

@@ -227,7 +228,7 @@ datasets:
overall: target_risk_score
omicsscore: multi_omics_score
destination: *dest
gx_folder: syn53453225
gx_enabled: true

- network:
files:
@@ -286,4 +287,4 @@ datasets:
- *agora_proteomics_tmt_provenance
- *agora_proteomics_srm_provenance
destination: *dest
gx_folder: syn53463344
gx_enabled: true
9 changes: 7 additions & 2 deletions tests/test_gx.py
@@ -80,7 +80,10 @@ def test_check_if_expectation_suite_exists_returns_true_when_the_expectation_sui
assert self.good_runner._check_if_expectation_suite_exists() is True

def test_get_results_path(self):
expected = self.good_runner.validations_path + "/test/path/to/to.html"
expected = (
self.good_runner.validations_path
+ f"/test/path/to/{self.good_runner.expectation_suite_name}.html"
)
mocked_checkpoint_result = mock.create_autospec(CheckpointResult)
mocked_validation_result_identifier = mock.create_autospec(
ValidationResultIdentifier(
@@ -103,7 +106,8 @@ def test_get_results_path(self):
patch_list_validation_result_identifiers.assert_called_once()
patch_copy.assert_called_once_with(
self.good_runner.validations_path + "/test/path/to/file.html",
self.good_runner.validations_path + "/test/path/to/to.html",
self.good_runner.validations_path
+ f"/test/path/to/{self.good_runner.expectation_suite_name}.html",
)
assert result == expected

@@ -116,6 +120,7 @@ def test_upload_results_file_to_synapse(self):
name=f"Great Expectations {self.good_runner.expectation_suite_name} results",
executed="https://github.com/Sage-Bionetworks/agora-data-tools",
),
forceVersion=True,
)

def test_that_convert_nested_columns_to_json_converts_nested_columns_to_json(self):
45 changes: 33 additions & 12 deletions tests/test_process.py
@@ -9,6 +9,9 @@
from agoradatatools.errors import ADTDataProcessingError
from agoradatatools.etl import extract, load, utils

STAGING_PATH = "./staging"
GX_FOLDER = "test_folder"


class TestProcessDataset:
dataset_object = {
@@ -87,7 +90,8 @@ def teardown_method(self):
def test_process_dataset_with_column_rename(self, syn: Any):
process.process_dataset(
dataset_obj=self.dataset_object_col_rename,
staging_path="./staging",
staging_path=STAGING_PATH,
gx_folder=GX_FOLDER,
syn=syn,
)
self.patch_rename_columns.assert_called_once_with(
@@ -99,7 +103,8 @@ def test_process_dataset_custom_transformations(self, syn: Any):
def test_process_dataset_custom_transformations(self, syn: Any):
process.process_dataset(
dataset_obj=self.dataset_object_custom_transform,
staging_path="./staging",
staging_path=STAGING_PATH,
gx_folder=GX_FOLDER,
syn=syn,
)
self.patch_custom_transform.assert_called_once_with(
@@ -119,7 +124,8 @@ def test_process_dataset_with_agora_rename(self, syn: Any):
def test_process_dataset_with_agora_rename(self, syn: Any):
process.process_dataset(
dataset_obj=self.dataset_object_col_rename,
staging_path="./staging",
staging_path=STAGING_PATH,
gx_folder=GX_FOLDER,
syn=syn,
)
self.patch_rename_columns.assert_called_once_with(
@@ -133,10 +139,13 @@ def test_process_dataset_type_dict(self, syn: Any):
dict()
) # test if it is a dictionary later
process.process_dataset(
dataset_obj=self.dataset_object, staging_path="./staging", syn=syn
dataset_obj=self.dataset_object,
staging_path=STAGING_PATH,
gx_folder=GX_FOLDER,
syn=syn,
)
self.patch_dict_to_json.assert_called_once_with(
df={}, staging_path="./staging", filename="neuropath_corr.json"
df={}, staging_path=STAGING_PATH, filename="neuropath_corr.json"
)
self.patch_rename_columns.assert_not_called()
self.patch_custom_transform.assert_not_called()
@@ -168,13 +177,16 @@ def test_create_data_manifest_no_none(self, syn: Any):


class TestProcessAllFiles:
CONFIG_PATH = "./path/to/config"

@pytest.fixture(scope="function", autouse=True)
def setup_method(self):
self.patch_get_config = patch.object(
utils,
"_get_config",
return_value={
"destination": "destination",
"gx_folder": GX_FOLDER,
"datasets": [{"a": {"b": "c"}}, {"d": {"e": "f"}}, {"g": {"h": "i"}}],
},
).start()
@@ -198,8 +210,8 @@ def teardown_method(self):
mock.patch.stopall()

def test_process_all_files_config_path(self, syn: Any):
process.process_all_files(syn=syn, config_path="path/to/config")
self.patch_get_config.assert_called_once_with(config_path="path/to/config")
process.process_all_files(syn=syn, config_path=self.CONFIG_PATH)
self.patch_get_config.assert_called_once_with(config_path=self.CONFIG_PATH)

def test_process_all_files_no_config_path(self, syn: Any):
process.process_all_files(syn=syn, config_path=None)
@@ -208,25 +220,34 @@ def test_process_all_files_no_config_path(self, syn: Any):
def test_process_all_files_process_dataset_fails(self, syn: Any):
with pytest.raises(ADTDataProcessingError):
self.patch_process_dataset.side_effect = Exception
process.process_all_files(syn=syn, config_path="path/to/config")
process.process_all_files(syn=syn, config_path=self.CONFIG_PATH)
self.patch_create_data_manifest.assert_not_called()

def test_process_all_files_full(self, syn: Any):
process.process_all_files(syn=syn, config_path=None)
self.patch_process_dataset.assert_any_call(
dataset_obj={"a": {"b": "c"}}, staging_path="./staging", syn=syn
dataset_obj={"a": {"b": "c"}},
staging_path=STAGING_PATH,
gx_folder=GX_FOLDER,
syn=syn,
)
self.patch_process_dataset.assert_any_call(
dataset_obj={"d": {"e": "f"}}, staging_path="./staging", syn=syn
dataset_obj={"d": {"e": "f"}},
staging_path=STAGING_PATH,
gx_folder=GX_FOLDER,
syn=syn,
)
self.patch_process_dataset.assert_any_call(
dataset_obj={"g": {"h": "i"}}, staging_path="./staging", syn=syn
dataset_obj={"g": {"h": "i"}},
staging_path=STAGING_PATH,
gx_folder=GX_FOLDER,
syn=syn,
)
self.patch_create_data_manifest.assert_called_once_with(
parent="destination", syn=syn
)
self.patch_df_to_csv.assert_called_once_with(
df=self.patch_create_data_manifest.return_value,
staging_path="./staging",
staging_path=STAGING_PATH,
filename="data_manifest.csv",
)