diff --git a/modelad_test_config.yaml b/modelad_test_config.yaml index 28a54be..d09652a 100644 --- a/modelad_test_config.yaml +++ b/modelad_test_config.yaml @@ -15,3 +15,16 @@ datasets: custom_transformations: 1 column_rename: agedeath: age_death + + - pathology: + files: + - name: pathology + id: syn61357279 + format: csv + final_format: json + provenance: + - syn61357279 + destination: *dest + custom_transformations: 1 + column_rename: + agedeath: age_death diff --git a/src/agoradatatools/etl/transform/__init__.py b/src/agoradatatools/etl/transform/__init__.py index 2bbd6b9..3a840c6 100644 --- a/src/agoradatatools/etl/transform/__init__.py +++ b/src/agoradatatools/etl/transform/__init__.py @@ -16,7 +16,9 @@ ) from agoradatatools.etl.transform.team_info import transform_team_info from agoradatatools.etl.transform.proteomics import transform_proteomics -from agoradatatools.etl.transform.biomarkers import transform_biomarkers +from agoradatatools.etl.transform.immunohisto_transform import ( + immunohisto_transform, +) __all__ = [ "transform_distribution_data", @@ -29,5 +31,5 @@ "transform_rnaseq_differential_expression", "transform_team_info", "transform_proteomics", - "transform_biomarkers", + "immunohisto_transform", ] diff --git a/src/agoradatatools/etl/transform/biomarkers.py b/src/agoradatatools/etl/transform/biomarkers.py deleted file mode 100644 index d820e0b..0000000 --- a/src/agoradatatools/etl/transform/biomarkers.py +++ /dev/null @@ -1,46 +0,0 @@ -""" -This module contains the transformation logic for the biomarkers dataset. -This is for the Model AD project. -""" - -import pandas as pd -from typing import Dict - - -def transform_biomarkers(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame: - """ - Takes a dictionary of dataset DataFrames, extracts the biomarkers - DataFrame, and transforms it into a DataFrame grouped by - 'model', 'type', 'age_death', 'tissue', and 'units'. 
-
-    Args:
-        datasets (Dict[str, pd.DataFrame]): Dictionary of dataset names mapped to their DataFrame.
-
-    Returns:
-        pd.DataFrame: A DataFrame containing biomarker data modeled after intended final structure.
-    """
-    biomarkers_dataset = datasets["biomarkers"]
-    group_columns = ["model", "type", "age_death", "tissue", "units"]
-    point_columns = ["genotype", "measurement", "sex"]
-
-    missing_columns = [
-        col
-        for col in group_columns + point_columns
-        if col not in biomarkers_dataset.columns
-    ]
-    if missing_columns:
-        raise ValueError(
-            f"Biomarker dataset missing columns: {', '.join(missing_columns)}"
-        )
-
-    biomarkers_dataset = biomarkers_dataset.fillna("none")
-    data_rows = []
-
-    grouped = biomarkers_dataset.groupby(group_columns)
-
-    for group_key, group in grouped:
-        entry = dict(zip(group_columns, group_key))
-        entry["points"] = group[point_columns].to_dict("records")
-        data_rows.append(entry)
-
-    return pd.DataFrame(data_rows)
diff --git a/src/agoradatatools/etl/transform/immunohisto_transform.py b/src/agoradatatools/etl/transform/immunohisto_transform.py
new file mode 100644
index 0000000..5f296c4
--- /dev/null
+++ b/src/agoradatatools/etl/transform/immunohisto_transform.py
@@ -0,0 +1,67 @@
+"""
+This module contains the transformation logic for the biomarkers and pathology datasets.
+This is for the Model AD project.
+"""
+
+import pandas as pd
+from typing import Dict, List, Optional
+
+
+def immunohisto_transform(
+    datasets: Dict[str, pd.DataFrame],
+    dataset_name: str,
+    group_columns: Optional[List[str]] = None,
+    extra_columns: Optional[List[str]] = None,
+    extra_column_name: str = "points",
+) -> pd.DataFrame:
+    """
+    Takes a dictionary of dataset DataFrames, extracts the 'dataset_name'
+    DataFrame, and transforms it into a DataFrame grouped by group_columns.
+    The extra_columns of each group are collected into a list of records
+    stored in the extra_column_name column.
+
+    Args:
+        datasets (Dict[str, pd.DataFrame]): Dictionary of dataset names mapped to their DataFrame.
+        dataset_name (str): The name of the dataset to transform.
+        group_columns (Optional[List[str]], optional): List of columns to group by. Defaults to ['model', 'type', 'age_death', 'tissue', 'units'].
+        extra_columns (Optional[List[str]], optional): List of columns to include in the group. Defaults to ['genotype', 'measurement', 'sex'].
+        extra_column_name (str, optional): Name of the column containing the extra columns. Defaults to 'points'.
+
+    Returns:
+        pd.DataFrame: A DataFrame with one row per group of group_columns.
+
+    Raises:
+        ValueError: If any of the group_columns or extra_columns is missing from the dataset.
+    """
+    # Resolve defaults here instead of in the signature: list defaults are
+    # created once and would be shared (and mutable) across every call.
+    if group_columns is None:
+        group_columns = ["model", "type", "age_death", "tissue", "units"]
+    if extra_columns is None:
+        extra_columns = ["genotype", "measurement", "sex"]
+
+    dataset = datasets[dataset_name]
+
+    missing_columns = [
+        col for col in group_columns + extra_columns if col not in dataset.columns
+    ]
+    if missing_columns:
+        raise ValueError(
+            f"{dataset_name} dataset missing columns: {', '.join(missing_columns)}"
+        )
+
+    # Missing values are replaced with the string "none" before grouping
+    dataset = dataset.fillna("none")
+    data_rows = []
+
+    grouped = dataset.groupby(group_columns)
+
+    for group_key, group in grouped:
+        # Grouping by a single column can yield a scalar key instead of a tuple
+        if not isinstance(group_key, tuple):
+            group_key = (group_key,)
+        entry = dict(zip(group_columns, group_key))
+        entry[extra_column_name] = group[extra_columns].to_dict("records")
+        data_rows.append(entry)
+
+    return pd.DataFrame(data_rows)
diff --git a/src/agoradatatools/process.py b/src/agoradatatools/process.py
index 4623214..0b518e1 100644
--- a/src/agoradatatools/process.py
+++ b/src/agoradatatools/process.py
@@ -61,8 +61,10 @@ def apply_custom_transformations(
     if dataset_name in ["proteomics", "proteomics_tmt", "proteomics_srm"]:
         df = datasets[dataset_name]
         return transform.transform_proteomics(df=df)
-    if dataset_name == "biomarkers":
-        return transform.transform_biomarkers(datasets=datasets)
+    if dataset_name in ["biomarkers", "pathology"]:
+        return transform.immunohisto_transform(
+            datasets=datasets, dataset_name=dataset_name
+        )
     else:
         return None
 
diff --git a/tests/test_assets/biomarkers/input/biomarkers_duplicated_input.csv b/tests/test_assets/immunohisto_transform/input/immunohisto_transform_duplicated_input.csv
similarity index
100% rename from tests/test_assets/biomarkers/input/biomarkers_duplicated_input.csv rename to tests/test_assets/immunohisto_transform/input/immunohisto_transform_duplicated_input.csv diff --git a/tests/test_assets/biomarkers/input/biomarkers_extra_column.csv b/tests/test_assets/immunohisto_transform/input/immunohisto_transform_extra_column.csv similarity index 100% rename from tests/test_assets/biomarkers/input/biomarkers_extra_column.csv rename to tests/test_assets/immunohisto_transform/input/immunohisto_transform_extra_column.csv diff --git a/tests/test_assets/biomarkers/input/biomarkers_good_test_input.csv b/tests/test_assets/immunohisto_transform/input/immunohisto_transform_good_test_input.csv similarity index 100% rename from tests/test_assets/biomarkers/input/biomarkers_good_test_input.csv rename to tests/test_assets/immunohisto_transform/input/immunohisto_transform_good_test_input.csv diff --git a/tests/test_assets/biomarkers/input/biomarkers_missing_column.csv b/tests/test_assets/immunohisto_transform/input/immunohisto_transform_missing_column.csv similarity index 100% rename from tests/test_assets/biomarkers/input/biomarkers_missing_column.csv rename to tests/test_assets/immunohisto_transform/input/immunohisto_transform_missing_column.csv diff --git a/tests/test_assets/biomarkers/input/biomarkers_missing_input.csv b/tests/test_assets/immunohisto_transform/input/immunohisto_transform_missing_input.csv similarity index 100% rename from tests/test_assets/biomarkers/input/biomarkers_missing_input.csv rename to tests/test_assets/immunohisto_transform/input/immunohisto_transform_missing_input.csv diff --git a/tests/test_assets/biomarkers/input/biomarkers_none_input.csv b/tests/test_assets/immunohisto_transform/input/immunohisto_transform_none_input.csv similarity index 100% rename from tests/test_assets/biomarkers/input/biomarkers_none_input.csv rename to tests/test_assets/immunohisto_transform/input/immunohisto_transform_none_input.csv diff --git 
a/tests/test_assets/biomarkers/output/biomarkers_duplicated_output.json b/tests/test_assets/immunohisto_transform/output/immunohisto_transform_duplicated_output.json similarity index 100% rename from tests/test_assets/biomarkers/output/biomarkers_duplicated_output.json rename to tests/test_assets/immunohisto_transform/output/immunohisto_transform_duplicated_output.json diff --git a/tests/test_assets/biomarkers/output/biomarkers_extra_column_output.json b/tests/test_assets/immunohisto_transform/output/immunohisto_transform_extra_column_output.json similarity index 100% rename from tests/test_assets/biomarkers/output/biomarkers_extra_column_output.json rename to tests/test_assets/immunohisto_transform/output/immunohisto_transform_extra_column_output.json diff --git a/tests/test_assets/biomarkers/output/biomarkers_good_test_output.json b/tests/test_assets/immunohisto_transform/output/immunohisto_transform_good_test_output.json similarity index 100% rename from tests/test_assets/biomarkers/output/biomarkers_good_test_output.json rename to tests/test_assets/immunohisto_transform/output/immunohisto_transform_good_test_output.json diff --git a/tests/test_assets/biomarkers/output/biomarkers_missing_output.json b/tests/test_assets/immunohisto_transform/output/immunohisto_transform_missing_output.json similarity index 100% rename from tests/test_assets/biomarkers/output/biomarkers_missing_output.json rename to tests/test_assets/immunohisto_transform/output/immunohisto_transform_missing_output.json diff --git a/tests/test_assets/biomarkers/output/biomarkers_none_output.json b/tests/test_assets/immunohisto_transform/output/immunohisto_transform_none_output.json similarity index 100% rename from tests/test_assets/biomarkers/output/biomarkers_none_output.json rename to tests/test_assets/immunohisto_transform/output/immunohisto_transform_none_output.json diff --git a/tests/transform/test_biomarkers.py b/tests/transform/test_biomarkers.py deleted file mode 100644 index 
280ce78..0000000 --- a/tests/transform/test_biomarkers.py +++ /dev/null @@ -1,73 +0,0 @@ -import os - -import pandas as pd -import pytest - -from agoradatatools.etl.transform import biomarkers - - -class TestTransformBiomarkers: - data_files_path = "tests/test_assets/biomarkers" - pass_test_data = [ - ( - # Pass with good fake data - "biomarkers_good_test_input.csv", - "biomarkers_good_test_output.json", - ), - ( - # Pass with duplicated data - "biomarkers_duplicated_input.csv", - "biomarkers_duplicated_output.json", - ), - ( - # Pass with none data - "biomarkers_none_input.csv", - "biomarkers_none_output.json", - ), - ( - # Pass with missing data - "biomarkers_missing_input.csv", - "biomarkers_missing_output.json", - ), - ( - # Pass with extra column - "biomarkers_extra_column.csv", - "biomarkers_extra_column_output.json", - ), - ] - pass_test_ids = [ - "Pass with good fake data", - "Pass with duplicated data", - "Pass with none data", - "Pass with missing data", - "Pass with extra column", - ] - fail_test_data = [("biomarkers_missing_column.csv")] - fail_test_ids = [("Fail with missing column")] - - @pytest.mark.parametrize( - "biomarkers_file, expected_output_file", pass_test_data, ids=pass_test_ids - ) - def test_transform_biomarkers_should_pass( - self, biomarkers_file, expected_output_file - ): - biomarkers_df = pd.read_csv( - os.path.join(self.data_files_path, "input", biomarkers_file) - ) - output_df = pd.DataFrame( - biomarkers.transform_biomarkers(datasets={"biomarkers": biomarkers_df}) - ) - expected_df = pd.read_json( - os.path.join(self.data_files_path, "output", expected_output_file), - ) - pd.testing.assert_frame_equal(output_df, expected_df) - - @pytest.mark.parametrize("biomarkers_file", fail_test_data, ids=fail_test_ids) - def test_transform_biomarkers_should_fail( - self, biomarkers_file, error_type: BaseException = ValueError - ): - biomarkers_df = pd.read_csv( - os.path.join(self.data_files_path, "input", biomarkers_file) - ) - with 
pytest.raises(error_type):
-            biomarkers.transform_biomarkers(datasets={"biomarkers": biomarkers_df})
diff --git a/tests/transform/test_immunohisto_transform.py b/tests/transform/test_immunohisto_transform.py
new file mode 100644
index 0000000..f1ad2d5
--- /dev/null
+++ b/tests/transform/test_immunohisto_transform.py
@@ -0,0 +1,117 @@
+import os
+from typing import Type
+
+import pandas as pd
+import pytest
+
+from agoradatatools.etl.transform.immunohisto_transform import (
+    immunohisto_transform,
+)
+
+
+class TestTransformGeneralModelAD:
+    """Tests for immunohisto_transform using CSV inputs and JSON expected outputs."""
+
+    data_files_path = "tests/test_assets/immunohisto_transform"
+    pass_test_data = [
+        (
+            # Pass with good fake data
+            "immunohisto_transform_good_test_input.csv",
+            "immunohisto_transform_good_test_output.json",
+        ),
+        (
+            # Pass with duplicated data
+            "immunohisto_transform_duplicated_input.csv",
+            "immunohisto_transform_duplicated_output.json",
+        ),
+        (
+            # Pass with none data
+            "immunohisto_transform_none_input.csv",
+            "immunohisto_transform_none_output.json",
+        ),
+        (
+            # Pass with missing data
+            "immunohisto_transform_missing_input.csv",
+            "immunohisto_transform_missing_output.json",
+        ),
+        (
+            # Pass with extra column
+            "immunohisto_transform_extra_column.csv",
+            "immunohisto_transform_extra_column_output.json",
+        ),
+    ]
+    pass_test_ids = [
+        "Pass with good fake data",
+        "Pass with duplicated data",
+        "Pass with none data",
+        "Pass with missing data",
+        "Pass with extra column",
+    ]
+    fail_test_data = [("immunohisto_transform_missing_column.csv")]
+    fail_test_ids = [("Fail with missing column")]
+
+    @pytest.mark.parametrize(
+        "immunohisto_transform_file, expected_output_file",
+        pass_test_data,
+        ids=pass_test_ids,
+    )
+    def test_immunohisto_transform_should_pass(
+        self, immunohisto_transform_file, expected_output_file
+    ):
+        """The transformed input CSV must equal the expected JSON output."""
+        immunohisto_transform_df = pd.read_csv(
+            os.path.join(self.data_files_path, "input", immunohisto_transform_file)
+        )
+        # immunohisto_transform already returns a DataFrame; no re-wrapping needed
+        output_df = immunohisto_transform(
+            datasets={"immunohisto_transform": immunohisto_transform_df},
+            dataset_name="immunohisto_transform",
+        )
+        expected_df = pd.read_json(
+            os.path.join(self.data_files_path, "output", expected_output_file),
+        )
+        pd.testing.assert_frame_equal(output_df, expected_df)
+
+    @pytest.mark.parametrize(
+        "immunohisto_transform_file", fail_test_data, ids=fail_test_ids
+    )
+    def test_immunohisto_transform_should_fail(
+        self,
+        immunohisto_transform_file,
+        error_type: Type[BaseException] = ValueError,
+    ):
+        """An input CSV lacking a required column must raise error_type."""
+        immunohisto_transform_df = pd.read_csv(
+            os.path.join(self.data_files_path, "input", immunohisto_transform_file)
+        )
+        # match pins the failure to the missing-column check
+        with pytest.raises(error_type, match="missing columns"):
+            immunohisto_transform(
+                datasets={"immunohisto_transform": immunohisto_transform_df},
+                dataset_name="immunohisto_transform",
+            )
+
+    def test_immunohisto_transform_custom_columns(self):
+        """Custom group, extra and nested column names are honored."""
+        input_df = pd.DataFrame(
+            {
+                "model": ["A", "A", "B"],
+                "tissue": ["cortex", "cortex", "cortex"],
+                "sex": ["F", "M", "F"],
+            }
+        )
+        output_df = immunohisto_transform(
+            datasets={"pathology": input_df},
+            dataset_name="pathology",
+            group_columns=["model", "tissue"],
+            extra_columns=["sex"],
+            extra_column_name="rows",
+        )
+        expected_df = pd.DataFrame(
+            {
+                "model": ["A", "B"],
+                "tissue": ["cortex", "cortex"],
+                "rows": [[{"sex": "F"}, {"sex": "M"}], [{"sex": "F"}]],
+            }
+        )
+        pd.testing.assert_frame_equal(output_df, expected_df)