Skip to content

Commit

Permalink
MG45 - ADT: Pathology data ETL (#149)
Browse files Browse the repository at this point in the history
* Generalized the biomarkers transform so it can work for the pathology dataset as well

* Updated test file names and functions to reflect biomarkers to modelAD_general_transform change

* Added pathology dataset to the modelad_test_config.yaml

* Updating function names and descriptions to generalized transform name

* Addressing PR comments

---------

Co-authored-by: Beatriz Saldana <bsaldana@w262.lan>
  • Loading branch information
beatrizsaldana and Beatriz Saldana authored Oct 9, 2024
1 parent e02c29b commit 105ddc9
Show file tree
Hide file tree
Showing 18 changed files with 158 additions and 123 deletions.
13 changes: 13 additions & 0 deletions modelad_test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,16 @@ datasets:
custom_transformations: 1
column_rename:
agedeath: age_death

- pathology:
files:
- name: pathology
id: syn61357279
format: csv
final_format: json
provenance:
- syn61357279
destination: *dest
custom_transformations: 1
column_rename:
agedeath: age_death
6 changes: 4 additions & 2 deletions src/agoradatatools/etl/transform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
)
from agoradatatools.etl.transform.team_info import transform_team_info
from agoradatatools.etl.transform.proteomics import transform_proteomics
from agoradatatools.etl.transform.biomarkers import transform_biomarkers
from agoradatatools.etl.transform.immunohisto_transform import (
immunohisto_transform,
)

__all__ = [
"transform_distribution_data",
Expand All @@ -29,5 +31,5 @@
"transform_rnaseq_differential_expression",
"transform_team_info",
"transform_proteomics",
"transform_biomarkers",
"immunohisto_transform",
]
46 changes: 0 additions & 46 deletions src/agoradatatools/etl/transform/biomarkers.py

This file was deleted.

52 changes: 52 additions & 0 deletions src/agoradatatools/etl/transform/immunohisto_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
This module contains the transformation logic for the biomarkers and pathology datasets.
This is for the Model AD project.
"""

from typing import Dict, List, Optional

import pandas as pd


def immunohisto_transform(
    datasets: Dict[str, pd.DataFrame],
    dataset_name: str,
    group_columns: Optional[List[str]] = None,
    extra_columns: Optional[List[str]] = None,
    extra_column_name: str = "points",
) -> pd.DataFrame:
    """
    Collapse one dataset into one row per unique combination of group_columns.

    The 'dataset_name' DataFrame is pulled out of 'datasets', missing values
    are replaced with the string "none", and the rows are grouped by
    group_columns. Each output row holds the group's key values plus one
    extra column (extra_column_name) containing a list of records, one dict
    per source row, restricted to extra_columns.

    Args:
        datasets (Dict[str, pd.DataFrame]): Dictionary of dataset names mapped to their DataFrame.
        dataset_name (str): The name of the dataset to transform.
        group_columns (List[str], optional): List of columns to group by.
            Defaults to ['model', 'type', 'age_death', 'tissue', 'units'].
        extra_columns (List[str], optional): List of columns to include in
            each group's records. Defaults to ['genotype', 'measurement', 'sex'].
        extra_column_name (str, optional): Name of the output column that
            holds the list of records. Defaults to 'points'.

    Returns:
        pd.DataFrame: One row per group, with the group_columns followed by
        extra_column_name. When the dataset has no rows, an empty DataFrame
        with those same columns is returned.

    Raises:
        KeyError: If dataset_name is not a key of datasets.
        ValueError: If any of group_columns or extra_columns is absent from
            the dataset.
    """
    # None sentinels instead of list literals in the signature: default lists
    # are created once and shared between calls. The effective defaults are
    # unchanged.
    if group_columns is None:
        group_columns = ["model", "type", "age_death", "tissue", "units"]
    else:
        group_columns = list(group_columns)
    if extra_columns is None:
        extra_columns = ["genotype", "measurement", "sex"]
    else:
        extra_columns = list(extra_columns)

    dataset = datasets[dataset_name]

    missing_columns = [
        col for col in group_columns + extra_columns if col not in dataset.columns
    ]
    if missing_columns:
        raise ValueError(
            f"{dataset_name} dataset missing columns: {', '.join(missing_columns)}"
        )

    # groupby drops rows whose key is NaN by default; filling first keeps them.
    dataset = dataset.fillna("none")
    data_rows = []

    for group_key, group in dataset.groupby(group_columns):
        # Depending on the pandas version, grouping by a one-element list can
        # yield a scalar key rather than a 1-tuple; zipping a scalar would
        # iterate its characters or fail outright.
        if not isinstance(group_key, tuple):
            group_key = (group_key,)
        entry = dict(zip(group_columns, group_key))
        entry[extra_column_name] = group[extra_columns].to_dict("records")
        data_rows.append(entry)

    if not data_rows:
        # Keep the output schema stable even when there is nothing to group.
        return pd.DataFrame(columns=group_columns + [extra_column_name])

    return pd.DataFrame(data_rows)
6 changes: 4 additions & 2 deletions src/agoradatatools/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,10 @@ def apply_custom_transformations(
if dataset_name in ["proteomics", "proteomics_tmt", "proteomics_srm"]:
df = datasets[dataset_name]
return transform.transform_proteomics(df=df)
if dataset_name == "biomarkers":
return transform.transform_biomarkers(datasets=datasets)
if dataset_name in ["biomarkers", "pathology"]:
return transform.immunohisto_transform(
datasets=datasets, dataset_name=dataset_name
)
else:
return None

Expand Down
73 changes: 0 additions & 73 deletions tests/transform/test_biomarkers.py

This file was deleted.

85 changes: 85 additions & 0 deletions tests/transform/test_immunohisto_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os

import pandas as pd
import pytest

from agoradatatools.etl.transform.immunohisto_transform import (
immunohisto_transform,
)


class TestTransformGeneralModelAD:
    """Fixture-driven tests for immunohisto_transform.

    Input CSVs are read from ``<data_files_path>/input`` and expected JSON
    results from ``<data_files_path>/output``.
    """

    data_files_path = "tests/test_assets/immunohisto_transform"

    # (input CSV, expected output JSON) pairs for the passing cases; the
    # entries line up one-to-one with pass_test_ids below.
    pass_test_data = [
        (
            "immunohisto_transform_good_test_input.csv",
            "immunohisto_transform_good_test_output.json",
        ),
        (
            "immunohisto_transform_duplicated_input.csv",
            "immunohisto_transform_duplicated_output.json",
        ),
        (
            "immunohisto_transform_none_input.csv",
            "immunohisto_transform_none_output.json",
        ),
        (
            "immunohisto_transform_missing_input.csv",
            "immunohisto_transform_missing_output.json",
        ),
        (
            "immunohisto_transform_extra_column.csv",
            "immunohisto_transform_extra_column_output.json",
        ),
    ]
    pass_test_ids = [
        "Pass with good fake data",
        "Pass with duplicated data",
        "Pass with none data",
        "Pass with missing data",
        "Pass with extra column",
    ]

    # Input CSVs for which the transform is expected to raise.
    fail_test_data = ["immunohisto_transform_missing_column.csv"]
    fail_test_ids = ["Fail with missing column"]

    @pytest.mark.parametrize(
        "immunohisto_transform_file, expected_output_file",
        pass_test_data,
        ids=pass_test_ids,
    )
    def test_immunohisto_transform_should_pass(
        self, immunohisto_transform_file, expected_output_file
    ):
        """The transform output must equal the stored expected JSON frame."""
        input_path = os.path.join(
            self.data_files_path, "input", immunohisto_transform_file
        )
        expected_path = os.path.join(
            self.data_files_path, "output", expected_output_file
        )

        source_df = pd.read_csv(input_path)
        transformed = immunohisto_transform(
            datasets={"immunohisto_transform": source_df},
            dataset_name="immunohisto_transform",
        )
        actual_df = pd.DataFrame(transformed)
        expected_df = pd.read_json(expected_path)

        pd.testing.assert_frame_equal(actual_df, expected_df)

    @pytest.mark.parametrize(
        "immunohisto_transform_file", fail_test_data, ids=fail_test_ids
    )
    def test_immunohisto_transform_should_fail(
        self, immunohisto_transform_file, error_type: BaseException = ValueError
    ):
        """The transform must raise error_type for the failing input files."""
        input_path = os.path.join(
            self.data_files_path, "input", immunohisto_transform_file
        )
        source_df = pd.read_csv(input_path)

        with pytest.raises(error_type):
            immunohisto_transform(
                datasets={"immunohisto_transform": source_df},
                dataset_name="immunohisto_transform",
            )

0 comments on commit 105ddc9

Please sign in to comment.