Skip to content

Commit

Permalink
Simplifying transform_biomarkers() to make it more readable and maintainable
Browse files Browse the repository at this point in the history
  • Loading branch information
Beatriz Saldana committed Sep 30, 2024
1 parent e73210a commit 31c5806
Showing 1 changed file with 14 additions and 39 deletions.
53 changes: 14 additions & 39 deletions src/agoradatatools/etl/transform/biomarkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,53 +19,28 @@ def transform_biomarkers(datasets: Dict[str, pd.DataFrame]) -> List[Dict[str, Any]]:
Returns:
List[Dict[str, Any]]: a list of dictionaries containing biomarker data modeled after intended final JSON structure
"""
if "biomarkers" not in datasets:
raise ValueError("Biomarkers dataset not found in datasets dictionary")
biomarkers_dataset = datasets["biomarkers"]
expected_columns = [
"model",
"type",
"ageDeath",
"tissue",
"units",
"genotype",
"measurement",
"sex",
group_columns = ["model", "type", "ageDeath", "tissue", "units"]
point_columns = ["genotype", "measurement", "sex"]

missing_columns = [
col
for col in group_columns + point_columns
if col not in biomarkers_dataset.columns
]
if not set(expected_columns).issubset(set(biomarkers_dataset.columns)):
missing_columns = [
s for s in set(expected_columns) if s not in biomarkers_dataset.columns
]
if missing_columns:
raise ValueError(
f"Biomarker dataset does not contain expected columns. Missing column(s): {missing_columns}"
f"Biomarker dataset missing columns: {', '.join(missing_columns)}"
)

biomarkers_dataset = biomarkers_dataset.fillna("none")
data_as_list = []
grouped = biomarkers_dataset.groupby(
["model", "type", "ageDeath", "tissue", "units"]
)

for (model, type_, ageDeath, tissue, units), group in grouped:
# Create the base structure for each group
entry = {
"model": model,
"type": type_,
"ageDeath": ageDeath,
"tissue": tissue,
"units": units,
"points": [],
}

# Append the measurement, genotype, and sex for each row
for _, row in group.iterrows():
point = {
"genotype": row["genotype"],
"measurement": row["measurement"],
"sex": row["sex"],
}
entry["points"].append(point)
grouped = biomarkers_dataset.groupby(group_columns)

# Add the entry to the list
for group_key, group in grouped:
entry = dict(zip(group_columns, group_key))
entry["points"] = group[point_columns].to_dict("records")
data_as_list.append(entry)

return data_as_list

0 comments on commit 31c5806

Please sign in to comment.