diff --git a/src/agoradatatools/etl/transform/biomarkers.py b/src/agoradatatools/etl/transform/biomarkers.py index 2fac786..6c1bab1 100644 --- a/src/agoradatatools/etl/transform/biomarkers.py +++ b/src/agoradatatools/etl/transform/biomarkers.py @@ -19,53 +19,28 @@ def transform_biomarkers(datasets: Dict[str, pd.DataFrame]) -> List[Dict[str, An Returns: List[Dict[str, Any]]: a list of dictionaries containing biomarker data modeled after intended final JSON structure """ - if "biomarkers" not in datasets: - raise ValueError("Biomarkers dataset not found in datasets dictionary") biomarkers_dataset = datasets["biomarkers"] - expected_columns = [ - "model", - "type", - "ageDeath", - "tissue", - "units", - "genotype", - "measurement", - "sex", + group_columns = ["model", "type", "ageDeath", "tissue", "units"] + point_columns = ["genotype", "measurement", "sex"] + + missing_columns = [ + col + for col in group_columns + point_columns + if col not in biomarkers_dataset.columns ] - if not set(expected_columns).issubset(set(biomarkers_dataset.columns)): - missing_columns = [ - s for s in set(expected_columns) if s not in biomarkers_dataset.columns - ] + if missing_columns: raise ValueError( - f"Biomarker dataset does not contain expected columns. Missing column(s): {missing_columns}" + f"Biomarker dataset missing columns: {', '.join(missing_columns)}" ) + biomarkers_dataset = biomarkers_dataset.fillna("none") data_as_list = [] - grouped = biomarkers_dataset.groupby( - ["model", "type", "ageDeath", "tissue", "units"] - ) - - for (model, type_, ageDeath, tissue, units), group in grouped: - # Create the base structure for each group - entry = { - "model": model, - "type": type_, - "ageDeath": ageDeath, - "tissue": tissue, - "units": units, - "points": [], - } - # Append the measurement, genotype, and sex for each row - for _, row in group.iterrows(): - point = { - "genotype": row["genotype"], - "measurement": row["measurement"], - "sex": row["sex"], - } - entry["points"].append(point) + grouped = biomarkers_dataset.groupby(group_columns) - # Add the entry to the list + for group_key, group in grouped: + entry = dict(zip(group_columns, group_key)) + entry["points"] = group[point_columns].to_dict("records") data_as_list.append(entry) return data_as_list