Simplifying transform_biomarkers() to make it more readable and maintainable
Sage-Bionetworks · Sep 30, 2024 · 31c5806 · 31c5806
1 parent e73210a
commit 31c5806
Showing 1 changed file with 14 additions and 39 deletions.
diff --git a/src/agoradatatools/etl/transform/biomarkers.py b/src/agoradatatools/etl/transform/biomarkers.py
@@ -19,53 +19,28 @@ def transform_biomarkers(datasets: Dict[str, pd.DataFrame]) -> List[Dict[str, An
     Returns:
         List[Dict[str, Any]]: a list of dictionaries containing biomarker data modeled after intended final JSON structure
     """
-    if "biomarkers" not in datasets:
-        raise ValueError("Biomarkers dataset not found in datasets dictionary")
     biomarkers_dataset = datasets["biomarkers"]
-    expected_columns = [
-        "model",
-        "type",
-        "ageDeath",
-        "tissue",
-        "units",
-        "genotype",
-        "measurement",
-        "sex",
+    group_columns = ["model", "type", "ageDeath", "tissue", "units"]
+    point_columns = ["genotype", "measurement", "sex"]
+
+    missing_columns = [
+        col
+        for col in group_columns + point_columns
+        if col not in biomarkers_dataset.columns
     ]
-    if not set(expected_columns).issubset(set(biomarkers_dataset.columns)):
-        missing_columns = [
-            s for s in set(expected_columns) if s not in biomarkers_dataset.columns
-        ]
+    if missing_columns:
         raise ValueError(
-            f"Biomarker dataset does not contain expected columns. Missing column(s): {missing_columns}"
+            f"Biomarker dataset missing columns: {', '.join(missing_columns)}"
         )
+
     biomarkers_dataset = biomarkers_dataset.fillna("none")
     data_as_list = []
-    grouped = biomarkers_dataset.groupby(
-        ["model", "type", "ageDeath", "tissue", "units"]
-    )
-
-    for (model, type_, ageDeath, tissue, units), group in grouped:
-        # Create the base structure for each group
-        entry = {
-            "model": model,
-            "type": type_,
-            "ageDeath": ageDeath,
-            "tissue": tissue,
-            "units": units,
-            "points": [],
-        }
 
-        # Append the measurement, genotype, and sex for each row
-        for _, row in group.iterrows():
-            point = {
-                "genotype": row["genotype"],
-                "measurement": row["measurement"],
-                "sex": row["sex"],
-            }
-            entry["points"].append(point)
+    grouped = biomarkers_dataset.groupby(group_columns)
 
-        # Add the entry to the list
+    for group_key, group in grouped:
+        entry = dict(zip(group_columns, group_key))
+        entry["points"] = group[point_columns].to_dict("records")
         data_as_list.append(entry)
 
     return data_as_list