Skip to content

Commit

Permalink
Merge pull request #82 from Sage-Bionetworks/jbeck/AG-1143/transform_distribution_data_testing
Browse files Browse the repository at this point in the history

AG-1143/transform distribution data testing
  • Loading branch information
jaclynbeck-sage authored Jul 13, 2023
2 parents 9868ff2 + 7ab0fc0 commit 8fccf16
Show file tree
Hide file tree
Showing 13 changed files with 804 additions and 29 deletions.
5 changes: 4 additions & 1 deletion config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -223,9 +223,12 @@
overall_max_score: 5
genetics_max_score: 3
omics_max_score: 2
lit_max_score: 2
provenance:
- syn25575156.13
column_rename:
overall: target_risk_score
geneticsscore: genetics_score
omicsscore: multi_omics_score
destination: *dest

- rna_distribution_data:
Expand Down
45 changes: 20 additions & 25 deletions src/agoradatatools/etl/transform/distribution_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@

def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -> dict:
if is_scored:
df = df[df[is_scored] == "Y"] # df does not have the isscored
df = df[df[is_scored] == "Y"]
# If isscored is blank/NaN, take all rows with at least one "Y" in any isscored column
else:
df = df[df.isin(["Y"]).any(axis=1)]

Expand All @@ -26,15 +27,17 @@ def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -
distribution, bins=10, precision=3, include_lowest=True, right=True
).value_counts(sort=False)
)
obj["distribution"][
0
] -= 1 # since this was calculated with the artificial 0 value, we subtract it
obj["distribution"][
-1
] -= 1 # since this was calculated with the artificial upper_bound, we subtract it

# obj["distribution"][0] is for the lowest bin, which includes values of 0. Since this was
# calculated with an extra artificial 0 value, we subtract 1 to get the real count.
obj["distribution"][0] -= 1

# obj["distribution"][-1] (end of the list) is for the highest bin, which includes the upper
# bound. Since this was calculated with an extra artificial upper_bound, we subtract 1 as above.
obj["distribution"][-1] -= 1

discard, obj["bins"] = list(
pd.cut(distribution, bins=10, precision=3, retbins=True)
pd.cut(distribution, bins=10, precision=3, include_lowest=True, right=True, retbins=True)
)
obj["bins"] = np.around(obj["bins"].tolist()[1:], 2)
base = [0, *obj["bins"][:-1]]
Expand All @@ -45,10 +48,10 @@ def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -
obj["max"] = np.around(df[col].max(), 4)
obj["mean"] = np.around(df[col].mean(), 4)
obj["first_quartile"] = np.around(
df[col].quantile(q=0.25, interpolation="midpoint")
df[col].quantile(q=0.25, interpolation="midpoint"), 4
)
obj["third_quartile"] = np.around(
df[col].quantile(q=0.75, interpolation="midpoint")
df[col].quantile(q=0.75, interpolation="midpoint"), 4
)

return obj
Expand All @@ -59,48 +62,40 @@ def transform_distribution_data(
overall_max_score,
genetics_max_score,
omics_max_score,
lit_max_score,
):
overall_scores = datasets["overall_scores"]
interesting_columns = [
"ensg",
"overall",
"geneticsscore",
"omicsscore",
"literaturescore",
"target_risk_score",
"genetics_score",
"multi_omics_score",
]

# create mapping to deal with missing values as they take different shape across the fields
scored = ["isscored_genetics", "isscored_omics", "isscored_lit"]
scored = ["isscored_genetics", "isscored_omics"]
mapping = dict(zip(interesting_columns[2:], scored))
mapping["overall"] = None
mapping["target_risk_score"] = None

# create mapping for max score values from config
max_score = dict(
zip(
interesting_columns[1:],
[overall_max_score, genetics_max_score, omics_max_score, lit_max_score],
[overall_max_score, genetics_max_score, omics_max_score],
)
)

overall_scores = overall_scores[interesting_columns + scored]
overall_scores = overall_scores[interesting_columns + scored].drop_duplicates()

neo_matrix = {}
for col in interesting_columns[1:]: # excludes the ENSG
neo_matrix[col] = calculate_distribution(
overall_scores, col, mapping[col], max_score[col]
)

neo_matrix["target_risk_score"] = neo_matrix.pop("overall")
neo_matrix["genetics_score"] = neo_matrix.pop("geneticsscore")
neo_matrix["multi_omics_score"] = neo_matrix.pop("omicsscore")
neo_matrix["literature_score"] = neo_matrix.pop("literaturescore")

additional_data = [
{"name": "Target Risk Score", "syn_id": "syn25913473", "wiki_id": "621071"},
{"name": "Genetic Risk Score", "syn_id": "syn25913473", "wiki_id": "621069"},
{"name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070"},
{"name": "Literature Score", "syn_id": "syn25913473", "wiki_id": "613105"},
]
for col, additional in zip(neo_matrix.keys(), additional_data):
neo_matrix[col]["name"] = additional["name"]
Expand Down
3 changes: 1 addition & 2 deletions src/agoradatatools/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ def apply_custom_transformations(datasets: dict, dataset_name: str, dataset_obj:
genetics_max_score=dataset_obj["custom_transformations"][
"genetics_max_score"
],
omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"],
lit_max_score=dataset_obj["custom_transformations"]["lit_max_score"],
omics_max_score=dataset_obj["custom_transformations"]["omics_max_score"]
)
if dataset_name == "team_info":
return transform.transform_team_info(datasets=datasets)
Expand Down
5 changes: 4 additions & 1 deletion test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -223,9 +223,12 @@
overall_max_score: 5
genetics_max_score: 3
omics_max_score: 2
lit_max_score: 2
provenance:
- syn25575156.13
column_rename:
overall: target_risk_score
geneticsscore: genetics_score
omicsscore: multi_omics_score
destination: *dest

- rna_distribution_data:
Expand Down
Loading

0 comments on commit 8fccf16

Please sign in to comment.