Skip to content

Commit

Permalink
Merge pull request #70 from Sage-Bionetworks/bwmac/ag-838/independent_transforms
Browse files Browse the repository at this point in the history

[AG-838] Support independent transforms
  • Loading branch information
BWMac authored May 5, 2023
2 parents 170f0d4 + ced0be6 commit 0684a29
Show file tree
Hide file tree
Showing 17 changed files with 952 additions and 924 deletions.
674 changes: 0 additions & 674 deletions src/agoradatatools/etl/transform.py

This file was deleted.

31 changes: 31 additions & 0 deletions src/agoradatatools/etl/transform/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Submodule for Agora Data Tools Transformations"""

from agoradatatools.etl.transform.distribution_data import (
transform_distribution_data,
)
from agoradatatools.etl.transform.gene_info import transform_gene_info
from agoradatatools.etl.transform.genes_biodomains import (
transform_genes_biodomains,
)
from agoradatatools.etl.transform.overall_scores import (
transform_overall_scores,
)
from agoradatatools.etl.transform.proteomics_distribution import (
create_proteomics_distribution_data,
)
from agoradatatools.etl.transform.rna_distribution import (
transform_rna_distribution_data,
transform_rna_seq_data,
)
from agoradatatools.etl.transform.team_info import transform_team_info

__all__ = [
"transform_distribution_data",
"transform_gene_info",
"transform_genes_biodomains",
"transform_overall_scores",
"create_proteomics_distribution_data",
"transform_rna_distribution_data",
"transform_rna_seq_data",
"transform_team_info",
]
110 changes: 110 additions & 0 deletions src/agoradatatools/etl/transform/distribution_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import pandas as pd
import numpy as np


def calculate_distribution(df: pd.DataFrame, col: str, is_scored, upper_bound) -> dict:
    """Compute a 10-bin histogram and summary statistics for one score column.

    Args:
        df: Frame holding the score column plus the "isscored_*" flag columns.
        col: Name of the numeric score column to summarize.
        is_scored: Name of the "isscored_*" flag column that gates which rows
            count for this score, or a falsy value (None) for scores without a
            dedicated flag (the overall score), in which case any row with a
            "Y" in any column is kept.
        upper_bound: Theoretical maximum of the score (from config), used to
            anchor the top histogram bin.

    Returns:
        dict with keys "distribution" (10 bin counts), "bins" (10
        (lower, upper) edge pairs rounded to 2 decimals), and "min", "max",
        "mean", "first_quartile", "third_quartile" rounded to 4 decimals.
    """
    if is_scored:
        df = df[df[is_scored] == "Y"]
    else:
        # No dedicated isscored column for this score: keep any row that is
        # scored in at least one category.
        df = df[df.isin(["Y"]).any(axis=1)]

    if df[col].dtype == object:
        df = df.copy()  # Necessary to prevent SettingWithCopy warning
        df[col] = df[col].astype(float)

    obj = {}

    # In order to smooth out the bins and make sure the entire range from 0
    # to the theoretical maximum value has been found, we bin a copy of the
    # column with sentinel 0 and upper_bound values appended; their counts are
    # subtracted again below.
    padded = pd.concat([df[col], pd.Series([0, upper_bound])], ignore_index=True)

    obj["distribution"] = list(
        pd.cut(
            padded, bins=10, precision=3, include_lowest=True, right=True
        ).value_counts(sort=False)
    )
    obj["distribution"][0] -= 1  # remove the artificial 0 from the first bin
    obj["distribution"][-1] -= 1  # remove the artificial upper_bound from the last bin

    # Only the bin edges are needed here; unpack the tuple directly instead of
    # going through list().
    _, bin_edges = pd.cut(padded, bins=10, precision=3, retbins=True)
    upper_edges = np.around(bin_edges.tolist()[1:], 2)
    lower_edges = [0, *upper_edges[:-1]]
    obj["bins"] = list(zip(lower_edges, upper_edges))

    obj["min"] = np.around(df[col].min(), 4)
    obj["max"] = np.around(df[col].max(), 4)
    obj["mean"] = np.around(df[col].mean(), 4)
    # BUG FIX: the quartiles were previously rounded with np.around's default
    # of 0 decimals (truncating e.g. 0.15 to 0.0); round to 4 decimals for
    # consistency with min/max/mean above.
    obj["first_quartile"] = np.around(
        df[col].quantile(q=0.25, interpolation="midpoint"), 4
    )
    obj["third_quartile"] = np.around(
        df[col].quantile(q=0.75, interpolation="midpoint"), 4
    )

    return obj


def transform_distribution_data(
    datasets: dict,
    overall_max_score,
    genetics_max_score,
    omics_max_score,
    lit_max_score,
):
    """Build distribution summaries for the four overall-score columns.

    For each score column in the "overall_scores" dataset, computes its
    distribution via ``calculate_distribution`` and attaches display metadata
    (name, Synapse id, wiki id).

    Args:
        datasets: Mapping of dataset name to DataFrame; must contain
            "overall_scores".
        overall_max_score: Config-supplied theoretical maximum for "overall".
        genetics_max_score: Maximum for "geneticsscore".
        omics_max_score: Maximum for "omicsscore".
        lit_max_score: Maximum for "literaturescore".

    Returns:
        dict keyed by display score name (e.g. "target_risk_score"), each
        value a distribution dict augmented with name/syn_id/wiki_id.
    """
    scores_df = datasets["overall_scores"]

    score_columns = ["overall", "geneticsscore", "omicsscore", "literaturescore"]
    isscored_columns = ["isscored_genetics", "isscored_omics", "isscored_lit"]

    # Every score except "overall" has a matching isscored_* flag column
    # marking whether a gene was scored in that category; missing values take
    # different shapes across the fields, so the flag is resolved per column.
    isscored_for = {"overall": None}
    isscored_for.update(zip(score_columns[1:], isscored_columns))

    # Theoretical maximum for each score, as supplied by the config.
    max_score_for = {
        "overall": overall_max_score,
        "geneticsscore": genetics_max_score,
        "omicsscore": omics_max_score,
        "literaturescore": lit_max_score,
    }

    scores_df = scores_df[["ensg"] + score_columns + isscored_columns]

    # The ENSG column itself is not scored, so it is excluded here.
    neo_matrix = {
        col: calculate_distribution(
            scores_df, col, isscored_for[col], max_score_for[col]
        )
        for col in score_columns
    }

    neo_matrix["target_risk_score"] = neo_matrix.pop("overall")
    neo_matrix["genetics_score"] = neo_matrix.pop("geneticsscore")
    neo_matrix["multi_omics_score"] = neo_matrix.pop("omicsscore")
    neo_matrix["literature_score"] = neo_matrix.pop("literaturescore")

    additional_data = [
        {"name": "Target Risk Score", "syn_id": "syn25913473", "wiki_id": "621071"},
        {"name": "Genetic Risk Score", "syn_id": "syn25913473", "wiki_id": "621069"},
        {"name": "Multi-omic Risk Score", "syn_id": "syn25913473", "wiki_id": "621070"},
        {"name": "Literature Score", "syn_id": "syn25913473", "wiki_id": "613105"},
    ]
    # Relies on dict insertion order matching the additional_data ordering.
    for score_name, extra in zip(neo_matrix.keys(), additional_data):
        neo_matrix[score_name]["name"] = extra["name"]
        neo_matrix[score_name]["syn_id"] = extra["syn_id"]
        neo_matrix[score_name]["wiki_id"] = extra["wiki_id"]

    return neo_matrix
151 changes: 151 additions & 0 deletions src/agoradatatools/etl/transform/gene_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import pandas as pd
import numpy as np

from agoradatatools.etl.utils import nest_fields


def transform_gene_info(
    datasets: dict, adjusted_p_value_threshold: float, protein_level_threshold: float
):
    """
    This function will perform transformations and incrementally create a dataset called gene_info.
    Each dataset will be left_joined onto gene_info, starting with gene_metadata.

    Args:
        datasets: Mapping of dataset name to DataFrame; must contain
            "gene_metadata", "igap", "eqtl", "proteomics",
            "rna_expression_change", "agora_proteomics_tmt", "target_list",
            "median_expression", and "druggability".
        adjusted_p_value_threshold: Max adj_p_val for a gene to count as a
            significant RNA change in AD brain.
        protein_level_threshold: Max cor_pval for a protein to count as a
            significant change in AD brain.

    Returns:
        A DataFrame with one row per Ensembl gene ID, restricted to the
        columns listed at the end of this function.
    """
    gene_metadata = datasets["gene_metadata"]
    igap = datasets["igap"]
    eqtl = datasets["eqtl"]
    proteomics = datasets["proteomics"]
    rna_change = datasets["rna_expression_change"]
    proteomics_tmt = datasets["agora_proteomics_tmt"]
    target_list = datasets["target_list"]
    median_expression = datasets["median_expression"]
    druggability = datasets["druggability"]

    # Modify the data before merging

    # All genes in this list should have 'is_igap' = True when added to gene_info.
    # Creating the column here automatically adds the column in to gene_info
    # during merge, with True values correctly populated.
    igap["is_igap"] = True

    # Get the smallest adj_p_val for each gene, to determine significance
    rna_change = (
        rna_change.groupby("ensembl_gene_id")["adj_p_val"].agg("min").reset_index()
    )

    # Get the smallest cor_pval for each protein, to determine significance.
    # Both proteomics sources are pooled first; rows missing any of the key
    # statistics are dropped before taking the per-gene minimum.
    proteomics_concat = pd.concat([proteomics, proteomics_tmt])
    proteomics_concat = proteomics_concat.dropna(
        subset=["log2_fc", "cor_pval", "ci_lwr", "ci_upr"]
    )
    proteomics_concat = (
        proteomics_concat.groupby("ensembl_gene_id")["cor_pval"]
        .agg("min")
        .reset_index()
    )

    # these are the interesting columns of the druggability dataset
    useful_columns = [
        "geneid",
        "sm_druggability_bucket",
        "safety_bucket",
        "abability_bucket",
        "pharos_class",
        "classification",
        "safety_bucket_definition",
        "abability_bucket_definition",
    ]
    druggability = druggability[useful_columns]

    # Collapse each per-gene group of rows into a single nested-record column
    # so the subsequent merges stay one row per gene.
    target_list = nest_fields(
        df=target_list, grouping="ensembl_gene_id", new_column="nominated_target"
    )

    median_expression = nest_fields(
        df=median_expression, grouping="ensembl_gene_id", new_column="median_expression"
    )

    druggability = nest_fields(
        df=druggability, grouping="geneid", new_column="druggability"
    )
    druggability.rename(columns={"geneid": "ensembl_gene_id"}, inplace=True)

    # Merge all the datasets

    gene_info = gene_metadata

    # Outer merge keeps genes that appear in any source; validate="one_to_one"
    # raises if any source still has duplicate ensembl_gene_id rows.
    for dataset in [
        igap,
        eqtl,
        rna_change,
        proteomics_concat,
        target_list,
        median_expression,
        druggability,
    ]:
        gene_info = pd.merge(
            left=gene_info,
            right=dataset,
            on="ensembl_gene_id",
            how="outer",
            validate="one_to_one",
        )

    # Populate values for rows that didn't exist in the individual datasets.
    # -1 is a sentinel meaning "not studied" for the p-value columns; it is
    # consumed by the *_studied flags below.
    gene_info.fillna(
        {"is_igap": False, "has_eqtl": False, "adj_p_val": -1, "cor_pval": -1},
        inplace=True,
    )

    # fillna doesn't work for creating an empty array, need this function instead
    gene_info["alias"] = gene_info.apply(
        lambda row: row["alias"]
        if isinstance(row["alias"], np.ndarray)
        else np.ndarray(0, dtype=object),
        axis=1,
    )

    # A gene was studied iff its p-value is not the -1 sentinel; "in AD brain
    # change" additionally requires the p-value to clear the threshold.
    gene_info["rna_brain_change_studied"] = gene_info["adj_p_val"] != -1
    gene_info["rna_in_ad_brain_change"] = (
        gene_info["adj_p_val"] <= adjusted_p_value_threshold
    ) & gene_info["rna_brain_change_studied"]

    gene_info["protein_brain_change_studied"] = gene_info["cor_pval"] != -1
    gene_info["protein_in_ad_brain_change"] = (
        gene_info["cor_pval"] <= protein_level_threshold
    ) & gene_info["protein_brain_change_studied"]

    # create 'nominations' field: the count of nominated-target records per
    # gene, or NaN for genes never nominated (non-list after the outer merge)
    gene_info["nominations"] = gene_info.apply(
        lambda row: len(row["nominated_target"])
        if isinstance(row["nominated_target"], list)
        else np.NaN,
        axis=1,
    )

    # Remove some extra columns that got added during merges
    gene_info = gene_info[
        [
            "ensembl_gene_id",
            "name",
            "summary",
            "symbol",
            "alias",
            "is_igap",
            "has_eqtl",
            "rna_in_ad_brain_change",
            "rna_brain_change_studied",
            "protein_in_ad_brain_change",
            "protein_brain_change_studied",
            "nominated_target",
            "median_expression",
            "druggability",
            "nominations",
        ]
    ]

    # Make sure there are no N/A Ensembl IDs
    gene_info = gene_info.dropna(subset=["ensembl_gene_id"])

    return gene_info
Loading

0 comments on commit 0684a29

Please sign in to comment.