-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update pipeline files for IBD browser (#85)
Co-authored-by: Riley Grant <grantrileyh@gmail.com>
- Loading branch information
1 parent
438f87e
commit aee0519
Showing
4 changed files
with
147 additions
and
2 deletions.
There are no files selected for viewing
Empty file.
83 changes: 83 additions & 0 deletions
83
data_pipeline/data_pipeline/datasets/ibd/ibd_gene_results.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import hail as hl | ||
|
||
from data_pipeline.config import pipeline_config | ||
|
||
|
||
def prepare_gene_results(): | ||
results = hl.read_table(pipeline_config.get("IBD", "gene_results_path")) | ||
|
||
results = results.select_globals() | ||
|
||
# Select result fields, discard gene information | ||
results = results.select( | ||
"gene_id", | ||
"analysis_group", | ||
"burden_test", | ||
"P", | ||
"BETA", | ||
"SE", | ||
"freq", | ||
"n_alleles_case", | ||
"n_alleles_control", | ||
"n_samples_case", | ||
"n_samples_control", | ||
"HetP", | ||
) | ||
|
||
# pylint: disable-next=anomalous-backslash-in-string | ||
results = results.annotate(burden_test=results.burden_test.replace("\.", "_").replace("\+", "_").lower()) | ||
|
||
final_results = None | ||
|
||
consequence_categories = results.aggregate(hl.agg.collect_as_set(results.burden_test)) | ||
per_category_fields = [ | ||
"P", | ||
"BETA", | ||
"SE", | ||
"freq", | ||
"n_alleles_case", | ||
"n_alleles_control", | ||
"n_samples_case", | ||
"n_samples_control", | ||
"HetP", | ||
] | ||
for category in consequence_categories: | ||
category_results = results.filter(results.burden_test == category) | ||
category_results = category_results.key_by("gene_id", "analysis_group") | ||
category_results = category_results.select( | ||
n_cases=category_results.n_samples_case, | ||
n_controls=category_results.n_samples_control, | ||
**{f"{category}_{field}": category_results[field] for field in per_category_fields}, | ||
) | ||
|
||
if final_results: | ||
final_results = final_results.join(category_results.drop("n_cases", "n_controls"), "outer") | ||
|
||
# N cases/controls should be the same for all consequence categories for a gene/analysis group. | ||
# However, if there are no variants of a certain consequence category found in a gene, then | ||
# N cases/controls for that gene/analysis group/consequence category will be missing. | ||
final_results = final_results.annotate( | ||
n_cases=hl.or_else( | ||
final_results.n_cases, category_results[final_results.gene_id, final_results.analysis_group].n_cases | ||
), | ||
n_controls=hl.or_else( | ||
final_results.n_controls, | ||
category_results[final_results.gene_id, final_results.analysis_group].n_controls, | ||
), | ||
) | ||
else: | ||
final_results = category_results | ||
|
||
if final_results: | ||
final_results = final_results.group_by("gene_id").aggregate( | ||
group_results=hl.agg.collect(final_results.row.drop("gene_id")) | ||
) | ||
final_results = final_results.annotate( | ||
group_results=hl.dict( | ||
final_results.group_results.map( | ||
lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group")) | ||
) | ||
) | ||
) | ||
|
||
return final_results |
57 changes: 57 additions & 0 deletions
57
data_pipeline/data_pipeline/datasets/ibd/ibd_variant_results.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import hail as hl | ||
|
||
from data_pipeline.config import pipeline_config | ||
|
||
|
||
def prepare_variant_results(): | ||
results = hl.read_table(pipeline_config.get("IBD", "variant_results_path")) | ||
|
||
# Get unique variants from results table | ||
variants = results.group_by(results.locus, results.alleles).aggregate() | ||
|
||
# Looks like ac_control was mistakenly encoded as a string, e.g. "[83198, 0]" | ||
results = results.annotate( | ||
# pylint: disable-next=anomalous-backslash-in-string, unnecessary-lambda | ||
ac_control=hl.map(lambda x: hl.int(x), results.ac_control.replace("\[", "").replace("\]", "").split(", ")) | ||
) | ||
|
||
# Select AC/AF numbers for the alternate allele | ||
results = results.annotate(ac_case=results.ac_case[1], ac_ctrl=results.ac_control[1]) | ||
|
||
results = results.drop("ac_control") | ||
|
||
results = results.filter((results.ac_case > 0) | (results.ac_ctrl > 0)) | ||
|
||
# Annotate variants with a struct for each analysis group | ||
results = results.group_by("locus", "alleles").aggregate(group_results=hl.agg.collect(results.row_value)) | ||
results = results.annotate( | ||
group_results=hl.dict( | ||
results.group_results.map( | ||
lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group")) | ||
) | ||
) | ||
) | ||
|
||
variants = variants.annotate(**results[variants.locus, variants.alleles]) | ||
|
||
# Merge variant annotations for canonical transcripts | ||
annotations = hl.read_table(pipeline_config.get("IBD", "variant_annotations_path")) | ||
|
||
# Not actually sure which annotations we need | ||
annotations = annotations.select( | ||
gene_id=annotations.gene_id_canonical, | ||
consequence=annotations.most_severe_consequence, | ||
hgvsc=annotations.hgvsc_canonical.split(":")[-1], | ||
hgvsp=annotations.hgvsp_canonical.split(":")[-1], | ||
info=hl.struct( | ||
cadd=annotations.cadd_phred, | ||
revel=annotations.revel_score, | ||
polyphen=annotations.polyphen_score_canonical, | ||
splice_ai=annotations.splice_ai_score, | ||
primate_ai=annotations.primate_ai_score, | ||
), | ||
) | ||
|
||
variants = variants.annotate(**annotations[variants.locus, variants.alleles]) | ||
|
||
return variants |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters