Skip to content

Commit

Permalink
Update pipeline files for IBD browser (#85)
Browse files Browse the repository at this point in the history
Co-authored-by: Riley Grant <grantrileyh@gmail.com>
  • Loading branch information
mattsolo1 and rileyhgrant authored Feb 5, 2024
1 parent 438f87e commit aee0519
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 2 deletions.
Empty file.
83 changes: 83 additions & 0 deletions data_pipeline/data_pipeline/datasets/ibd/ibd_gene_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import hail as hl

from data_pipeline.config import pipeline_config


def prepare_gene_results():
results = hl.read_table(pipeline_config.get("IBD", "gene_results_path"))

results = results.select_globals()

# Select result fields, discard gene information
results = results.select(
"gene_id",
"analysis_group",
"burden_test",
"P",
"BETA",
"SE",
"freq",
"n_alleles_case",
"n_alleles_control",
"n_samples_case",
"n_samples_control",
"HetP",
)

# pylint: disable-next=anomalous-backslash-in-string
results = results.annotate(burden_test=results.burden_test.replace("\.", "_").replace("\+", "_").lower())

final_results = None

consequence_categories = results.aggregate(hl.agg.collect_as_set(results.burden_test))
per_category_fields = [
"P",
"BETA",
"SE",
"freq",
"n_alleles_case",
"n_alleles_control",
"n_samples_case",
"n_samples_control",
"HetP",
]
for category in consequence_categories:
category_results = results.filter(results.burden_test == category)
category_results = category_results.key_by("gene_id", "analysis_group")
category_results = category_results.select(
n_cases=category_results.n_samples_case,
n_controls=category_results.n_samples_control,
**{f"{category}_{field}": category_results[field] for field in per_category_fields},
)

if final_results:
final_results = final_results.join(category_results.drop("n_cases", "n_controls"), "outer")

# N cases/controls should be the same for all consequence categories for a gene/analysis group.
# However, if there are no variants of a certain consequence category found in a gene, then
# N cases/controls for that gene/analysis group/consequence category will be missing.
final_results = final_results.annotate(
n_cases=hl.or_else(
final_results.n_cases, category_results[final_results.gene_id, final_results.analysis_group].n_cases
),
n_controls=hl.or_else(
final_results.n_controls,
category_results[final_results.gene_id, final_results.analysis_group].n_controls,
),
)
else:
final_results = category_results

if final_results:
final_results = final_results.group_by("gene_id").aggregate(
group_results=hl.agg.collect(final_results.row.drop("gene_id"))
)
final_results = final_results.annotate(
group_results=hl.dict(
final_results.group_results.map(
lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
)
)
)

return final_results
57 changes: 57 additions & 0 deletions data_pipeline/data_pipeline/datasets/ibd/ibd_variant_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import hail as hl

from data_pipeline.config import pipeline_config


def prepare_variant_results():
results = hl.read_table(pipeline_config.get("IBD", "variant_results_path"))

# Get unique variants from results table
variants = results.group_by(results.locus, results.alleles).aggregate()

# Looks like ac_control was mistakenly encoded as a string, e.g. "[83198, 0]"
results = results.annotate(
# pylint: disable-next=anomalous-backslash-in-string, unnecessary-lambda
ac_control=hl.map(lambda x: hl.int(x), results.ac_control.replace("\[", "").replace("\]", "").split(", "))
)

# Select AC/AF numbers for the alternate allele
results = results.annotate(ac_case=results.ac_case[1], ac_ctrl=results.ac_control[1])

results = results.drop("ac_control")

results = results.filter((results.ac_case > 0) | (results.ac_ctrl > 0))

# Annotate variants with a struct for each analysis group
results = results.group_by("locus", "alleles").aggregate(group_results=hl.agg.collect(results.row_value))
results = results.annotate(
group_results=hl.dict(
results.group_results.map(
lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
)
)
)

variants = variants.annotate(**results[variants.locus, variants.alleles])

# Merge variant annotations for canonical transcripts
annotations = hl.read_table(pipeline_config.get("IBD", "variant_annotations_path"))

# Not actually sure which annotations we need
annotations = annotations.select(
gene_id=annotations.gene_id_canonical,
consequence=annotations.most_severe_consequence,
hgvsc=annotations.hgvsc_canonical.split(":")[-1],
hgvsp=annotations.hgvsp_canonical.split(":")[-1],
info=hl.struct(
cadd=annotations.cadd_phred,
revel=annotations.revel_score,
polyphen=annotations.polyphen_score_canonical,
splice_ai=annotations.splice_ai_score,
primate_ai=annotations.primate_ai_score,
),
)

variants = variants.annotate(**annotations[variants.locus, variants.alleles])

return variants
9 changes: 7 additions & 2 deletions data_pipeline/pipeline_config.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[datasets]
datasets = ASC,BipEx,Epi25,SCHEMA
datasets = ASC,BipEx,Epi25,SCHEMA,IBD

[ASC]
gene_results_path = gs://asc-browser/ASC_gene_results_table_for_browser_2019-04-14.tsv
Expand All @@ -25,6 +25,11 @@ gene_results_path = gs://schema-browser/200831/schema-results-table-supplement.t
variant_results_path = gs://schema-browser/200911/2020-09-11_schema-browser-variant-results-table-meta-rare-denovos-common-merged.ht
variant_annotations_path = gs://schema-browser/200911/2020-09-11_schema-browser-variant-annotation-table.ht

[IBD]
gene_results_path = gs://ibd-browser/09-11-2023/gene_based_results.ht
variant_results_path = gs://ibd-browser/09-11-2023/variants_results.ht
variant_annotations_path = gs://ibd-browser/09-11-2023/variants_annotations.ht

[reference_data]
grch37_gencode_path = gs://exome-results-browsers/reference/gencode.v19.gtf.bgz
grch38_gencode_path = gs://exome-results-browsers/reference/gencode.v29.gtf.bgz
Expand All @@ -44,4 +49,4 @@ service-account = erb-data-pipeline@exac-gnomad.iam.gserviceaccount.com

[output]
# Path for intermediate Hail files.
staging_path = gs://exome-results-browsers/data/230118
staging_path = gs://exome-results-browsers/data/231116

0 comments on commit aee0519

Please sign in to comment.