Update pipeline files for IBD browser (#85)

Co-authored-by: Riley Grant <grantrileyh@gmail.com>
broadinstitute · Feb 5, 2024 · aee0519 · aee0519
1 parent 438f87e
commit aee0519
Show file tree

Hide file tree

Showing 4 changed files with 147 additions and 2 deletions.
diff --git a/data_pipeline/data_pipeline/datasets/ibd/__init__.py b/data_pipeline/data_pipeline/datasets/ibd/__init__.py
diff --git a/data_pipeline/data_pipeline/datasets/ibd/ibd_gene_results.py b/data_pipeline/data_pipeline/datasets/ibd/ibd_gene_results.py
@@ -0,0 +1,83 @@
+import hail as hl
+
+from data_pipeline.config import pipeline_config
+
+
+def prepare_gene_results():
+    results = hl.read_table(pipeline_config.get("IBD", "gene_results_path"))
+
+    results = results.select_globals()
+
+    # Select result fields, discard gene information
+    results = results.select(
+        "gene_id",
+        "analysis_group",
+        "burden_test",
+        "P",
+        "BETA",
+        "SE",
+        "freq",
+        "n_alleles_case",
+        "n_alleles_control",
+        "n_samples_case",
+        "n_samples_control",
+        "HetP",
+    )
+
+    # pylint: disable-next=anomalous-backslash-in-string
+    results = results.annotate(burden_test=results.burden_test.replace("\.", "_").replace("\+", "_").lower())
+
+    final_results = None
+
+    consequence_categories = results.aggregate(hl.agg.collect_as_set(results.burden_test))
+    per_category_fields = [
+        "P",
+        "BETA",
+        "SE",
+        "freq",
+        "n_alleles_case",
+        "n_alleles_control",
+        "n_samples_case",
+        "n_samples_control",
+        "HetP",
+    ]
+    for category in consequence_categories:
+        category_results = results.filter(results.burden_test == category)
+        category_results = category_results.key_by("gene_id", "analysis_group")
+        category_results = category_results.select(
+            n_cases=category_results.n_samples_case,
+            n_controls=category_results.n_samples_control,
+            **{f"{category}_{field}": category_results[field] for field in per_category_fields},
+        )
+
+        if final_results:
+            final_results = final_results.join(category_results.drop("n_cases", "n_controls"), "outer")
+
+            # N cases/controls should be the same for all consequence categories for a gene/analysis group.
+            # However, if there are no variants of a certain consequence category found in a gene, then
+            # N cases/controls for that gene/analysis group/consequence category will be missing.
+            final_results = final_results.annotate(
+                n_cases=hl.or_else(
+                    final_results.n_cases, category_results[final_results.gene_id, final_results.analysis_group].n_cases
+                ),
+                n_controls=hl.or_else(
+                    final_results.n_controls,
+                    category_results[final_results.gene_id, final_results.analysis_group].n_controls,
+                ),
+            )
+        else:
+            final_results = category_results
+
+    if final_results:
+        final_results = final_results.group_by("gene_id").aggregate(
+            group_results=hl.agg.collect(final_results.row.drop("gene_id"))
+        )
+        final_results = final_results.annotate(
+            group_results=hl.dict(
+                final_results.group_results.map(
+                    lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
+                )
+            )
+        )
+
+    return final_results
diff --git a/data_pipeline/data_pipeline/datasets/ibd/ibd_variant_results.py b/data_pipeline/data_pipeline/datasets/ibd/ibd_variant_results.py
@@ -0,0 +1,57 @@
+import hail as hl
+
+from data_pipeline.config import pipeline_config
+
+
+def prepare_variant_results():
+    results = hl.read_table(pipeline_config.get("IBD", "variant_results_path"))
+
+    # Get unique variants from results table
+    variants = results.group_by(results.locus, results.alleles).aggregate()
+
+    # Looks like ac_control was mistakenly encoded as a string, e.g. "[83198, 0]"
+    results = results.annotate(
+        # pylint: disable-next=anomalous-backslash-in-string, unnecessary-lambda
+        ac_control=hl.map(lambda x: hl.int(x), results.ac_control.replace("\[", "").replace("\]", "").split(", "))
+    )
+
+    # Select AC/AF numbers for the alternate allele
+    results = results.annotate(ac_case=results.ac_case[1], ac_ctrl=results.ac_control[1])
+
+    results = results.drop("ac_control")
+
+    results = results.filter((results.ac_case > 0) | (results.ac_ctrl > 0))
+
+    # Annotate variants with a struct for each analysis group
+    results = results.group_by("locus", "alleles").aggregate(group_results=hl.agg.collect(results.row_value))
+    results = results.annotate(
+        group_results=hl.dict(
+            results.group_results.map(
+                lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))
+            )
+        )
+    )
+
+    variants = variants.annotate(**results[variants.locus, variants.alleles])
+
+    # Merge variant annotations for canonical transcripts
+    annotations = hl.read_table(pipeline_config.get("IBD", "variant_annotations_path"))
+
+    # Not actually sure which annotations we need
+    annotations = annotations.select(
+        gene_id=annotations.gene_id_canonical,
+        consequence=annotations.most_severe_consequence,
+        hgvsc=annotations.hgvsc_canonical.split(":")[-1],
+        hgvsp=annotations.hgvsp_canonical.split(":")[-1],
+        info=hl.struct(
+            cadd=annotations.cadd_phred,
+            revel=annotations.revel_score,
+            polyphen=annotations.polyphen_score_canonical,
+            splice_ai=annotations.splice_ai_score,
+            primate_ai=annotations.primate_ai_score,
+        ),
+    )
+
+    variants = variants.annotate(**annotations[variants.locus, variants.alleles])
+
+    return variants
diff --git a/data_pipeline/pipeline_config.ini b/data_pipeline/pipeline_config.ini
@@ -1,5 +1,5 @@
 [datasets]
-datasets = ASC,BipEx,Epi25,SCHEMA
+datasets = ASC,BipEx,Epi25,SCHEMA,IBD
 
 [ASC]
 gene_results_path = gs://asc-browser/ASC_gene_results_table_for_browser_2019-04-14.tsv
@@ -25,6 +25,11 @@ gene_results_path = gs://schema-browser/200831/schema-results-table-supplement.t
 variant_results_path = gs://schema-browser/200911/2020-09-11_schema-browser-variant-results-table-meta-rare-denovos-common-merged.ht
 variant_annotations_path = gs://schema-browser/200911/2020-09-11_schema-browser-variant-annotation-table.ht
 
+[IBD]
+gene_results_path = gs://ibd-browser/09-11-2023/gene_based_results.ht 
+variant_results_path = gs://ibd-browser/09-11-2023/variants_results.ht 
+variant_annotations_path = gs://ibd-browser/09-11-2023/variants_annotations.ht 
+
 [reference_data]
 grch37_gencode_path = gs://exome-results-browsers/reference/gencode.v19.gtf.bgz
 grch38_gencode_path = gs://exome-results-browsers/reference/gencode.v29.gtf.bgz
@@ -44,4 +49,4 @@ service-account = erb-data-pipeline@exac-gnomad.iam.gserviceaccount.com
 
 [output]
 # Path for intermediate Hail files.
-staging_path = gs://exome-results-browsers/data/230118
+staging_path = gs://exome-results-browsers/data/231116