Skip to content

Commit

Permalink
Added test case to cover user-reported bug in which a read that was s…
Browse files Browse the repository at this point in the history
…upposed to be Genomic was misclassified as Intergenic
  • Loading branch information
Dana Elizabeth Wyman committed Dec 5, 2019
1 parent 59567e9 commit 12525e1
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 0 deletions.
17 changes: 17 additions & 0 deletions testing_suite/build_test_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,23 @@
print(e)
sys.exit("Database initialization failed on chr22 annotation")

try:
os.system("mkdir -p scratch/multiexon_read_overlapping_monoexon_transcript")
subprocess.check_output(
["talon_initialize_database",
"--f", "input_files/multiexon_read_overlapping_monoexon_transcript/HMGB1P1.gtf",
"--a", "gencode_v29",
"--5p", "500",
"--3p", "300",
"--idprefix", "ENCODEH",
"--l", "300",
"--g", "hg38", "--o",
"scratch/multiexon_read_overlapping_monoexon_transcript/talon"])

except Exception as e:
print(e)
sys.exit("Database initialization failed on HMGB1P1 annotation")

# Actually perform the toy_mod run
try:
subprocess.check_output(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
chr20 HAVANA gene 57488392 57489027 . - . gene_id "ENSG00000124097.7"; gene_type "processed_pseudogene"; gene_name "HMGB1P1"; level 1; tag "pseudo_consens"; havana_gene "OTTHUMG00000032823.3";
chr20 HAVANA transcript 57488392 57489027 . - . gene_id "ENSG00000124097.7"; transcript_id "ENST00000522557.1"; gene_type "processed_pseudogene"; gene_name "HMGB1P1"; transcript_type "processed_pseudogene"; transcript_name "HMGB1P1-201"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTHUMG00000032823.3"; havana_transcript "OTTHUMT00000079848.3";
chr20 HAVANA exon 57488392 57489027 . - . gene_id "ENSG00000124097.7"; transcript_id "ENST00000522557.1"; gene_type "processed_pseudogene"; gene_name "HMGB1P1"; transcript_type "processed_pseudogene"; transcript_name "HMGB1P1-201"; exon_number 1; exon_id "ENSE00002091063.1"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTHUMG00000032823.3"; havana_transcript "OTTHUMT00000079848.3";
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
test,test,test,input_files/multiexon_read_overlapping_monoexon_transcript/read.sam
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
@HD VN:1.6 SO:coordinate
@SQ SN:chr20 LN:64444167
m54119_190202_095143/62849336/ccs 16 chr20 57487908 60 1997M5298N74M1S * 0 0 GCAAAAACAGTACGATTTATTTTTTTATTTTTTTAATAAAAAAATTGTATTTACTTAGAAGCATTCAGAATGTCAACAAAACAGCTGCAACTTTTTTTTTTTGCAATTACAGAGTGGTATTCAGTTAACAGAACAATTATTTCGTATAAGCTGCATCAGAGACAACTGAAGATGAAAAAACTACCATCCCCATATATAACTAATTTGTGCTGCGCACCAACAAGAACCTGCTTTAAATTTCCATGCCAATTTACAACCCCCATACTCTACCAGGCAAGGTTAGTGACTATTGAAAATACCACCAGGACAGGGCCATCTAAAGACACATTTGGTAGTGTATTAACCATACAAAAAACGACACTGTACAGTTTAAAAACAAATCTTACACAGCCTTACATTTCAATTTTTTTCTTTAAAAGGAGTGAGTTGTGTACAGGGGCGTTAAATGCTTTAAGACAAGAAAAAAAACTGCGCTAGAGCCAACTTATTCATCATCATCATCTTCTTCATCTTCTTCATCTTCCTCCTCCTCATCCTCTTCATCTTCCTCATCTTCCTCCTCTTCCTTCTTTTTCTTGCTTTTTTCAGCTTTGACAACTCCCTTTTTTGCTGCCTCAGGCTTTCCTTTAGCTTGATATGCAGCAATATCCTTTTCGTATTTTTCCTTCAGCTTCGCAGCCTTCTTTTCACCAGGCTGCTTGTCATCTGCAGCAGTGTTATTCCACATCTCTCCCAGTTTCTTCGCAACATCACCAATGGACAGGCCAGGATGTTCTCCTTTGATTTTTGGGTGATACTCAGAGCAGAACAGGAAGAAGGCCGAAGGAGGCCTCTTGGGTGCATTGGGATCCTTGAACTTCTTTTTTGTCTCCCCTTTGGGAGGGATATAGGTTTTCATTTGTCTTTCATAATGGGTCTTGTCCGCCTTTGCCATATCCTCAAATTTTCCTTTCTCTTTAGCAGACATGGTCTTCCACCTCTCTGAGCACTTGTTAGAAAACTCTGAGAAGTTGACTGAAGCATCTGAGTGCTTCTTCTTATGCTCCTCCCGACAAGTTTGCACAAAAAATGCATATGATGACATTTTGCCTCTCGGCTTCTTAGGATCTCCTTTGCCCATGTTTAGTTATTTTTCTTCAGCGAGGCACAGAGTCGCCCAGTGCCCGTACGGCTCTCACTTGCCCCGGCGCTGTCTCTATGGAGCTCAATGTACTGCAGTGGCTAACAGTACGATTTATAATAGCACTCTCCCCAAAGGAAAATCCATGACAAACTTTGATACAGATCCAACATCTGTGCAAGATGTCTACGATAGATACAAAACAATGATGAAAGAAATCAAAGAAGGCCTAAATGATTGGAGGGATACACTCTTCTGTTCACGGATTGGGAGACTCAGTATTGTTAAAGTGTAAAATTCTTTCCAAATTGATCTATACATTCATTCCCCCAAAGATCTCAGCAACATTTTCTTTTTGCATAAATTCACAAGTGGATTTGAAAATTTAGATGGAAAGGCAAAGGATTCAGAATCACCAAAAACAACTTTAGAAAAGAAGAACAAAGCTGGAGGACTTGCATTATCTGAGTTTGAGGCTGACTATACGCTGTCAAGACAGCCTGGTTCTGATGAAGGACTAGCTTGGGAAAGAATGGAGCTACAACACCCACACATGTGGCCAATACTTTTTTTTTTTTTTTTGAGACGAAGTCTCACTGTAGCCCAGGCTGGAGTGCAGTGGTGTGATCTTGGCTCACGACAACCTCCGCCTCCATGGCTCAAGCCATTCTTGTGCCTCAGCCTCTGAGGAGCAGGGACTACAGGTGTGCGCCACCACATCCAGATAATATTTTGTATTTTAGTAGAGATGAGGTTTCACCATGTTGCCCAGCGTGGTCTCAAACTCCTGAGCTCAGGCGATCTGTCCTCCTCGGCTTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGTGCCTGTCAATACATTTTAATACAGCAAGCATTCACGAACTCAGAGCCAACTGGTGTCCTCACTCCCAGGTTTTGCAAAGGTCCCTTCTCTTCCTTGCCC * ms:i:2008 AS:i:1984 nn:i:0 ts:A:+ tp:A:P cm:i:613 s1:i:1944 s2:i:988 de:f:0.0053 rl:i:291 NM:i:0 MD:Z:2071 jM:B:c,2 jI:B:i,57489905,57495202 RG:Z:test
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import pytest
from talon import talon, init_refs
import sqlite3
import pysam
@pytest.mark.integration

class TestMultExonReadOverlapsMonoGene(object):

def test_transcript_assigned_intergenic(self):
""" This test covers a case reported by a user where a read overlaps
the ~600bp mono-exonic pseudogene HMGB1P1. The read itself has
2 exons, the second of which contains the small pseudogene inside.
earlier versions of TALON classified the read as intergenic,
when it was actually supposed to be genomic """

# Set up references
database = "scratch/multiexon_read_overlapping_monoexon_transcript/talon.db"
conn = sqlite3.connect(database)
conn.row_factory = sqlite3.Row
cursor = conn.cursor()

build = "hg38"
talon.get_counters(database)
run_info = talon.init_run_info(database, build)
struct_collection = talon.prepare_data_structures(cursor, run_info)

# Use pysam to get the read from the SAM file
sam_file = "input_files/multiexon_read_overlapping_monoexon_transcript/read.sam"
with pysam.AlignmentFile(sam_file) as sam:
for entry in sam:
sam_record = entry
break

# Get read attributes
chrom = sam_record.reference_name
strand = "-" if sam_record.is_reverse else "+"
sam_start = sam_record.reference_start
sam_end = sam_record.reference_end

# Do we get any overlap with the reference gene?
best_gene, match_strand = talon.search_for_overlap_with_gene(chrom, min(sam_start, sam_end),
max(sam_start, sam_end), strand,
cursor, run_info,
struct_collection.tmp_gene)
assert best_gene == 1
assert match_strand == "-"

annotation_info = talon.annotate_read(sam_record, cursor, run_info,
struct_collection, mode = 0)

assert annotation_info['gene_ID'] == 1
assert annotation_info['transcript_ID'] == 2
assert 'genomic_transcript' in annotation_info['transcript_novelty'][0]

0 comments on commit 12525e1

Please sign in to comment.