diff --git a/testing_suite/build_test_databases.py b/testing_suite/build_test_databases.py index 8c95375..248acb4 100644 --- a/testing_suite/build_test_databases.py +++ b/testing_suite/build_test_databases.py @@ -89,6 +89,23 @@ print(e) sys.exit("Database initialization failed on chr22 annotation") +try: + os.system("mkdir -p scratch/multiexon_read_overlapping_monoexon_transcript") + subprocess.check_output( + ["talon_initialize_database", + "--f", "input_files/multiexon_read_overlapping_monoexon_transcript/HMGB1P1.gtf", + "--a", "gencode_v29", + "--5p", "500", + "--3p", "300", + "--idprefix", "ENCODEH", + "--l", "300", + "--g", "hg38", "--o", + "scratch/multiexon_read_overlapping_monoexon_transcript/talon"]) + +except Exception as e: + print(e) + sys.exit("Database initialization failed on HMGB1P1 annotation") + # Actually perform the toy_mod run try: subprocess.check_output( diff --git a/testing_suite/input_files/multiexon_read_overlapping_monoexon_transcript/HMGB1P1.gtf b/testing_suite/input_files/multiexon_read_overlapping_monoexon_transcript/HMGB1P1.gtf new file mode 100644 index 0000000..af7304f --- /dev/null +++ b/testing_suite/input_files/multiexon_read_overlapping_monoexon_transcript/HMGB1P1.gtf @@ -0,0 +1,3 @@ +chr20 HAVANA gene 57488392 57489027 . - . gene_id "ENSG00000124097.7"; gene_type "processed_pseudogene"; gene_name "HMGB1P1"; level 1; tag "pseudo_consens"; havana_gene "OTTHUMG00000032823.3"; +chr20 HAVANA transcript 57488392 57489027 . - . gene_id "ENSG00000124097.7"; transcript_id "ENST00000522557.1"; gene_type "processed_pseudogene"; gene_name "HMGB1P1"; transcript_type "processed_pseudogene"; transcript_name "HMGB1P1-201"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTHUMG00000032823.3"; havana_transcript "OTTHUMT00000079848.3"; +chr20 HAVANA exon 57488392 57489027 . - . gene_id "ENSG00000124097.7"; transcript_id "ENST00000522557.1"; gene_type "processed_pseudogene"; gene_name "HMGB1P1"; transcript_type "processed_pseudogene"; transcript_name "HMGB1P1-201"; exon_number 1; exon_id "ENSE00002091063.1"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTHUMG00000032823.3"; havana_transcript "OTTHUMT00000079848.3"; diff --git a/testing_suite/input_files/multiexon_read_overlapping_monoexon_transcript/config.csv b/testing_suite/input_files/multiexon_read_overlapping_monoexon_transcript/config.csv new file mode 100644 index 0000000..41a31ce --- /dev/null +++ b/testing_suite/input_files/multiexon_read_overlapping_monoexon_transcript/config.csv @@ -0,0 +1 @@ +test,test,test,input_files/multiexon_read_overlapping_monoexon_transcript/read.sam diff --git a/testing_suite/input_files/multiexon_read_overlapping_monoexon_transcript/read.sam b/testing_suite/input_files/multiexon_read_overlapping_monoexon_transcript/read.sam new file mode 100644 index 0000000..f61d7a9 --- /dev/null +++ b/testing_suite/input_files/multiexon_read_overlapping_monoexon_transcript/read.sam @@ -0,0 +1,3 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:chr20 LN:64444167 +m54119_190202_095143/62849336/ccs 16 chr20 57487908 60 1997M5298N74M1S * 0 0 GCAAAAACAGTACGATTTATTTTTTTATTTTTTTAATAAAAAAATTGTATTTACTTAGAAGCATTCAGAATGTCAACAAAACAGCTGCAACTTTTTTTTTTTGCAATTACAGAGTGGTATTCAGTTAACAGAACAATTATTTCGTATAAGCTGCATCAGAGACAACTGAAGATGAAAAAACTACCATCCCCATATATAACTAATTTGTGCTGCGCACCAACAAGAACCTGCTTTAAATTTCCATGCCAATTTACAACCCCCATACTCTACCAGGCAAGGTTAGTGACTATTGAAAATACCACCAGGACAGGGCCATCTAAAGACACATTTGGTAGTGTATTAACCATACAAAAAACGACACTGTACAGTTTAAAAACAAATCTTACACAGCCTTACATTTCAATTTTTTTCTTTAAAAGGAGTGAGTTGTGTACAGGGGCGTTAAATGCTTTAAGACAAGAAAAAAAACTGCGCTAGAGCCAACTTATTCATCATCATCATCTTCTTCATCTTCTTCATCTTCCTCCTCCTCATCCTCTTCATCTTCCTCATCTTCCTCCTCTTCCTTCTTTTTCTTGCTTTTTTCAGCTTTGACAACTCCCTTTTTTGCTGCCTCAGGCTTTCCTTTAGCTTGATATGCAGCAATATCCTTTTCGTATTTTTCCTTCAGCTTCGCAGCCTTCTTTTCACCAGGCTGCTTGTCATCTGCAGCAGTGTTATTCCACATCTCTCCCAGTTTCTTCGCAACATCACCAATGGACAGGCCAGGATGTTCTCCTTTGATTTTTGGGTGATACTCAGAGCAGAACAGGAAGAAGGCCGAAGGAGGCCTCTTGGGTGCATTGGGATCCTTGAACTTCTTTTTTGTCTCCCCTTTGGGAGGGATATAGGTTTTCATTTGTCTTTCATAATGGGTCTTGTCCGCCTTTGCCATATCCTCAAATTTTCCTTTCTCTTTAGCAGACATGGTCTTCCACCTCTCTGAGCACTTGTTAGAAAACTCTGAGAAGTTGACTGAAGCATCTGAGTGCTTCTTCTTATGCTCCTCCCGACAAGTTTGCACAAAAAATGCATATGATGACATTTTGCCTCTCGGCTTCTTAGGATCTCCTTTGCCCATGTTTAGTTATTTTTCTTCAGCGAGGCACAGAGTCGCCCAGTGCCCGTACGGCTCTCACTTGCCCCGGCGCTGTCTCTATGGAGCTCAATGTACTGCAGTGGCTAACAGTACGATTTATAATAGCACTCTCCCCAAAGGAAAATCCATGACAAACTTTGATACAGATCCAACATCTGTGCAAGATGTCTACGATAGATACAAAACAATGATGAAAGAAATCAAAGAAGGCCTAAATGATTGGAGGGATACACTCTTCTGTTCACGGATTGGGAGACTCAGTATTGTTAAAGTGTAAAATTCTTTCCAAATTGATCTATACATTCATTCCCCCAAAGATCTCAGCAACATTTTCTTTTTGCATAAATTCACAAGTGGATTTGAAAATTTAGATGGAAAGGCAAAGGATTCAGAATCACCAAAAACAACTTTAGAAAAGAAGAACAAAGCTGGAGGACTTGCATTATCTGAGTTTGAGGCTGACTATACGCTGTCAAGACAGCCTGGTTCTGATGAAGGACTAGCTTGGGAAAGAATGGAGCTACAACACCCACACATGTGGCCAATACTTTTTTTTTTTTTTTTGAGACGAAGTCTCACTGTAGCCCAGGCTGGAGTGCAGTGGTGTGATCTTGGCTCACGACAACCTCCGCCTCCATGGCTCAAGCCATTCTTGTGCCTCAGCCTCTGAGGAGCAGGGACTACAGGTGTGCGCCACCACATCCAGATAATATTTTGTATTTTAGTAGAGATGAGGTTTCACCATGTTGCCCAGCGTGGTCTCAAACTCCTGAGCTCAGGCGATCTGTCCTCCTCGGCTTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACCGTGCCTGTCAATACATTTTAATACAGCAAGCATTCACGAACTCAGAGCCAACTGGTGTCCTCACTCCCAGGTTTTGCAAAGGTCCCTTCTCTTCCTTGCCC * ms:i:2008 AS:i:1984 nn:i:0 ts:A:+ tp:A:P cm:i:613 s1:i:1944 s2:i:988 de:f:0.0053 rl:i:291 NM:i:0 MD:Z:2071 jM:B:c,2 jI:B:i,57489905,57495202 RG:Z:test diff --git a/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py b/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py new file mode 100644 index 0000000..7ff6a21 --- /dev/null +++ b/testing_suite/test_multiexon_read_overlapping_monoexon_transcript.py @@ -0,0 +1,54 @@ +import pytest +from talon import talon, init_refs +import sqlite3 +import pysam +@pytest.mark.integration + +class TestMultExonReadOverlapsMonoGene(object): + + def test_transcript_assigned_intergenic(self): + """ This test covers a case reported by a user where a read overlaps + the ~600bp mono-exonic pseudogene HMGB1P1. The read itself has + 2 exons, the second of which contains the small pseudogene inside. + earlier versions of TALON classified the read as intergenic, + when it was actually supposed to be genomic """ + + # Set up references + database = "scratch/multiexon_read_overlapping_monoexon_transcript/talon.db" + conn = sqlite3.connect(database) + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + build = "hg38" + talon.get_counters(database) + run_info = talon.init_run_info(database, build) + struct_collection = talon.prepare_data_structures(cursor, run_info) + + # Use pysam to get the read from the SAM file + sam_file = "input_files/multiexon_read_overlapping_monoexon_transcript/read.sam" + with pysam.AlignmentFile(sam_file) as sam: + for entry in sam: + sam_record = entry + break + + # Get read attributes + chrom = sam_record.reference_name + strand = "-" if sam_record.is_reverse else "+" + sam_start = sam_record.reference_start + sam_end = sam_record.reference_end + + # Do we get any overlap with the reference gene? + best_gene, match_strand = talon.search_for_overlap_with_gene(chrom, min(sam_start, sam_end), + max(sam_start, sam_end), strand, + cursor, run_info, + struct_collection.tmp_gene) + assert best_gene == 1 + assert match_strand == "-" + + annotation_info = talon.annotate_read(sam_record, cursor, run_info, + struct_collection, mode = 0) + + assert annotation_info['gene_ID'] == 1 + assert annotation_info['transcript_ID'] == 2 + assert 'genomic_transcript' in annotation_info['transcript_novelty'][0] +