Merge pull request #87 from bergmanlab/tebreak
adding tebreak as an additional method. Reformatting config files to be consistent across methods. Updating documentation.
pbasting committed Jun 8, 2021
2 parents a393b73 + a6e0273 commit 93369ef
Showing 92 changed files with 2,064 additions and 908 deletions.
5 changes: 4 additions & 1 deletion Snakefile
@@ -11,6 +11,7 @@ include: config['args']['mcc_path']+"/snakefiles/retroseq.snakefile"
include: config['args']['mcc_path']+"/snakefiles/tepid.snakefile"
include: config['args']['mcc_path']+"/snakefiles/teflon.snakefile"
include: config['args']['mcc_path']+"/snakefiles/jitterbug.snakefile"
include: config['args']['mcc_path']+"/snakefiles/tebreak.snakefile"

rule setup_reads:
input:
@@ -223,7 +224,9 @@ rule sam_to_bam:
output:
bam = config['mcc']['bam'],
flagstat = config['mcc']['flagstat'],
tmp_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp.bam")
metrics = config['mcc']['mcc_files']+config['args']['run_id']+".metrics",
tmp_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp.bam"),
tmp2_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp2.bam")

script:
config['args']['mcc_path']+"/scripts/preprocessing/sam_to_bam.py"
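Note: the reworked sam_to_bam rule now emits a .metrics report and a second temporary BAM, which suggests an extra BAM post-processing step (likely duplicate marking, since tebreak expects duplicate-marked alignments). A minimal sketch of what such a step could look like inside scripts/preprocessing/sam_to_bam.py, assuming picard MarkDuplicates; the tool choice and paths are assumptions, not confirmed by this diff:

    import subprocess

    def mark_duplicates(sorted_bam, marked_bam, metrics_file):
        # MarkDuplicates writes a duplicate-marked BAM plus a metrics report,
        # which would account for the new ".tmp2.bam" and ".metrics" outputs.
        cmd = ["picard", "MarkDuplicates",
               "I=" + sorted_bam, "O=" + marked_bam, "M=" + metrics_file]
        subprocess.check_call(cmd)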
3 changes: 2 additions & 1 deletion auxiliary/simulation/mcc_sim.yml
@@ -10,4 +10,5 @@ dependencies:
- wgsim
- seqtk
- samtools
- bedtools
- bedtools
- art
25 changes: 22 additions & 3 deletions auxiliary/simulation/mcclintock_simulation.py
@@ -43,7 +43,7 @@ def main():
fastq2 = modified_reference.replace(".fasta", "_2.fastq")
if not os.path.exists(fastq1) or not os.path.exists(fastq2):
num_pairs = calculate_num_pairs(modified_reference, args.coverage, args.length, single=args.single)
fastq1, fastq2 = create_synthetic_reads(modified_reference, num_pairs, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed)
fastq1, fastq2 = create_synthetic_reads(args.sim, modified_reference, args.coverage, num_pairs, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed)

if not os.path.exists(summary_report):
run_mcclintock(fastq1, fastq2, args.reference, args.consensus, args.locations, args.taxonomy, x, args.proc, args.out, args.config, args.keep_intermediate, run_id=args.runid, single=args.single, reverse=reverse)
@@ -71,6 +71,7 @@ def parse_args():
parser.add_argument("--end", type=int, help="The number of replicates to run. [default = 300]", required=False)
parser.add_argument("--seed", type=str, help="a seed to the random number generator so runs can be replicated", required=False)
parser.add_argument("--runid", type=str, help="a string to prepend to output files so that multiple runs can be run at the same time without file name clashes", required=False)
parser.add_argument("--sim", type=str, help="Short read simulator to use (options=wgsim,art) [default = wgsim]", required=False)
parser.add_argument("-s","--single", action="store_true", help="runs the simulation in single ended mode", required=False)

args = parser.parse_args()
@@ -142,6 +143,13 @@ def parse_args():
elif args.strand != "plus" and args.strand != "minus":
sys.exit("ERROR: --strand must be plus or minus \n")


if args.sim is None:
args.sim = "wgsim"
elif args.sim not in ["wgsim", "art"]:
sys.exit("ERROR: --sim must be wgsim or art \n")


return args

def run_command(cmd_list, log=None):
@@ -374,20 +382,31 @@ def calculate_num_pairs(fasta, coverage, length, single=False):

return num_pairs

def create_synthetic_reads(reference, num_pairs, length, insert, error, rep, out, run_id="", seed=None):
def create_synthetic_reads(simulator, reference, coverage, num_pairs, length, insert, error, rep, out, run_id="", seed=None):
if seed is not None:
random.seed(seed+"create_synthetic_reads"+str(rep))
else:
random.seed(str(datetime.now())+"create_synthetic_reads"+str(rep))

seed_for_wgsim = random.randint(0,1000)

tmp_fastq1 = reference.replace(".fasta", "") + "1.fq"
tmp_fastq2 = reference.replace(".fasta", "") + "2.fq"

fastq1 = reference.replace(".fasta", "_1.fastq")
fastq2 = reference.replace(".fasta", "_2.fastq")
report = reference.replace(".fasta", "_wgsim_report.txt")

command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, fastq1, fastq2]
if simulator == "wgsim":
command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, tmp_fastq1, tmp_fastq2]

else:
command = ["art_illumina", "-ss", "HS25", "--rndSeed", str(seed_for_wgsim), "-sam", "-i", reference, "-p", "-l", str(length), "-f", str(coverage), "-m", str(insert), "-s", "10", "-o", reference.replace(".fasta", "")]


run_command_stdout(command, report)
run_command(["mv", tmp_fastq1, fastq1])
run_command(["mv", tmp_fastq2, fastq2])

return fastq1, fastq2

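Note: both simulator branches leave their raw reads at <prefix>1.fq and <prefix>2.fq (wgsim writes there directly; art_illumina derives those names from its -o prefix in paired-end mode), and the mv calls then normalize them to the _1.fastq/_2.fastq names the rest of the pipeline expects. A standalone sketch of that normalization step:

    import subprocess

    def normalize_fastq_names(reference):
        # wgsim and art_illumina (paired-end, -o <prefix>) both leave reads at
        # <prefix>1.fq / <prefix>2.fq; rename them to the _1.fastq / _2.fastq
        # names used by the downstream mcclintock run.
        prefix = reference.replace(".fasta", "")
        fastq1 = prefix + "_1.fastq"
        fastq2 = prefix + "_2.fastq"
        subprocess.check_call(["mv", prefix + "1.fq", fastq1])
        subprocess.check_call(["mv", prefix + "2.fq", fastq2])
        return fastq1, fastq2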
11 changes: 9 additions & 2 deletions auxiliary/simulation/mcclintock_simulation_analysis.py
@@ -317,8 +317,15 @@ def write_combined_metrics(forward_metrics, reverse_metrics, outfile):
with open(outfile, "w") as out:
out.write(",".join(forward_metrics[0]) + "\n")
for method in metrics.keys():
precision = metrics[method][0] / (metrics[method][0] + metrics[method][1])
recall = metrics[method][0] / (metrics[method][0] + metrics[method][2])
if metrics[method][0] + metrics[method][1] > 0:
precision = metrics[method][0] / (metrics[method][0] + metrics[method][1])
else:
precision = 0

if metrics[method][0] + metrics[method][2] > 0:
recall = metrics[method][0] / (metrics[method][0] + metrics[method][2])
else:
recall = 0
outline = [method, str(metrics[method][0]), str(metrics[method][1]), str(metrics[method][2]), str(precision), str(recall)]
out.write(",".join(outline) + "\n")

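Note: the new guards avoid a ZeroDivisionError for methods with no predictions. Assuming the per-method counters are ordered [true positives, false positives, false negatives], the computation is the usual precision = TP/(TP+FP) and recall = TP/(TP+FN), defaulting to 0 when a denominator is 0:

    def precision_recall(tp, fp, fn):
        # Mirror the guarded divisions above: undefined ratios become 0.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        return precision, recall

    # Example: 8 true positives, 2 false positives, 0 false negatives
    print(precision_recall(8, 2, 0))  # (0.8, 1.0)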
2 changes: 1 addition & 1 deletion auxiliary/simulation/yeast/yeast_config.json
@@ -20,6 +20,6 @@

"mcclintock": {
"path" : "/home/pjb68507/git//mcclintock/",
"methods" : "ngs_te_mapper,ngs_te_mapper2,relocate,relocate2,temp,temp2,retroseq,popoolationte,popoolationte2,te-locate,teflon"
"methods" : "ngs_te_mapper,ngs_te_mapper2,relocate,relocate2,temp,temp2,retroseq,popoolationte,popoolationte2,te-locate,teflon,tebreak"
}
}
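Note: the simulation config lists the component methods as a single comma-separated string, with tebreak now included. Presumably the simulation driver splits this string before handing the methods to mcclintock; a minimal sketch under that assumption:

    import json

    with open("yeast_config.json") as handle:
        sim_config = json.load(handle)

    # tebreak now appears alongside the other component methods.
    methods = sim_config["mcclintock"]["methods"].split(",")
    assert "tebreak" in methods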
19 changes: 9 additions & 10 deletions config/coverage/coverage.py
@@ -1,10 +1,9 @@

# omits the edges of the TE sequence coverage when calculating average depth across the element
OMIT_EDGES = True

# If OMIT_EDGES = True, use the read length as the length of the edges to omit
OMIT_EDGES_READ_LENGTH = True

# IF OMIT_EDGES = True and OMIT_EDGES_READ_LENGTH = False
# use this value as the length of the edges to omit
OMIT_EDGES_LENGTH = 300
PARAMS = {
# omits the edges of the TE sequence coverage when calculating average depth across the element
"omit_edges": True,
# If OMIT_EDGES = True, use the read length as the length of the edges to omit
"omit_edges_read_length" : True,
# IF OMIT_EDGES = True and OMIT_EDGES_READ_LENGTH = False
# use this value as the length of the edges to omit
"omit_edges_length" : 300
}
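Note: with every config file now exposing a single PARAMS dictionary, a consumer can load any of them the same way. A hypothetical loader sketch using importlib (the actual mechanism mcclintock uses to read these modules may differ):

    import importlib.util

    def load_params(config_path):
        # Import a mcclintock-style config module from its path and return PARAMS.
        spec = importlib.util.spec_from_file_location("mcc_config", config_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module.PARAMS

    params = load_params("config/coverage/coverage.py")
    if params["omit_edges"] and not params["omit_edges_read_length"]:
        edge_length = params["omit_edges_length"]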
5 changes: 3 additions & 2 deletions config/ngs_te_mapper/ngs_te_mapper_post.py
@@ -1,2 +1,3 @@

MIN_READ_SUPPORT = 0
PARAMS = {
"min_read_support" : 0
}
7 changes: 4 additions & 3 deletions config/ngs_te_mapper/ngs_te_mapper_run.py
@@ -1,3 +1,4 @@

# max length of junction read overlap to consider a target site duplication
MAX_TSD = 20
PARAMS = {
# max length of junction read overlap to consider a target site duplication
"tsd=" : 20
}
28 changes: 13 additions & 15 deletions config/ngs_te_mapper2/ngs_te_mapper2_run.py
@@ -1,17 +1,15 @@

# merge window for identifying TE clusters
WINDOW = 10

# minimum mapping quality of alignment
MIN_MAPQ = 20

# minimum allele frequency
MIN_ALLELE_FREQUENCY = 0.1

# max length of junction read overlap to consider a target site duplication
MAX_TSD = 25

# maximum gap size
MAX_GAP = 5
PARAMS = {
"--window" : 10,
"--min_mapq" : 20,
"--min_af" : 0.1,
"--tsd_max" : 25,
"--gap_max" : 5
}

# --window WINDOW merge window for identifying TE clusters (default = 10)
# --min_mapq MIN_MAPQ minimum mapping quality of alignment (default = 20)
# --min_af MIN_AF minimum allele frequency (default = 0.1)
# --tsd_max TSD_MAX maximum TSD size (default = 25)
# --gap_max GAP_MAX maximum gap size (default = 5)
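Note: since the keys in this PARAMS dictionary are the literal ngs_te_mapper2 command-line flags, building the argument list reduces to flattening the dictionary; a sketch (how mcclintock actually assembles the command is an assumption):

    def params_to_args(params):
        # Flatten a {"--flag": value} dict into a command-line argument list.
        args = []
        for flag, value in params.items():
            args += [flag, str(value)]
        return args

    params = {"--window": 10, "--min_mapq": 20, "--min_af": 0.1, "--tsd_max": 25, "--gap_max": 5}
    print(params_to_args(params))  # ['--window', '10', '--min_mapq', '20', ...]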


12 changes: 6 additions & 6 deletions config/popoolationte/popoolationte_post.py
@@ -1,6 +1,6 @@

# requires that final results have support on both ends of prediction
REQUIRE_BOTH_END_SUPPORT=True

# threshold for the minimum acceptable fraction of the reads supporting the prediction
PERCENT_READ_SUPPORT_THRESHOLD=0.1
PARAMS = {
# requires that final results have support on both ends of prediction
"require_both_end_support" : True,
# threshold for the minimum acceptable fraction of the reads supporting the prediction
"percent_read_support_threshold" : 0.1
}
51 changes: 21 additions & 30 deletions config/popoolationte/popoolationte_run.py
@@ -1,3 +1,23 @@

PARAMS = {
"identify-te-insertsites.pl" : {
"--min-count" : 3,
"--min-map-qual" : 15
},
"crosslink-te-sites.pl" : {
"--single-site-shift": 100
},
"update-teinserts-with-knowntes.pl" : {
"--single-site-shift": 100
},
"estimate-polymorphism.pl" : {
"--min-map-qual": 15
},
"filter-teinserts.pl" : {
"--min-count": 5
}
}

'''
NAME
perl identify-te-insertsites.pl - Identifies TE insertion sites (forward
@@ -11,27 +31,7 @@
--min-map-qual
the minimum mapping quality; this will only apply to reads mapping
to a reference contig.
'''
IDENTIFY_TE_INSERTSITES = {
"min-count" : 3,
"min-map-qual" : 15
}




CROSSLINK_TE_SITES = {
"single-site-shift": 100
}


UPDATE_TEINSERTS_WITH_KNOWNTES = {
"single-site-shift": 100
}


'''
NAME
perl estimate-polymorphism.pl - Estimte the insertion frequencies for a
given set of TE insertions
@@ -43,13 +43,4 @@
--min-map-qual
the minimum mapping quality
'''

ESTIMATE_POLYMORPHISM = {
"min-map-qual": 15
}


FILTER = {
"min-count": 5
}
'''
11 changes: 6 additions & 5 deletions config/popoolationte2/popoolationte2_post.py
@@ -1,6 +1,7 @@
PARAMS = {
# require that the TE prediction have support from both junctions
"require_both_end_support" : True,

# require that the TE prediction have support from both junctions
REQUIRE_BOTH_END_SUPPORT = True

# threshold for the minimum acceptable (average physical coverage supporting the insertion of the given TE) / (average physical coverage)
FREQUENCY_THRESHOLD=0.1
# threshold for the minimum acceptable (average physical coverage supporting the insertion of the given TE) / (average physical coverage)
"frequency_threshold" : 0.1
}
64 changes: 37 additions & 27 deletions config/popoolationte2/popoolationte2_run.py
@@ -1,4 +1,39 @@

PARAMS = {
"ppileup" : {
"--map-qual": 15,
"--sr-mindist" : 10000,
"--id-up-quant": 0.01
},

"subsampleppileup" : {
"run" : False,
"--target-coverage": 100,
"--with-replace": False
},

"identifySignatures" : {
"--min-count": 2.0,
"--signature-window": "median",
"--min-valley": "median",
"--chunk-distance": 5
},

"updateStrand" : {
"--map-qual": 15,
"--max-disagreement": 0.1,
"--sr-mindist": 10000,
"--id-up-quant": 0.01
},

"pairupSignatures" : {
"--min-distance": -100,
"--max-distance": 500,
"--max-freq-diff": 1.0
}

}

#ppileup
# create a physical pileup file from one or multiple bam files

@@ -11,11 +46,6 @@
# --id-up-quant paired end fragments with an insert size in the upper
# quantile will be ignored [fraction]; default=0.01

ppileup = {
"map-qual": 15,
"sr-mindist" : 10000,
"id-up-quant": 0.01
}


# subsampleppileup
@@ -28,11 +58,7 @@
# == Parameters for fine tuning ==
# --with-replace use sampling with replacement instead of without replacement

subsampleppileup = {
"run" : False,
"target-coverage": 100,
"with-replace": False
}



# identifySignatures
@@ -51,12 +77,7 @@
# --chunk-distance minimum distance between chromosomal chunks in multiples
# of --min-valley [int]; default=5

identifySignatures = {
"min-count": 2.0,
"signature-window": "median",
"min-valley": "median",
"chunk-distance": 5
}



# updateStrand
@@ -73,12 +94,6 @@
# --id-up-quant paired end fragments with an insert size in the upper
# quantile will be ignored [fraction]; default=0.01

updateStrand = {
"map-qual": 15,
"max-disagreement": 0.1,
"sr-mindist": 10000,
"id-up-quant": 0.01
}


# pairupSignatures
@@ -92,8 +107,3 @@
# --max-freq-diff the maximum frequency difference between signatures;
# default=1.0

pairupSignatures = {
"min-distance": -100,
"max-distance": 500,
"max-freq-diff": 1.0
}
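Note: the popoolationte2 PARAMS dict is nested per subcommand and mixes valued flags with booleans (including a "run" key for the optional subsampleppileup step). A sketch of how one step's arguments might be assembled; the treatment of "run" and boolean switches is an assumption:

    PARAMS = {
        "ppileup": {"--map-qual": 15, "--sr-mindist": 10000, "--id-up-quant": 0.01},
        "subsampleppileup": {"run": False, "--target-coverage": 100, "--with-replace": False},
    }

    def step_params_to_args(params, step):
        # Build the argument list for one popoolationte2 step (e.g. "ppileup").
        args = []
        for flag, value in params[step].items():
            if flag == "run":
                continue  # controls whether the step runs at all, not a CLI flag
            if isinstance(value, bool):
                if value:
                    args.append(flag)  # boolean True acts as a bare switch
            else:
                args += [flag, str(value)]
        return args

    # Example: ['--map-qual', '15', '--sr-mindist', '10000', '--id-up-quant', '0.01']
    print(step_params_to_args(PARAMS, "ppileup"))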
11 changes: 6 additions & 5 deletions config/relocate/relocate_post.py
@@ -1,5 +1,6 @@
REF_LEFT_THRESHOLD = 0
REF_RIGHT_THRESHOLD = 0

NONREF_LEFT_THRESHOLD = 0
NONREF_RIGHT_THRESHOLD = 0
PARAMS = {
"ref_left_threshold" : 0,
"ref_right_threshold" : 0,
"nonref_left_threshold" : 0,
"nonref_right_threshold" : 0
}
(Diffs for the remaining changed files are not shown.)
