Merge pull request #355 from nf-core/dev

nf-core · Apr 24, 2023 · c45b22c · c45b22c
2 parents 58afbbc + 613a4f2
commit c45b22c
Show file tree

Hide file tree

Showing 23 changed files with 600 additions and 117 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,29 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## v2.3.0 = [2022/04/24]
+
+### Added
+
+- Shell specification to bash
+- COSMIC password put into quotes
+- Trimmed reads QC in MultiQC
+- Add `ARRIBA_VISUALISATION` to processed affected by `--skip_vis`
+- Option `fusionreport_filter` to in/activate fusionreport displaying of fusions detected by 2 or more tools
+
+### Changed
+
+- `Arriba` visualisation now runs for FusionInspector (combined tools) results, not only `Arriba` results
+- Updated metro map with trimming options and placed `Arriba` visualisation after `FusionInspector`
+- Exit with error when using squid in combination with any ensembl version different from 102
+
+### Fixed
+
+- Channel issue with indexing of files with using `--cram squid`
+- `Arriba` references published in the correct folder
+
+### Removed
+
 ## v2.2.0 - [2022/03/13]
 
 ### Added

diff --git a/README.md b/README.md
@@ -61,7 +61,6 @@ In rnafusion the full-sized test includes reference building and fusion detectio
    - [Samtool](https://github.com/samtools/samtools) sort
    - [Samtool](https://github.com/samtools/samtools) index
    - [Arriba](https://github.com/suhrig/arriba) fusion detection
-   - [Arriba](https://github.com/suhrig/arriba) visualisation
 5. Pizzly subworkflow
    - [Kallisto](https://pachterlab.github.io/kallisto/) quantification
    - [Pizzly](https://github.com/pmelsted/pizzly) fusion detection
@@ -81,6 +80,7 @@ In rnafusion the full-sized test includes reference building and fusion detectio
    - [Fusion-report](https://github.com/matq007/fusion-report)
 10. FusionInspector subworkflow
     - [FusionInspector](https://github.com/FusionInspector/FusionInspector)
+    - [Arriba](https://github.com/suhrig/arriba) visualisation
 11. Stringtie subworkflow
     - [StringTie](https://ccb.jhu.edu/software/stringtie/index.shtml)
 12. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))

diff --git a/bin/megafusion.py b/bin/megafusion.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+import pandas as pd
+import ast
+
+logger = logging.getLogger()
+
+FUSIONINSPECTOR_MAP = {
+    "fusion": {"column": 0, "delimiter": "\t", "element": 0},
+    "chromosomeA": {"column": 7, "delimiter": ":", "element": 0},
+    "chromosomeB": {"column": 10, "delimiter": ":", "element": 0},
+    "posA": {"column": 7, "delimiter": ":", "element": 1},
+    "posB": {"column": 10, "delimiter": ":", "element": 1},
+    "strand1": {"column": 7, "delimiter": ":", "element": 2},
+    "strand2": {"column": 10, "delimiter": ":", "element": 2},
+    "geneA": {"column": 0, "delimiter": "--", "element": 0},
+    "geneB": {"column": 0, "delimiter": "--", "element": 1},
+    "split_reads": {"column": 1, "delimiter": "\t", "element": 0},
+    "discordant_pairs": {"column": 2, "delimiter": "\t", "element": 0},
+    "ffpm": {"column": 25, "delimiter": "\t", "element": 0},
+}
+
+
+def parse_args(argv=None):
+    """Define and immediately parse command line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Validate and transform a tabular samplesheet.",
+        epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv",
+    )
+    parser.add_argument(
+        "--fusioninspector",
+        metavar="FUSIONINSPECTOR",
+        type=Path,
+        help="FusionInspector output in TSV format.",
+    )
+    parser.add_argument(
+        "--fusionreport",
+        metavar="FUSIONREPORT",
+        type=Path,
+        help="Fusionreport output in TSV format.",
+    )
+    parser.add_argument("--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample")
+    parser.add_argument(
+        "--out",
+        metavar="OUT",
+        type=Path,
+        help="Output path.",
+    )
+    return parser.parse_args(argv)
+
+
+def header_def(sample):
+    return '##fileformat=VCFv4.1\n\
+##ALT=<ID=BND,Description="Break end">\n\
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n\
+##INFO=<ID=CHRA,Number=1,Type=String,Description="Chromosome A">\n\
+##INFO=<ID=CHRB,Number=1,Type=String,Description="Chromosome B">\n\
+##INFO=<ID=GENEA,Number=.,Type=String,Description="Gene A">\n\
+##INFO=<ID=GENEB,Number=.,Type=String,Description="Gene B">\n\
+##INFO=<ID=ORIENTATION,Number=.,Type=String,Description="Strand1 and strand2 directions">\n\
+##INFO=<ID=FOUND_DB,Number=.,Type=String,Description="Databases in which the fusion has been found">\n\
+##INFO=<ID=ARRIBA,Number=.,Type=String,Description="Found by arriba">\n\
+##INFO=<ID=FUSIONCATCHER,Number=.,Type=String,Description="Found by fusioncatcher">\n\
+##INFO=<ID=PIZZLY,Number=.,Type=String,Description="Found by pizzly">\n\
+##INFO=<ID=SQUID,Number=.,Type=String,Description="Found by squid">\n\
+##INFO=<ID=STARFUSION,Number=.,Type=String,Description="Found by starfusion">\n\
+##INFO=<ID=TOOL_HITS,Number=.,Type=Integer,Description="Number of tools that found the fusion">\n\
+##INFO=<ID=SCORE,Number=.,Type=Float,Description="Score from fusionreport">\n\
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n\
+##FORMAT=<ID=DV,Number=1,Type=Integer,Description="Number of paired-ends that support the event">\n\
+##FORMAT=<ID=RV,Number=1,Type=Integer,Description="Number of split reads that support the event">\n\
+##FORMAT=<ID=FFPM,Number=1,Type=Float,Description="Fusion fragments per million total RNA-seq fragments">\n\
+#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}'.format(
+        sample
+    )
+
+
+def read_fusioninspector(fusioninspector_file, col_num, delimiter, element):
+    with open(fusioninspector_file) as fusioninspector:
+        return [line.split()[col_num].split(delimiter)[element] for line in fusioninspector if not line.startswith("#")]
+
+
+def build_fusioninspector_dataframe(file, map):
+    new_dict = {}
+    for key in FUSIONINSPECTOR_MAP:
+        new_dict[key] = read_fusioninspector(
+            file,
+            map[key]["column"],
+            map[key]["delimiter"],
+            map[key]["element"],
+        )
+    return pd.DataFrame.from_dict(new_dict).set_index("fusion")
+
+
+def read_build_fusionreport(fusionreport_file):
+    with open(fusionreport_file) as fusionreport:
+        from_html = [line.split('rows": [')[1] for line in fusionreport if 'name="fusion_list' in line]
+        expression = from_html[0].split('], "tool')[0]
+        return pd.DataFrame.from_dict(ast.literal_eval(expression)).set_index("fusion")
+
+
+def column_manipulation(df):
+    df["ALT"] = ""
+    df = df.reset_index()
+    df["FORMAT"] = "GT:DV:RV:FFPM"
+    df["ID"] = "."
+    df["QUAL"] = "."
+    df["FILTER"] = "PASS"
+    df["REF"] = "N"
+
+    for index, row in df.iterrows():
+        # ALT
+        if not row["strand1"] in ["+", "-"] or not row["strand2"] in ["+", "-"]:
+            df.loc[index, "ALT"] = "N[{}:{}[".format(df["chromosomeB"], row["posB"])
+        elif row["strand1"] == "-" and row["strand2"] == "-":
+            df.loc[index, "ALT"] = "[{}:{}[N".format(row["chromosomeB"], row["posB"])
+        elif row["strand1"] == "+" and row["strand2"] == "-":
+            df.loc[index, "ALT"] = "N]{}:{}]".format(row["chromosomeB"], row["posB"])
+        elif row["strand1"] == "-" and row["strand2"] == "+":
+            df.loc[index, "ALT"] = "N]{}:{}]".format(row["chromosomeB"], row["posB"])
+        else:
+            df.loc[index, "ALT"] = "N[{}:{}[".format(row["chromosomeB"], row["posB"])
+        # INFO
+        df.loc[index, "INFO"] = (
+            "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};ORIENTATION={},{};FOUND_DB={};"
+            "ARRIBA={};FUSIONCATCHER={};PIZZLY={};SQUID={};STARFUSION={};TOOL_HITS={};SCORE={}".format(
+                row["chromosomeA"],
+                row["chromosomeB"],
+                row["geneA"],
+                row["geneB"],
+                row["strand1"],
+                row["strand2"],
+                row["found_db"],
+                row["arriba"],
+                row["fusioncatcher"],
+                row["pizzly"],
+                row["squid"],
+                row["starfusion"],
+                row["tools_hits"],
+                row["score"],
+            )
+        )
+        # FORMAT
+        df.loc[index, "Sample"] = "./1:{}:{}:{}".format(row["split_reads"], row["discordant_pairs"], row["ffpm"])
+    return df
+
+
+def write_vcf(df_to_print, header, out_file):
+    df_to_print[
+        [
+            "chromosomeA",
+            "posA",
+            "ID",
+            "REF",
+            "ALT",
+            "QUAL",
+            "FILTER",
+            "INFO",
+            "FORMAT",
+            "Sample",
+        ]
+    ].to_csv(
+        path_or_buf=out_file,
+        sep="\t",
+        header=None,
+        index=False,
+    )
+
+    with open(out_file, "r+") as f:
+        content = f.read()
+        f.seek(0, 0)
+        f.write(header.rstrip("\r\n") + "\n" + content)
+
+
+def megafusion(fusioninspector_in_file, fusionreport_in_file, sample, out):
+    """Convert fusion information from FusionInspector and fusion-report into a vcf file. Adapted from https://github.com/J35P312/MegaFusion"""
+    merged_df = build_fusioninspector_dataframe(fusioninspector_in_file, FUSIONINSPECTOR_MAP).join(
+        read_build_fusionreport(fusionreport_in_file), how="left"
+    )
+    write_vcf(column_manipulation(merged_df), header_def(sample), out)
+
+
+def main(argv=None):
+    """Coordinate argument parsing and program execution."""
+    args = parse_args(argv)
+    if not args.fusioninspector.is_file() or not args.fusionreport.is_file():
+        logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!")
+        sys.exit(2)
+    megafusion(args.fusioninspector, args.fusionreport, args.sample, args.out)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/base.config b/conf/base.config
@@ -13,6 +13,7 @@ process {
     cpus   = { check_max( 1    * task.attempt, 'cpus'   ) }
     memory = { check_max( 6.GB * task.attempt, 'memory' ) }
     time   = { check_max( 4.h  * task.attempt, 'time'   ) }
+    shell  = ['/bin/bash', '-euo', 'pipefail']
 
     errorStrategy = { task.exitStatus in [140,143,137,104,134,139] ? 'retry' : 'finish' }
     maxRetries    = 1

diff --git a/conf/modules.config b/conf/modules.config
@@ -27,7 +27,16 @@ process {
         ext.prefix =  { "${meta.id}.arriba" }
     }
 
+    withName: ARRIBA_DOWNLOAD {
+        publishDir = [
+            path: { "${params.genomes_base}/arriba" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     withName: ARRIBA_VISUALISATION {
+        ext.when         = { !params.fusioninspector_only && (params.starfusion || params.all) }
         publishDir = [
             path: { "${params.outdir}/arriba_visualisation" },
             mode: params.publish_dir_mode,
@@ -57,6 +66,7 @@ process {
 
     withName: FASTQC {
         ext.args = '--quiet'
+        ext.when         = { !params.skip_qc }
     }
 
     withName: FASTQC_FOR_TRIM {
@@ -89,6 +99,7 @@ process {
     withName: FUSIONREPORT {
         ext.when         = { !params.skip_vis }
         ext.args         = "--export csv"
+        ext.args2        = { params.fusionreport_filter ? "--tool-cutoff 2" : "--tool-cutoff 1"}
         publishDir = [
             path: { "${params.outdir}/fusionreport/${meta.id}" },
             mode: params.publish_dir_mode,
@@ -129,6 +140,15 @@ process {
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
         ]
     }
+    withName: MEGAFUSION {
+        ext.when = {!params.fusioninspector_only}
+    }
+
+
+
+    withName: MULTIQC {
+        ext.when         = { !params.skip_qc }
+    }
 
     withName: PICARD_COLLECTRNASEQMETRICS {
         ext.when         = { !params.skip_qc && !params.fusioninspector_only && (params.starfusion || params.all) }
@@ -207,10 +227,10 @@ process {
         ]
     }
 
-    withName: SAMTOOLS_SORT_FOR_SQUID {
+    withName: SAMTOOLS_SORT_FOR_SQUID_CHIMERIC {
         ext.prefix = { "${meta.id}_chimeric_sorted" }
         publishDir = [
-            path: { "${params.outdir}/samtools_sort_for_squid" },
+            path: { "${params.outdir}/samtools_sort_for_squid_chimeric" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
@@ -225,11 +245,11 @@ process {
         ]
     }
 
-    withName: SAMTOOLS_VIEW_FOR_SQUID {
+    withName: SAMTOOLS_VIEW_FOR_SQUID_CHIMERIC {
         ext.prefix = { "${meta.id}_chimeric" }
         ext.args = { "--output-fmt bam" }
         publishDir = [
-            path: { "${params.outdir}/samtools_view_for_squid" },
+            path: { "${params.outdir}/samtools_view_for_squid_chimeric" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]

diff --git a/docs/images/nf-core-rnafusion_metro_map.png b/docs/images/nf-core-rnafusion_metro_map.png