Commit

Merge pull request #191 from UPHL-BioNGS/update-20240417

Update 20240417

erinyoung authored Apr 25, 2024
2 parents ff093db + 7a742fa commit e773ce5
Showing 19 changed files with 208 additions and 44 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/roary.yml
@@ -0,0 +1,41 @@
name: Test Grandeur just_msa with roary workflow

on: [pull_request, workflow_dispatch]

jobs:

  test:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install Nextflow
        run: |
          wget -qO- get.nextflow.io | bash
          sudo mv nextflow /usr/local/bin/
          nextflow -version
      - name: Run Grandeur
        run: |
          docker --version
          wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/864/595/GCA_904864595.1_INF333/GCA_904864595.1_INF333_genomic.fna.gz && gzip -d GCA_904864595.1_INF333_genomic.fna.gz
          wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/013/783/245/GCA_013783245.1_ASM1378324v1/GCA_013783245.1_ASM1378324v1_genomic.fna.gz && gzip -d GCA_013783245.1_ASM1378324v1_genomic.fna.gz
          wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/026/626/185/GCA_026626185.1_ASM2662618v1/GCA_026626185.1_ASM2662618v1_genomic.fna.gz && gzip -d GCA_026626185.1_ASM2662618v1_genomic.fna.gz
          wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/808/985/GCA_020808985.1_ASM2080898v1/GCA_020808985.1_ASM2080898v1_genomic.fna.gz && gzip -d GCA_020808985.1_ASM2080898v1_genomic.fna.gz
          wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/904/863/225/GCA_904863225.1_KSB1_6J/GCA_904863225.1_KSB1_6J_genomic.fna.gz && gzip -d GCA_904863225.1_KSB1_6J_genomic.fna.gz
          mkdir fastas
          mv *fna fastas/.
          nextflow run . -profile docker,just_msa -c .github/workflows/github_actions.config --fastas fastas --aligner roary
      - name: Check MSA files
        run: |
          for file in grandeur/*/summary_statistics.txt grandeur/iqtree2/iqtree.treefile.nwk grandeur/snp-dists/snp_matrix.txt
          do
            head $file
            wc -l $file
          done
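
For local troubleshooting outside of GitHub Actions, the sketch below is a rough Python equivalent of the "Check MSA files" step above. It only checks that the three output files the workflow inspects exist and are non-empty and prints their first lines; the paths come from the step above, and glob is simply how the bash * wildcard would be expanded in Python.

# Rough Python equivalent of the "Check MSA files" step (for local troubleshooting).
# Paths mirror the workflow above; glob expands the * wildcard that bash handles natively.
import glob
import sys

patterns = [
    "grandeur/*/summary_statistics.txt",      # roary pan-genome summary
    "grandeur/iqtree2/iqtree.treefile.nwk",   # core-genome tree
    "grandeur/snp-dists/snp_matrix.txt",      # SNP distance matrix
]

missing = False
for pattern in patterns:
    matches = glob.glob(pattern)
    if not matches:
        print(f"MISSING: {pattern}")
        missing = True
        continue
    for path in matches:
        with open(path) as handle:
            lines = handle.readlines()
        print(f"{path}: {len(lines)} lines")   # same idea as `wc -l`
        print("".join(lines[:10]), end="")     # same idea as `head`

sys.exit(1 if missing else 0)
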
31 changes: 30 additions & 1 deletion bin/summary.py
@@ -38,6 +38,7 @@
pbptyper = 'pbptyper_summary.tsv'
plasmidfinder = 'plasmidfinder_result.tsv'
quast = 'quast_report.tsv'
quast_contig = 'quast_contig_report.tsv'
seqsero2 = 'seqsero2_results.txt'
serotypefinder = 'serotypefinder_results.txt'
shigatyper_hit = 'shigatyper_hits.txt'
@@ -54,7 +55,7 @@
##########################################

csv_files = [ legsta, mykrobe ]
tsv_files = [ quast, drprg, elgato, seqsero2, kleborate, mlst, emmtyper, pbptyper, shigatyper ]
tsv_files = [ drprg, elgato, seqsero2, kleborate, mlst, emmtyper, pbptyper, shigatyper ]

##########################################
# exiting if no input files #
@@ -290,6 +291,30 @@
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df.drop(analysis + "_sample", axis=1, inplace=True)

# quast : combining both files
q_df = pd.DataFrame()
qc_df = pd.DataFrame()
if exists(quast):
    print("Adding results for " + quast)
    file = quast
    analysis = str(file).split("_")[0]
    q_df = pd.read_table(file, dtype = str, index_col= False)
    q_df = q_df.add_prefix(analysis + "_")
    q_df.columns = [x.lower() for x in q_df.columns]

if exists(quast_contig):
    print("Adding results for " + quast_contig)
    file = quast_contig
    analysis = str(file).split("_")[0]
    qc_df = pd.read_table(file, dtype = str, index_col= False)
    qc_df = qc_df.add_prefix(analysis + "_")
    qc_df.columns = [x.lower() for x in qc_df.columns]

if exists(quast) or exists(quast_contig):
    new_df = pd.concat([q_df, qc_df])
    summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
    summary_df.drop(analysis + "_sample", axis=1, inplace=True)

# serotypefinder : splitting O and H groups, getting the top hit for O and H group, combining rows
if exists(serotypefinder) :
file = serotypefinder
@@ -475,6 +500,10 @@ def fill_predicted_organism(row):

if 'quast_Total length' in summary_df:
    summary_df['quast_estimated_genome_size'] = summary_df['quast_Total length']

if 'quast_avg. coverage dept' in summary_df:
    summary_df['quast_estimated_coverage'] = summary_df['quast_avg. coverage dept'].astype(float)
elif 'quast_Total length' in summary_df:
    summary_df['quast_estimated_coverage'] = summary_df['total_bases'].astype(float) / summary_df['quast_estimated_genome_size'].astype(float)

cov_columns = []
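
To make the new QUAST handling in summary.py easier to follow, here is a self-contained toy version of the same pandas pattern, with column names and values invented for illustration. Both report filenames start with "quast", so str(file).split("_")[0] yields the same "quast" prefix for each; the two frames are stacked with concat and left-merged onto the summary on the sample column before the helper "quast_sample" column is dropped.

# Toy illustration of the quast / quast_contig merge added above (made-up column values).
import pandas as pd

summary_df = pd.DataFrame({"sample": ["isolateA", "isolateB"]})

# Report from a run where reads were available.
q_df = pd.DataFrame({"sample": ["isolateA"], "Total length": ["4600000"], "Avg. coverage depth": ["85"]})
q_df = q_df.add_prefix("quast_")
q_df.columns = [x.lower() for x in q_df.columns]

# Contig-only report; same prefix, so shared columns line up when the frames are stacked.
qc_df = pd.DataFrame({"sample": ["isolateB"], "Total length": ["5100000"]})
qc_df = qc_df.add_prefix("quast_")
qc_df.columns = [x.lower() for x in qc_df.columns]

new_df = pd.concat([q_df, qc_df])
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on="quast_sample", how="left")
summary_df.drop("quast_sample", axis=1, inplace=True)
print(summary_df)
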
6 changes: 4 additions & 2 deletions main.nf
@@ -85,6 +85,7 @@ params.current_datasets = false
params.skip_extras = false
params.exclude_top_hit = false
params.msa = false
params.aligner = 'panaroo'


// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
@@ -117,7 +118,8 @@ def paramCheck(keys) {
"min_core_genes",
"current_datasets",
"skip_extras",
"exclude_top_hit"]
"exclude_top_hit",
"aligner"]

for(key in keys){
if (key !in set_keys){
@@ -218,7 +220,7 @@ if (params.fasta_list) {
ch_fastas = params.fastas
? Channel
.fromPath("${params.fastas}/*{.fa,.fasta,.fna}")
.view { "fasta file : $it" }
.view { "Fasta file found : ${it.baseName}" }
.map { it ->
meta = [id: it.baseName]
tuple( meta, file(it, checkIfExists: true))
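
The paramCheck change above simply adds "aligner" to the list of recognised parameter names so that --aligner roary is not reported as an unknown option. For readers less familiar with the Groovy syntax, a hedged Python restatement of that guard is sketched below; the key list is a hypothetical subset and the warning text is illustrative, not the pipeline's actual message.

# Sketch of the unknown-parameter guard in main.nf (hypothetical key subset and message).
ALLOWED_KEYS = {
    "min_core_genes",
    "current_datasets",
    "skip_extras",
    "exclude_top_hit",
    "aligner",   # newly accepted, so --aligner roary is not flagged as unknown
}

def param_check(keys):
    for key in keys:
        if key not in ALLOWED_KEYS:
            print(f"WARNING: {key} is not a recognised parameter")

param_check(["aligner", "alinger"])   # only the misspelling gets reported
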
2 changes: 1 addition & 1 deletion modules/local/amrfinderplus.nf
@@ -2,7 +2,7 @@ process amrfinderplus {
tag "${meta.id}"
label "process_high"
publishDir params.outdir, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
container 'staphb/ncbi-amrfinderplus:3.12.8-2024-01-31.1'
container 'staphb/ncbi-amrfinderplus:3.12.8-2024-01-31.1_2'
errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
time '30m'

8 changes: 5 additions & 3 deletions modules/local/bbduk.nf
@@ -3,14 +3,14 @@ process bbduk {
label "process_medium"
publishDir params.outdir, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
container 'staphb/bbtools:39.01'
errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
time '10m'

input:
tuple val(meta), file(reads)

output:
tuple val(meta), file("bbduk/*_rmphix_R{1,2}.fastq.gz"), emit: fastq
tuple val(meta), file("bbduk/*_rmphix_R{1,2}.fastq.gz"), emit: fastq, optional: true
path "bbduk/*", emit: files
path "bbduk/*.phix.stats.txt", emit: stats
path "logs/${task.process}/*.log", emit: log
@@ -31,7 +31,7 @@
in2=${reads[1]} \
out1=bbduk/${prefix}_rmphix_R1.fastq.gz \
out2=bbduk/${prefix}_rmphix_R2.fastq.gz \
outm=bbduk/${prefix}.matched_phix.fq \
outm=bbduk/${prefix}.matched_phix.fastq.gz \
ref=/opt/bbmap/resources/phix174_ill.ref.fa.gz \
stats=bbduk/${prefix}.phix.stats.txt \
threads=${task.cpus} \
@@ -43,3 +43,5 @@
END_VERSIONS
"""
}

//ref=/bbmap/resources/phix174_ill.ref.fa.gz \
4 changes: 2 additions & 2 deletions modules/local/datasets.nf
@@ -2,7 +2,7 @@ process datasets_summary {
tag "${taxon}"
label "process_single"
publishDir params.outdir, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
container 'staphb/ncbi-datasets:16.2.0'
container 'staphb/ncbi-datasets:16.10.3'
time '1h'
errorStrategy { task.attempt < 2 ? 'retry' : 'ignore' }

@@ -52,7 +52,7 @@ process datasets_download {
// because there's no way to specify threads
label "process_medium"
publishDir path: "${params.outdir}", mode: 'copy', pattern: "logs/*/*log"
container 'staphb/ncbi-datasets:16.2.0'
container 'staphb/ncbi-datasets:16.10.3'
time '5h'
errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}

6 changes: 3 additions & 3 deletions modules/local/fastp.nf
@@ -10,9 +10,9 @@ process fastp {
tuple val(meta), file(reads)

output:
tuple val(meta), file("fastp/*_fastp_R{1,2}.fastq.gz"), emit: fastq
path "fastp/*_fastp.html", emit: html
path "fastp/*_fastp.json", emit: fastp_files
tuple val(meta), file("fastp/*_fastp_R{1,2}.fastq.gz"), emit: fastq, optional: true
path "fastp/*_fastp.html", emit: html, optional: true
path "fastp/*_fastp.json", emit: fastp_files, optional: true
path "logs/${task.process}/*.{log,err}", emit: log
tuple val(meta), env(passed_reads), emit: fastp_results
path "versions.yml", emit: versions
5 changes: 3 additions & 2 deletions modules/local/mlst.nf
@@ -2,7 +2,7 @@ process mlst {
tag "${meta.id}"
label "process_medium"
publishDir params.outdir, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
container 'staphb/mlst:2.23.0-2024-03-11'
container 'staphb/mlst:2.23.0-2024-04-01'
maxForks 10
errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
time '10m'
@@ -25,7 +25,8 @@
mlst ${args} \
--threads ${task.cpus} \
${contig} \
${contig} | \
tr ' ' '_' \
> ${prefix}_mlst.txt
python3 ${script} ${prefix}_mlst.txt mlst/${prefix}_mlst.tsv mlst ${prefix}
4 changes: 2 additions & 2 deletions modules/local/panaroo.nf
@@ -19,8 +19,8 @@ process panaroo {
task.ext.when == null || task.ext.when

shell:
def args = task.ext.args ?: '--clean-mode strict --remove-invalid-genes'
def prefix = task.ext.prefix ?: "panaroo"
def args = task.ext.args ?: '--clean-mode strict --remove-invalid-genes --alignment core'
def prefix = task.ext.prefix ?: 'panaroo'
def assemblies = gff.join(' ')
"""
mkdir -p logs/${task.process}
2 changes: 1 addition & 1 deletion modules/local/plasmidfinder.nf
@@ -2,7 +2,7 @@ process plasmidfinder {
tag "${meta.id}"
label "process_medium"
publishDir params.outdir, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
container 'staphb/plasmidfinder:2.1.6'
container 'staphb/plasmidfinder:2.1.6_2024-03-07'
time '10m'
errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}

10 changes: 7 additions & 3 deletions modules/local/quast.nf
@@ -7,13 +7,14 @@ process quast {
errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}

input:
tuple val(meta), file(contigs)
tuple val(meta), file(contigs), file(reads)

output:
path "quast/*" , emit: files
path "quast/*_quast_report.tsv" , optional: true, emit: for_multiqc
tuple val(meta), file("quast/*_quast_report.tsv"), optional: true, emit: results
path "quast/*/transposed_report.tsv" , optional: true, emit: collect
path "quast/*/quast_transposed_report.tsv" , optional: true, emit: collect
path "quast/*/quast_transposed_report_contig.tsv" , optional: true, emit: collect_contig
path "logs/${task.process}/*.log" , emit: log
path "versions.yml" , emit: versions

@@ -23,6 +24,8 @@
shell:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def fastq = reads[1] ? "--pe1 ${reads[0]} --pe2 ${reads[1]}" : ""
def fin = reads[1] ? "quast/${prefix}/quast_transposed_report.tsv" : "quast/${prefix}/quast_transposed_report_contig.tsv"
"""
mkdir -p ${task.process} logs/${task.process}
log_file=logs/${task.process}/${prefix}.${workflow.sessionId}.log
@@ -31,6 +34,7 @@
${contigs} \
--output-dir quast/${prefix} \
--threads ${task.cpus} \
${fastq} \
| tee -a \$log_file
if [ -f "quast/${prefix}/report.tsv" ] ; then cp quast/${prefix}/report.tsv quast/${prefix}_quast_report.tsv ; fi
@@ -39,7 +43,7 @@
then
head -n 1 quast/${prefix}/transposed_report.tsv | awk '{print "sample\\t" \$0 }' > quast/${prefix}/transposed_report.tsv.tmp
tail -n 1 quast/${prefix}/transposed_report.tsv | awk -v sample=${prefix} '{print sample "\\t" \$0}' >> quast/${prefix}/transposed_report.tsv.tmp
mv quast/${prefix}/transposed_report.tsv.tmp quast/${prefix}/transposed_report.tsv
mv quast/${prefix}/transposed_report.tsv.tmp ${fin}
fi
cat <<-END_VERSIONS > versions.yml
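
The quast process above now feeds paired reads to QUAST via --pe1/--pe2, so the report gains read-based coverage columns, and it renames the transposed report to quast_transposed_report.tsv when reads were supplied or quast_transposed_report_contig.tsv when only contigs were available. The sketch below restates the small awk rewrite (prepend a sample column to the header and to the data row) in Python; prefix and the paths are placeholders for what the process derives from meta.id.

# Restating the awk rewrite of QUAST's transposed report (prefix and paths are placeholders).
prefix = "isolateA"
have_reads = True   # mirrors reads[1] being set in the process

src = f"quast/{prefix}/transposed_report.tsv"
dest = (f"quast/{prefix}/quast_transposed_report.tsv"
        if have_reads
        else f"quast/{prefix}/quast_transposed_report_contig.tsv")

with open(src) as handle:
    lines = handle.read().splitlines()

header, last_row = lines[0], lines[-1]
with open(dest, "w") as out:
    out.write("sample\t" + header + "\n")        # head -n 1 | awk '{print "sample\t" $0}'
    out.write(prefix + "\t" + last_row + "\n")   # tail -n 1 | awk -v sample=... '{print sample "\t" $0}'
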
38 changes: 38 additions & 0 deletions modules/local/roary.nf
@@ -0,0 +1,38 @@
process roary {
tag "Core Genome Alignment"
label "process_high"
publishDir params.outdir, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
container 'staphb/roary:3.13.0'
errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
time '10h'

input:
file(contigs)

output:
path "roary/*" , emit: files
path "roary/fixed_input_files/*" , emit: roary_input_files
tuple path("roary/core_gene_alignment.aln"), path("roary/gene_presence_absence.Rtab"), emit: core_gene_alignment, optional: true
path "logs/${task.process}/${task.process}.${workflow.sessionId}.log" , emit: log_files
path "versions.yml" , emit: versions

shell:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: 'roary'
"""
mkdir -p logs/${task.process}
log_file=logs/${task.process}/${task.process}.${workflow.sessionId}.log
roary ${args} \
-p ${task.cpus} \
-f roary \
-e -n \
*.gff \
| tee -a \$log_file
cat <<-END_VERSIONS > versions.yml
"${task.process}":
roary: \$( roary --version )
END_VERSIONS
"""
}
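
The new roary process emits gene_presence_absence.Rtab alongside the core gene alignment, which pairs naturally with the pipeline's min_core_genes parameter seen in main.nf above. The sketch below counts core genes (present in every sample) from an Rtab file, assuming Roary's usual layout of a Gene column followed by one 0/1 column per sample; the threshold value is a placeholder, and this is an illustration rather than the pipeline's actual core-genome check.

# Count core genes (present in all samples) from Roary's gene_presence_absence.Rtab.
# Assumes the usual Rtab layout: a "Gene" column, then one 0/1 column per sample.
import pandas as pd

MIN_CORE_GENES = 1500   # placeholder for the pipeline's min_core_genes threshold

rtab = pd.read_table("roary/gene_presence_absence.Rtab", index_col="Gene")
core_genes = int((rtab == 1).all(axis=1).sum())

print(f"{core_genes} core genes across {rtab.shape[1]} samples")
if core_genes < MIN_CORE_GENES:
    print(f"fewer than {MIN_CORE_GENES} core genes; a core-genome tree may not be reliable")
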
2 changes: 1 addition & 1 deletion modules/local/seqsero2.nf
@@ -2,7 +2,7 @@ process seqsero2 {
tag "${meta.id}"
label "process_medium"
publishDir params.outdir, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
container 'staphb/seqsero2:1.2.1'
container 'staphb/seqsero2:1.3.1'
time '10m'
errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}

2 changes: 1 addition & 1 deletion modules/local/serotypefinder.nf
@@ -2,7 +2,7 @@ process serotypefinder {
tag "${meta.id}"
label "process_medium"
publishDir params.outdir, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
container 'staphb/serotypefinder:2.0.1'
container 'staphb/serotypefinder:2.0.2'
maxForks 10
time '10m'
errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}