Merge pull request #67 from UPHL-BioNGS/update-20230523
Update 20230523
erinyoung authored May 25, 2023
2 parents b812f58 + cf91d31 commit 24782d5
Showing 12 changed files with 238 additions and 96 deletions.
120 changes: 112 additions & 8 deletions bin/summary.py
@@ -6,6 +6,7 @@
##########################################

import pandas as pd
import json
from os.path import exists

##########################################
@@ -20,7 +21,6 @@
emmtyper = 'emmtyper_summary.tsv'
fastani = 'fastani_summary.csv'
fastqc = 'fastqc_summary.csv'
fastqscan = 'fastqscan_summary.csv'
kleborate = 'kleborate_results.tsv'
kraken2 = 'kraken2_summary.csv'
legsta = 'legsta_summary.csv'
@@ -32,25 +32,28 @@
seqsero2 = 'seqsero2_results.txt'
serotypefinder = 'serotypefinder_results.txt'
shigatyper = 'shigatyper_results.txt'
size = 'size_results.csv'
multiqc_json = 'multiqc_data.json'
multiqc_stats = 'multiqc_general_stats.txt'

# final files
final = 'grandeur_summary'
extended = 'grandeur_extended_summary'
extended = 'summary/grandeur_extended_summary'

##########################################
# grouping similar files #
##########################################

csv_files = [ fastqscan, legsta ]
csv_files = [ legsta ]
tsv_files = [ quast, seqsero2, kleborate, mlst, emmtyper , pbptyper]

top_hit = [ fastani ]
top_hit = [ fastani ]

##########################################
# creating the summary dataframe #
##########################################

summary_df = pd.read_csv(names, dtype = str)
summary_df['warnings'] = ''
columns = list(summary_df.columns)

# csv files
@@ -109,9 +112,11 @@
new_df['organism (per mapped reads)'] = new_df['name'] + ' (' + new_df['bam0_read_map_p'] + ')'
new_df = new_df[['sample', 'organism (per mapped reads)']]
new_df = new_df.groupby('sample', as_index=False).agg({'organism (per mapped reads)': lambda x: list(x)})
new_df['warning'] = new_df['organism (per mapped reads)'].apply(lambda x: "Blobtools multiple organisms," if ','.join(x).count(',') >= 2 else "")
new_df = new_df.add_prefix(analysis + '_')
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df.drop(analysis + "_sample", axis=1, inplace=True)
summary_df['warnings'] = summary_df['warnings'] + summary_df['blobtools_warning']

# fastani : merging relevant rows into one
if exists(fastani) :
@@ -122,9 +127,11 @@
new_df['genome (ANI estimate)'] = new_df['reference'].str.split('_').str[0] + " " + new_df['reference'].str.split("_").str[1] + " " + new_df['reference'].str.split('_').str[-2] + "_" + new_df['reference'].str.split('_').str[-1] + " (" + new_df['ANI estimate'] + ")"
new_df = new_df[['sample', 'genome (ANI estimate)']]
new_df = new_df.groupby('sample', as_index=False).agg({'genome (ANI estimate)': lambda x: list(x)})
new_df['warning'] = new_df['genome (ANI estimate)'].apply(lambda x: "Multiple FastANI hits," if ','.join(x).count(',') >= 4 else "")
new_df = new_df.add_prefix(analysis + '_')
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df.drop(analysis + "_sample", axis=1, inplace=True)
summary_df['warnings'] = summary_df['warnings'] + summary_df['fastani_warning']

# fastqc : merging relevant rows into one
if exists(fastqc) :
@@ -155,9 +162,11 @@
new_df['organism (per fragment)'] = new_df['Scientific name'] + " (" + new_df['Percentage of fragments'] + ' ' + new_df['Type'] + ")"
new_df = new_df[['Sample', 'organism (per fragment)']]
new_df = new_df.groupby('Sample', as_index=False).agg({'organism (per fragment)': lambda x: list(x)})
new_df['warning'] = new_df['organism (per fragment)'].apply(lambda x: "Kraken2 multiple organisms," if ','.join(x).count(',') >= 2 else "")
new_df = new_df.add_prefix(analysis + '_')
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_Sample", how = 'left')
summary_df.drop(analysis + "_Sample", axis=1, inplace=True)
summary_df['warnings'] = summary_df['warnings'] + summary_df['kraken2_warning']

# mash : top hit of potentially two different files
if exists(mash) :
@@ -213,6 +222,97 @@
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df.drop(analysis + "_sample", axis=1, inplace=True)

# multiqc : bbduk and fastp

if exists(multiqc_json) :
file = multiqc_json
print("Adding analysis parsed via multiqc in " + file)
with open(file) as multiqc_data:
data = json.load(multiqc_data)

# fastp filtered reads
if "fastp_filtered_reads_plot" in data["report_plot_data"].keys():
samples = [sample.replace("_rmphix_R1", "") for sample in data["report_plot_data"]['fastp_filtered_reads_plot']['samples'][0]]
fastp_passedreads_df = pd.DataFrame(samples, columns=['fastp_sample'])
fastp_passedreads_df['fastp_passed_reads'] = data["report_plot_data"]['fastp_filtered_reads_plot']['datasets'][0][0]['data']
summary_df = pd.merge(summary_df, fastp_passedreads_df, left_on="sample", right_on="fastp_sample", how = 'left')
summary_df.drop("fastp_sample", axis=1, inplace=True)

# bbduk phix reads
if "bbmap" in data['report_saved_raw_data'].keys():
print("Adding in phix reads from bbmap")
samples = [sample.replace(".phix", "") for sample in data['report_saved_raw_data']['bbmap']['stats'].keys()]
phix_reads=[]
for sample in data['report_saved_raw_data']['bbmap']['stats'].keys() :
phix_reads.append(data['report_saved_raw_data']['bbmap']['stats'][sample]['kv']['Matched'])
bbduk_phixreads_df = pd.DataFrame(samples, columns=['bbduk_sample'])
bbduk_phixreads_df['bbduk_phix_reads'] = phix_reads
summary_df = pd.merge(summary_df, bbduk_phixreads_df, left_on="sample", right_on="bbduk_sample", how = 'left')
summary_df.drop("bbduk_sample", axis=1, inplace=True)

if exists(multiqc_stats) :
file = multiqc_stats
print("Adding analysis parsed via multiqc in " + file)
new_df = pd.read_table(file, dtype = str, index_col= False)
if "FastQC_mqc-generalstats-fastqc-avg_sequence_length" in new_df.columns :
tmp_df = new_df[["Sample","FastQC_mqc-generalstats-fastqc-avg_sequence_length"]]
tmp_df["fastqc_avg_length"]= tmp_df["FastQC_mqc-generalstats-fastqc-avg_sequence_length"]
tmp_df.drop("FastQC_mqc-generalstats-fastqc-avg_sequence_length", axis=1, inplace=True)

summary_df["possible_fastqc_name"] = summary_df['file'].str.split(" ").str[0].str.split(".").str[0]
summary_df = pd.merge(summary_df, tmp_df, left_on="possible_fastqc_name", right_on="Sample", how = 'left')
summary_df.drop("Sample", axis=1, inplace=True)
summary_df.drop("possible_fastqc_name", axis=1, inplace=True)

# size : getting the size and coverage and warning if there's too much stdev
if exists(size) :
file = size
print("Adding coverage information from sizes in " + file)
new_df = pd.read_csv(file, dtype = str, index_col= False)
new_df['datasets_size'] = new_df['datasets_size'].astype('float').astype('Int32')
new_df['expected_size'] = new_df['expected_size'].astype('float').astype('Int32')
new_df['mash_size'] = new_df['mash_size'].astype('float').astype('Int32')
new_df['quast_size'] = new_df['quast_size'].astype('float').astype('Int32')
new_df['average'] = new_df[["datasets_size","expected_size","mash_size", "quast_size"]].mean(axis = 1, skipna = True)
new_df['stdev'] = new_df[["datasets_size","expected_size","mash_size", "quast_size"]].std(axis = 1, skipna = True)
new_df['stdev_ratio'] = new_df['stdev']/new_df['average']
new_df['warning'] = new_df['stdev_ratio'].apply(lambda x: "Variable genome size," if x >= 0.1 else "")
new_df = new_df.add_prefix("size_")

summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on="size_sample", how = 'left')
summary_df.drop("size_sample", axis=1, inplace=True)
summary_df.drop("size_genus", axis=1, inplace=True)
summary_df.drop("size_species", axis=1, inplace=True)
summary_df.drop("size_accession", axis=1, inplace=True)
summary_df['warnings'] = summary_df['warnings'] + summary_df['size_warning']

if "fastqc_total sequences" in summary_df:
summary_df['total_bases'] = summary_df['fastqc_total sequences'].astype('Int32') * summary_df['fastqc_avg_length'].astype(float)
else:
summary_df['total_bases'] = summary_df['quast_Total length'].astype(float)

summary_df['coverage'] = summary_df['total_bases'].astype(float) / summary_df['size_size'].astype(float)
summary_df['coverage_for_1.5M_genome'] = summary_df['total_bases'].astype(float) / 1500000
summary_df['coverage_for_2M_genome'] = summary_df['total_bases'].astype(float) / 2000000
summary_df['coverage_for_2.5M_genome'] = summary_df['total_bases'].astype(float) / 2500000
summary_df['coverage_for_3M_genome'] = summary_df['total_bases'].astype(float) / 3000000
summary_df['coverage_for_3.5M_genome'] = summary_df['total_bases'].astype(float) / 3500000
summary_df['coverage_for_4M_genome'] = summary_df['total_bases'].astype(float) / 4000000
summary_df['coverage_for_4.5M_genome'] = summary_df['total_bases'].astype(float) / 4500000
summary_df['coverage_for_5M_genome'] = summary_df['total_bases'].astype(float) / 5000000
summary_df['coverage_for_5.5M_genome'] = summary_df['total_bases'].astype(float) / 5500000
summary_df['coverage_for_6M_genome'] = summary_df['total_bases'].astype(float) / 6000000
summary_df['coverage_for_6.5M_genome'] = summary_df['total_bases'].astype(float) / 6500000
summary_df['coverage_for_7M_genome'] = summary_df['total_bases'].astype(float) / 7000000
summary_df['coverage_for_7.5M_genome'] = summary_df['total_bases'].astype(float) / 7500000
summary_df['coverage_for_8M_genome'] = summary_df['total_bases'].astype(float) / 8000000
summary_df['coverage_for_8.5M_genome'] = summary_df['total_bases'].astype(float) / 8500000
summary_df['coverage_for_9M_genome'] = summary_df['total_bases'].astype(float) / 9000000
summary_df['coverage_for_9.5M_genome'] = summary_df['total_bases'].astype(float) / 9500000
summary_df['coverage_for_10M_genome'] = summary_df['total_bases'].astype(float) / 10000000
summary_df['coverage_warning'] = summary_df['coverage'].apply(lambda x: "Low coverage," if x <= 20 else "")
summary_df['warnings'] = summary_df['warnings'] + summary_df['coverage_warning']

##########################################
# creating files #
##########################################
@@ -225,11 +325,15 @@
# reducing to the top 1 or 2 results for each analysis
final_columns = [
# general information
'coverage',
'fastqc_total_sequences',
'fastqc_flagged_sequences',
'fastqscan_coverage',
'fastqc_avg_length',
'fastp_passed_reads',
'bbduk_phix_reads',
'quast_#_contigs',
'quast_GC_(%)',
'warnings',
'amrfinder_genes_(per_cov/per_ident)',

# species
@@ -266,5 +370,5 @@
if new_column in summary_df.columns :
set_columns.append(new_column)

summary_df.to_csv(final + '.tsv', columns = ['sample','file','version','reads','phix_reads'] + set_columns, index=False, sep="\t")
summary_df.to_csv(final + '.txt', columns = ['sample','file','version','reads','phix_reads'] + set_columns, index=False, sep=";")
summary_df.to_csv(final + '.tsv', columns = ['sample','file','version'] + set_columns, index=False, sep="\t")
summary_df.to_csv(final + '.txt', columns = ['sample','file','version'] + set_columns, index=False, sep=";")
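
For readers tracing the new QC logic in bin/summary.py, here is a minimal standalone sketch of the size-variance and coverage warnings added above (toy numbers throughout; only the column names and thresholds come from the diff):

import pandas as pd

# toy size estimates (in bases) for two samples
sizes = pd.DataFrame({
    'sample':        ['consistent', 'variable'],
    'datasets_size': [4_000_000, 4_000_000],
    'expected_size': [4_100_000, 2_900_000],
    'mash_size':     [3_950_000, 5_200_000],
    'quast_size':    [4_050_000, 4_400_000],
})

cols = ['datasets_size', 'expected_size', 'mash_size', 'quast_size']
sizes['average']     = sizes[cols].mean(axis=1, skipna=True)
sizes['stdev']       = sizes[cols].std(axis=1, skipna=True)
sizes['stdev_ratio'] = sizes['stdev'] / sizes['average']
# estimates that disagree by 10% or more of their mean get flagged
sizes['warning'] = sizes['stdev_ratio'].apply(lambda x: 'Variable genome size,' if x >= 0.1 else '')

# coverage = total sequenced bases / estimated genome size; <= 20x is flagged
sizes['coverage'] = 90_000_000 / sizes['average']
sizes['coverage_warning'] = sizes['coverage'].apply(lambda x: 'Low coverage,' if x <= 20 else '')
print(sizes[['sample', 'stdev_ratio', 'warning', 'coverage', 'coverage_warning']])

Per-analysis flags built this way are concatenated into the single warnings column that now appears in the final summary.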
15 changes: 8 additions & 7 deletions grandeur.nf
@@ -29,6 +29,11 @@ if (params.config_file) {
exit 0
}

params.fastqscan_options = ""
if (params.fastqscan_options) {
println("WARNING : params.fastqscan_options is deprecated")
}

// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

// Defining params
@@ -84,7 +89,6 @@ params.fastani_options = "--matrix"
params.fasterqdump_options = ""
params.fastp_options = "--detect_adapter_for_pe"
params.fastqc_options = ""
params.fastqscan_options = ""
params.iqtree2_options = "-t RANDOM -m GTR+F+I -bb 1000 -alrt 1000"
params.iqtree2_outgroup = ""
params.kaptive_options = ''
@@ -325,9 +329,8 @@ workflow {

ch_contigs
.join(min_hash_distance.out.mash_err)
.map(it -> tuple (it[0], it[2]))
.join(ch_top_hit_hit, by: 0, remainder: true)
.join(ch_top_hit_files, by: 0, remainder: true)
.join(min_hash_distance.out.for_flag)
.join(average_nucleotide_identity.out.for_flag)
.combine(ch_genome_sizes)
.combine(ch_datasets)
.set{ ch_size }
@@ -359,9 +362,7 @@ workflow {
ch_raw_reads,
ch_fastas,
ch_for_multiqc.collect(),
ch_for_summary.concat(summary_script).collect(),
de_novo_alignment.out.fastp_reads,
de_novo_alignment.out.phix_reads)
ch_for_summary.concat(summary_script).collect())
}
}
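
At the top of the grandeur.nf diff, the retired params.fastqscan_options key is still accepted but now only triggers a warning, so existing profiles do not fail outright. A rough Python analogue of that accept-but-warn guard (the parameter name comes from the diff; everything else is illustrative):

# accept-but-warn guard for a retired option (illustrative sketch)
def check_deprecated(params: dict) -> None:
    value = params.get('fastqscan_options', '')
    if value:
        print(f"WARNING : fastqscan_options is deprecated and ignored (was '{value}')")

check_deprecated({'fastqscan_options': ''})             # silent: option unset
check_deprecated({'fastqscan_options': '--some-flag'})  # prints the warning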

3 changes: 0 additions & 3 deletions modules/bbmap.nf
@@ -17,7 +17,6 @@ process bbduk {
path "bbduk/*", emit: files
path "bbduk/${sample}.phix.stats.txt", emit: stats
path "logs/${task.process}/${sample}.${workflow.sessionId}.log", emit: log
tuple val(sample), env(phix_reads), emit: phix_reads

shell:
'''
@@ -41,8 +40,6 @@ process bbduk {
stats=bbduk/!{sample}.phix.stats.txt \
threads=!{task.cpus} \
| tee -a $log_file
phix_reads=$(grep Matched bbduk/!{sample}.phix.stats.txt | cut -f 2)
'''
}
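
The shell extraction removed above (grep Matched | cut -f 2) is now handled downstream by parsing MultiQC's bbmap stats in bin/summary.py. For reference, the same extraction as a small Python sketch, assuming bbduk's usual tab-delimited stats lines (key, count, percent):

# pull the matched (phiX) read count from a bbduk stats file;
# mirrors the removed `grep Matched ... | cut -f 2` one-liner
def matched_reads(stats_path: str) -> int:
    with open(stats_path) as handle:
        for line in handle:
            if 'Matched' in line:
                return int(line.rstrip('\n').split('\t')[1])
    return 0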

