Merge pull request #67 from UPHL-BioNGS/update-20230523
Update 20230523
erinyoung authored May 25, 2023
2 parents b812f58 + cf91d31 commit 24782d5
Showing 12 changed files with 238 additions and 96 deletions.
120 changes: 112 additions & 8 deletions bin/summary.py
@@ -6,6 +6,7 @@
##########################################

import pandas as pd
import json
from os.path import exists

##########################################
@@ -20,7 +21,6 @@
emmtyper = 'emmtyper_summary.tsv'
fastani = 'fastani_summary.csv'
fastqc = 'fastqc_summary.csv'
fastqscan = 'fastqscan_summary.csv'
kleborate = 'kleborate_results.tsv'
kraken2 = 'kraken2_summary.csv'
legsta = 'legsta_summary.csv'
@@ -32,25 +32,28 @@
seqsero2 = 'seqsero2_results.txt'
serotypefinder = 'serotypefinder_results.txt'
shigatyper = 'shigatyper_results.txt'
size = 'size_results.csv'
multiqc_json = 'multiqc_data.json'
multiqc_stats = 'multiqc_general_stats.txt'

# final files
final = 'grandeur_summary'
extended = 'grandeur_extended_summary'
extended = 'summary/grandeur_extended_summary'

##########################################
# grouping similar files #
##########################################

csv_files = [ fastqscan, legsta ]
csv_files = [ legsta ]
tsv_files = [ quast, seqsero2, kleborate, mlst, emmtyper , pbptyper]

top_hit = [ fastani ]
top_hit = [ fastani ]

##########################################
# creating the summary dataframe #
##########################################

summary_df = pd.read_csv(names, dtype = str)
summary_df['warnings'] = ''
columns = list(summary_df.columns)

# csv files
@@ -109,9 +112,11 @@
new_df['organism (per mapped reads)'] = new_df['name'] + ' (' + new_df['bam0_read_map_p'] + ')'
new_df = new_df[['sample', 'organism (per mapped reads)']]
new_df = new_df.groupby('sample', as_index=False).agg({'organism (per mapped reads)': lambda x: list(x)})
new_df['warning'] = new_df['organism (per mapped reads)'].apply(lambda x: "Blobtools multiple organisms," if ','.join(x).count(',') >= 2 else "")
new_df = new_df.add_prefix(analysis + '_')
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df.drop(analysis + "_sample", axis=1, inplace=True)
summary_df['warnings'] = summary_df['warnings'] + summary_df['blobtools_warning']

# fastani : merging relevant rows into one
if exists(fastani) :
@@ -122,9 +127,11 @@
new_df['genome (ANI estimate)'] = new_df['reference'].str.split('_').str[0] + " " + new_df['reference'].str.split("_").str[1] + " " + new_df['reference'].str.split('_').str[-2] + "_" + new_df['reference'].str.split('_').str[-1] + " (" + new_df['ANI estimate'] + ")"
new_df = new_df[['sample', 'genome (ANI estimate)']]
new_df = new_df.groupby('sample', as_index=False).agg({'genome (ANI estimate)': lambda x: list(x)})
new_df['warning'] = new_df['genome (ANI estimate)'].apply(lambda x: "Multiple FastANI hits," if ','.join(x).count(',') >= 4 else "")
new_df = new_df.add_prefix(analysis + '_')
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df.drop(analysis + "_sample", axis=1, inplace=True)
summary_df['warnings'] = summary_df['warnings'] + summary_df['fastani_warning']

# fastqc : merging relevant rows into one
if exists(fastqc) :
@@ -155,9 +162,11 @@
new_df['organism (per fragment)'] = new_df['Scientific name'] + " (" + new_df['Percentage of fragments'] + ' ' + new_df['Type'] + ")"
new_df = new_df[['Sample', 'organism (per fragment)']]
new_df = new_df.groupby('Sample', as_index=False).agg({'organism (per fragment)': lambda x: list(x)})
new_df['warning'] = new_df['organism (per fragment)'].apply(lambda x: "Kraken2 multiple organisms," if ','.join(x).count(',') >= 2 else "")
new_df = new_df.add_prefix(analysis + '_')
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_Sample", how = 'left')
summary_df.drop(analysis + "_Sample", axis=1, inplace=True)
summary_df['warnings'] = summary_df['warnings'] + summary_df['kraken2_warning']

# mash : top hit of potentially two different files
if exists(mash) :
@@ -213,6 +222,97 @@
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df.drop(analysis + "_sample", axis=1, inplace=True)

# multiqc : bbduk and fastp

if exists(multiqc_json) :
file = multiqc_json
print("Adding analysis parsed via multiqc in " + file)
with open(file) as multiqc_data:
data = json.load(multiqc_data)

# fastp filtered reads
if "fastp_filtered_reads_plot" in data["report_plot_data"].keys():
samples = [sample.replace("_rmphix_R1", "") for sample in data["report_plot_data"]['fastp_filtered_reads_plot']['samples'][0]]
fastp_passedreads_df = pd.DataFrame(samples, columns=['fastp_sample'])
fastp_passedreads_df['fastp_passed_reads'] = data["report_plot_data"]['fastp_filtered_reads_plot']['datasets'][0][0]['data']
summary_df = pd.merge(summary_df, fastp_passedreads_df, left_on="sample", right_on="fastp_sample", how = 'left')
summary_df.drop("fastp_sample", axis=1, inplace=True)

# bbduk phix reads
if "bbmap" in data['report_saved_raw_data'].keys():
print("Adding in phix reads from bbmap")
samples = [sample.replace(".phix", "") for sample in data['report_saved_raw_data']['bbmap']['stats'].keys()]
phix_reads=[]
for sample in data['report_saved_raw_data']['bbmap']['stats'].keys() :
phix_reads.append(data['report_saved_raw_data']['bbmap']['stats'][sample]['kv']['Matched'])
bbduk_phixreads_df = pd.DataFrame(samples, columns=['bbduk_sample'])
bbduk_phixreads_df['bbduk_phix_reads'] = phix_reads
summary_df = pd.merge(summary_df, bbduk_phixreads_df, left_on="sample", right_on="bbduk_sample", how = 'left')
summary_df.drop("bbduk_sample", axis=1, inplace=True)

if exists(multiqc_stats) :
file = multiqc_stats
print("Adding analysis parsed via multiqc in " + file)
new_df = pd.read_table(file, dtype = str, index_col= False)
if "FastQC_mqc-generalstats-fastqc-avg_sequence_length" in new_df.columns :
tmp_df = new_df[["Sample","FastQC_mqc-generalstats-fastqc-avg_sequence_length"]]
tmp_df["fastqc_avg_length"]= tmp_df["FastQC_mqc-generalstats-fastqc-avg_sequence_length"]
tmp_df.drop("FastQC_mqc-generalstats-fastqc-avg_sequence_length", axis=1, inplace=True)

summary_df["possible_fastqc_name"] = summary_df['file'].str.split(" ").str[0].str.split(".").str[0]
summary_df = pd.merge(summary_df, tmp_df, left_on="possible_fastqc_name", right_on="Sample", how = 'left')
summary_df.drop("Sample", axis=1, inplace=True)
summary_df.drop("possible_fastqc_name", axis=1, inplace=True)

# size : getting the size and coverage and warning if there's too much stdev
if exists(size) :
file = size
print("Adding coverage information from sizes in " + file)
new_df = pd.read_csv(file, dtype = str, index_col= False)
new_df['datasets_size'] = new_df['datasets_size'].astype('float').astype('Int32')
new_df['expected_size'] = new_df['expected_size'].astype('float').astype('Int32')
new_df['mash_size'] = new_df['mash_size'].astype('float').astype('Int32')
new_df['quast_size'] = new_df['quast_size'].astype('float').astype('Int32')
new_df['average'] = new_df[["datasets_size","expected_size","mash_size", "quast_size"]].mean(axis = 1, skipna = True)
new_df['stdev'] = new_df[["datasets_size","expected_size","mash_size", "quast_size"]].std(axis = 1, skipna = True)
new_df['stdev_ratio'] = new_df['stdev']/new_df['average']
new_df['warning'] = new_df['stdev_ratio'].apply(lambda x: "Variable genome size," if x >= 0.1 else "")
new_df = new_df.add_prefix("size_")

summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on="size_sample", how = 'left')
summary_df.drop("size_sample", axis=1, inplace=True)
summary_df.drop("size_genus", axis=1, inplace=True)
summary_df.drop("size_species", axis=1, inplace=True)
summary_df.drop("size_accession", axis=1, inplace=True)
summary_df['warnings'] = summary_df['warnings'] + summary_df['size_warning']

if "fastqc_total sequences" in summary_df:
summary_df['total_bases'] = summary_df['fastqc_total sequences'].astype('Int32') * summary_df['fastqc_avg_length'].astype(float)
else:
summary_df['total_bases'] = summary_df['quast_Total length'].astype(float)

summary_df['coverage'] = summary_df['total_bases'].astype(float) / summary_df['size_size'].astype(float)
summary_df['coverage_for_1.5M_genome'] = summary_df['total_bases'].astype(float) / 1500000
summary_df['coverage_for_2M_genome'] = summary_df['total_bases'].astype(float) / 2000000
summary_df['coverage_for_2.5M_genome'] = summary_df['total_bases'].astype(float) / 2500000
summary_df['coverage_for_3M_genome'] = summary_df['total_bases'].astype(float) / 3000000
summary_df['coverage_for_3.5M_genome'] = summary_df['total_bases'].astype(float) / 3500000
summary_df['coverage_for_4M_genome'] = summary_df['total_bases'].astype(float) / 4000000
summary_df['coverage_for_4.5M_genome'] = summary_df['total_bases'].astype(float) / 4500000
summary_df['coverage_for_5M_genome'] = summary_df['total_bases'].astype(float) / 5000000
summary_df['coverage_for_5.5M_genome'] = summary_df['total_bases'].astype(float) / 5500000
summary_df['coverage_for_6M_genome'] = summary_df['total_bases'].astype(float) / 6000000
summary_df['coverage_for_6.5M_genome'] = summary_df['total_bases'].astype(float) / 6500000
summary_df['coverage_for_7M_genome'] = summary_df['total_bases'].astype(float) / 7000000
summary_df['coverage_for_7.5M_genome'] = summary_df['total_bases'].astype(float) / 7500000
summary_df['coverage_for_8M_genome'] = summary_df['total_bases'].astype(float) / 8000000
summary_df['coverage_for_8.5M_genome'] = summary_df['total_bases'].astype(float) / 8500000
summary_df['coverage_for_9M_genome'] = summary_df['total_bases'].astype(float) / 9000000
summary_df['coverage_for_9.5M_genome'] = summary_df['total_bases'].astype(float) / 9500000
summary_df['coverage_for_10M_genome'] = summary_df['total_bases'].astype(float) / 10000000
summary_df['coverage_warning'] = summary_df['coverage'].apply(lambda x: "Low coverage," if x <= 20 else "")
summary_df['warnings'] = summary_df['warnings'] + summary_df['coverage_warning']

##########################################
# creating files #
##########################################
@@ -225,11 +325,15 @@
# reducing to the top 1 or 2 results for each analysis
final_columns = [
# general information
'coverage',
'fastqc_total_sequences',
'fastqc_flagged_sequences',
'fastqscan_coverage',
'fastqc_avg_length',
'fastp_passed_reads',
'bbduk_phix_reads',
'quast_#_contigs',
'quast_GC_(%)',
'warnings',
'amrfinder_genes_(per_cov/per_ident)',

# species
@@ -266,5 +370,5 @@
if new_column in summary_df.columns :
set_columns.append(new_column)

summary_df.to_csv(final + '.tsv', columns = ['sample','file','version','reads','phix_reads'] + set_columns, index=False, sep="\t")
summary_df.to_csv(final + '.txt', columns = ['sample','file','version','reads','phix_reads'] + set_columns, index=False, sep=";")
summary_df.to_csv(final + '.tsv', columns = ['sample','file','version'] + set_columns, index=False, sep="\t")
summary_df.to_csv(final + '.txt', columns = ['sample','file','version'] + set_columns, index=False, sep=";")
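
For readers tracing the new QC logic in bin/summary.py, here is a minimal standalone sketch of the size-variance and coverage warnings added above (toy numbers throughout; only the column names and thresholds come from the diff):

import pandas as pd

# toy size estimates (in bases) for two samples
sizes = pd.DataFrame({
    'sample':        ['consistent', 'variable'],
    'datasets_size': [4_000_000, 4_000_000],
    'expected_size': [4_100_000, 2_900_000],
    'mash_size':     [3_950_000, 5_200_000],
    'quast_size':    [4_050_000, 4_400_000],
})

cols = ['datasets_size', 'expected_size', 'mash_size', 'quast_size']
sizes['average']     = sizes[cols].mean(axis=1, skipna=True)
sizes['stdev']       = sizes[cols].std(axis=1, skipna=True)
sizes['stdev_ratio'] = sizes['stdev'] / sizes['average']
# estimates that disagree by 10% or more of their mean get flagged
sizes['warning'] = sizes['stdev_ratio'].apply(lambda x: 'Variable genome size,' if x >= 0.1 else '')

# coverage = total sequenced bases / estimated genome size; <= 20x is flagged
sizes['coverage'] = 90_000_000 / sizes['average']
sizes['coverage_warning'] = sizes['coverage'].apply(lambda x: 'Low coverage,' if x <= 20 else '')
print(sizes[['sample', 'stdev_ratio', 'warning', 'coverage', 'coverage_warning']])

Per-analysis flags built this way are concatenated into the single warnings column that now appears in the final summary.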
15 changes: 8 additions & 7 deletions grandeur.nf
@@ -29,6 +29,11 @@ if (params.config_file) {
exit 0
}

params.fastqscan_options = ""
if (params.fastqscan_options) {
println("WARNING : params.fastqscan_options is deprecated")
}

// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

// Defining params
@@ -84,7 +89,6 @@ params.fastani_options = "--matrix"
params.fasterqdump_options = ""
params.fastp_options = "--detect_adapter_for_pe"
params.fastqc_options = ""
params.fastqscan_options = ""
params.iqtree2_options = "-t RANDOM -m GTR+F+I -bb 1000 -alrt 1000"
params.iqtree2_outgroup = ""
params.kaptive_options = ''
@@ -325,9 +329,8 @@ workflow {

ch_contigs
.join(min_hash_distance.out.mash_err)
.map(it -> tuple (it[0], it[2]))
.join(ch_top_hit_hit, by: 0, remainder: true)
.join(ch_top_hit_files, by: 0, remainder: true)
.join(min_hash_distance.out.for_flag)
.join(average_nucleotide_identity.out.for_flag)
.combine(ch_genome_sizes)
.combine(ch_datasets)
.set{ ch_size }
@@ -359,9 +362,7 @@ workflow {
ch_raw_reads,
ch_fastas,
ch_for_multiqc.collect(),
ch_for_summary.concat(summary_script).collect(),
de_novo_alignment.out.fastp_reads,
de_novo_alignment.out.phix_reads)
ch_for_summary.concat(summary_script).collect())
}
}
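
At the top of the grandeur.nf diff, the retired params.fastqscan_options key is still accepted but now only triggers a warning, so existing profiles do not fail outright. A rough Python analogue of that accept-but-warn guard (the parameter name comes from the diff; everything else is illustrative):

# accept-but-warn guard for a retired option (illustrative sketch)
def check_deprecated(params: dict) -> None:
    value = params.get('fastqscan_options', '')
    if value:
        print(f"WARNING : fastqscan_options is deprecated and ignored (was '{value}')")

check_deprecated({'fastqscan_options': ''})             # silent: option unset
check_deprecated({'fastqscan_options': '--some-flag'})  # prints the warning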

3 changes: 0 additions & 3 deletions modules/bbmap.nf
@@ -17,7 +17,6 @@ process bbduk {
path "bbduk/*", emit: files
path "bbduk/${sample}.phix.stats.txt", emit: stats
path "logs/${task.process}/${sample}.${workflow.sessionId}.log", emit: log
tuple val(sample), env(phix_reads), emit: phix_reads

shell:
'''
@@ -41,8 +40,6 @@ process bbduk {
stats=bbduk/!{sample}.phix.stats.txt \
threads=!{task.cpus} \
| tee -a $log_file
phix_reads=$(grep Matched bbduk/!{sample}.phix.stats.txt | cut -f 2)
'''
}
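
The shell extraction removed above (grep Matched | cut -f 2) is now handled downstream by parsing MultiQC's bbmap stats in bin/summary.py. For reference, the same extraction as a small Python sketch, assuming bbduk's usual tab-delimited stats lines (key, count, percent):

# pull the matched (phiX) read count from a bbduk stats file;
# mirrors the removed `grep Matched ... | cut -f 2` one-liner
def matched_reads(stats_path: str) -> int:
    with open(stats_path) as handle:
        for line in handle:
            if 'Matched' in line:
                return int(line.rstrip('\n').split('\t')[1])
    return 0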

