Merge pull request #84 from UPHL-BioNGS/update-2023-07-07
Update
erinyoung authored Jul 12, 2023
2 parents e466edd + 1353ce0 commit 28da941
Showing 12 changed files with 243 additions and 25 deletions.
145 changes: 145 additions & 0 deletions bin/HeatCluster.py
@@ -0,0 +1,145 @@
#!/bin/python3

##########################################
# written by Stephen Beckstrom-Sternberg #
# for creating SNP heatmaps for Grandeur #
##########################################

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from io import StringIO

# Read the SNP matrix file
with open("snp_matrix.txt", "r") as infile:
    lines = infile.readlines()
numSamples = len(lines) - 1  # counts data lines

# Remove 'snp-dists 0.8.2', '_contigs' and '_genomic', & replace commas with tabs
cleaned_lines = [line.replace('snp-dists 0.8.2\t', '').replace('snp-dists 0.8.2,', '')
                     .replace(",", "\t").replace('_contigs', '').replace('_genomic', '')
                 for line in lines]

# Combine the cleaned lines into a single string instead of a file
snp_matrix_string = "\n".join(cleaned_lines)

# Read the tab-delimited string into a DataFrame
df = pd.read_csv(StringIO(snp_matrix_string), sep='\t')

#Define colormap for heatmap
cmap = 'Reds_r'

# Add Total_SNPs column for the sum of SNPs in each row
df['Total_SNPs'] = df.sum(numeric_only=True, axis="columns")

# Sort the DataFrame by Total SNPs in each row
TotalRow_snps = df.sort_values(by='Total_SNPs', kind="mergesort", axis="rows")

#Remove Total_SNPs column
sorted_cluster_matrix = TotalRow_snps.drop(['Total_SNPs'], axis="columns")

# Reorder the columns to mirror the row order
sorted_cluster_matrix=sorted_cluster_matrix.reindex(columns=sorted_cluster_matrix.index)

#Change output figure size tuple based on number of samples
if numSamples <= 20:
    figureSize = (10, 8)
elif numSamples <= 40:
    figureSize = (20, 16)
elif numSamples <= 60:
    figureSize = (30, 24)
else:
    figureSize = (40, 32)
print("\n\nNumber of samples: ", numSamples, "\nFigure size: ", figureSize)

# Compute clusters
clusters = sch.linkage(sorted_cluster_matrix.values, method='complete', metric='euclidean')

# Create clustermap and get the order of rows and columns based on clustering
clustergrid = sns.clustermap(
    sorted_cluster_matrix,
    xticklabels=True,
    vmin=0,
    vmax=50,
    center=20,
    annot=True,
    annot_kws={'size': 6},
    cbar_kws={"pad": 0.5},
    cmap=cmap,
    linecolor="white",
    linewidths=.2,
    fmt='d',
    dendrogram_ratio=0.1,
    col_cluster=True,
    row_cluster=True,
    figsize=figureSize
)
plt.setp(clustergrid.ax_heatmap.get_xticklabels(), rotation=45, ha='right')

# Suppress printing of dendrogram along the y-axis
clustergrid.ax_row_dendrogram.set_visible(False)
clustergrid.ax_col_dendrogram.set_visible(False)

row_order = clustergrid.dendrogram_row.reordered_ind
col_order = row_order
# Sort the DataFrame based on the cluster order
sorted_df = sorted_cluster_matrix.iloc[row_order, col_order]

# Compute the number of SNPs within the cluster per row
within_cluster_snps = sorted_df.apply(lambda row: row[row < 500].sum(), axis=1)

# Add 'Within_Cluster_SNPs' column to the sorted DataFrame
sorted_df['Within_Cluster_SNPs'] = within_cluster_snps.values

# Calculate silhouette scores for different numbers of clusters
silhouette_scores = []

if numSamples < 11:
    upper_range = numSamples
else:
    upper_range = 11

for n_clusters in range(2, upper_range):
    kmeans = KMeans(n_clusters=n_clusters, n_init=10)
    cluster_labels = kmeans.fit_predict(sorted_df.values)
    silhouette_scores.append(silhouette_score(sorted_df.values, cluster_labels))

# Find the optimal number of clusters with the highest silhouette score
optimal_num_clusters = silhouette_scores.index(max(silhouette_scores)) + 2

# Use the optimal number of clusters to assign cluster labels and sort the DataFrame
kmeans = KMeans(n_clusters=optimal_num_clusters, n_init=10)
cluster_labels = kmeans.fit_predict(sorted_df.values)
sorted_df['Cluster'] = cluster_labels

# Sort the DataFrame first by 'Cluster' and then by 'Within_Cluster_SNPs'
sorted_df = sorted_df.sort_values(by=['Cluster', 'Within_Cluster_SNPs'], ascending=[True, True], kind="mergesort")

# Drop 'Cluster' and 'Within_Cluster_SNPs' columns
sorted_df = sorted_df.drop(['Cluster', 'Within_Cluster_SNPs'], axis='columns')
sorted_df = sorted_df.reindex(columns=sorted_df.index)

# Save the finally sorted, tab-delimited SNP matrix
sorted_df.to_csv('Final_snp_matrix.tsv', sep='\t', header=True, index=True)

# Create the reordered heatmap with correct values
heatmap = sns.clustermap(
    sorted_df, xticklabels=True, yticklabels=True, vmin=0, vmax=50, center=20,
    annot=True, annot_kws={'size': 6}, cbar_kws={"orientation": "vertical", "pad": 0.5},
    cmap=cmap, linecolor="white", linewidths=.1, fmt='d', dendrogram_ratio=0.1,
    col_cluster=False, row_cluster=False, figsize=figureSize
)
plt.setp(heatmap.ax_heatmap.get_xticklabels(), rotation=45, ha='right')

# Save the reordered heatmap as a PDF
heatmap.savefig('SNP_matrix.pdf')
heatmap.savefig('SNP_matrix.png')
heatmap.savefig('SNP_matrix_mqc.png')

plt.show()
plt.close()

print("Saved heatmap as Heatmap.{pdf,png}")
22 changes: 21 additions & 1 deletion bin/summary.py
@@ -111,6 +111,7 @@
print("Adding results for " + file)
analysis = "amrfinder"
new_df = pd.read_table(file, dtype = str, index_col= False)
new_df = new_df.sort_values('Gene symbol')
new_df['genes (per cov/per ident)'] = new_df['Gene symbol'] + ' (' + new_df['% Coverage of reference sequence'] + '/' + new_df['% Identity to reference sequence'] + ')'
new_df = new_df[['Name', 'genes (per cov/per ident)']]
new_df = new_df.groupby('Name', as_index=False).agg({'genes (per cov/per ident)': lambda x: list(x)})
@@ -125,13 +126,22 @@
analysis = "blobtools"
new_df = pd.read_table(file, dtype = str, index_col= False)
new_df = new_df[new_df['name'] != 'all']
new_df = new_df.sort_values('bam0_read_map_p', ascending = False)

tmp_df = new_df.drop_duplicates(subset='sample', keep="first").copy()
tmp_df = new_df['sample', 'name']
tmp_df['top_organism'] = tmp_df['name']
tmp_df = tmp_df.add_prefix(analysis + '_')

new_df['organism (per mapped reads)'] = new_df['name'] + ' (' + new_df['bam0_read_map_p'] + ')'
new_df = new_df[['sample', 'organism (per mapped reads)']]
new_df = new_df.groupby('sample', as_index=False).agg({'organism (per mapped reads)': lambda x: list(x)})
new_df['warning'] = new_df['organism (per mapped reads)'].apply(lambda x: "Blobtools multiple organisms," if ','.join(x).count(',') >= 2 else "")
new_df = new_df.add_prefix(analysis + '_')

summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df.drop(analysis + "_sample", axis=1, inplace=True)
summary_df = pd.merge(summary_df, tmp_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df['warnings'] = summary_df['warnings'] + summary_df['blobtools_warning']

# fastani : merging relevant rows into one
@@ -175,13 +185,22 @@
print("Adding results for " + file)
analysis = "kraken2"
new_df = pd.read_csv(file, dtype = str, index_col= False)
new_df = new_df.sort_values('Sample', 'Percentage of fragments', ascending= False)
new_df['top_organism'] = new_df['Scientific name']

tmp_df = new_df['Sample', 'top_organism'].copy
tmp_df = tmp_df.drop_duplicates(subset=['Sample'], keep="first")
tmp_df = tmp_df.add_prefix(analysis + '_')

new_df['organism (per fragment)'] = new_df['Scientific name'] + " (" + new_df['Percentage of fragments'] + ' ' + new_df['Type'] + ")"
new_df = new_df[['Sample', 'organism (per fragment)']]
new_df = new_df.groupby('Sample', as_index=False).agg({'organism (per fragment)': lambda x: list(x)})
new_df['warning'] = new_df['organism (per fragment)'].apply(lambda x: "Kraken2 multiple organisms," if ','.join(x).count(',') >= 2 else "")
new_df = new_df.add_prefix(analysis + '_')
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_Sample", how = 'left')
summary_df.drop(analysis + "_Sample", axis=1, inplace=True)
summary_df = pd.merge(summary_df, tmp_df, left_on="sample", right_on=analysis + "_Sample", how = 'left')
summary_df.drop(analysis + "_Sample", axis=1, inplace=True)
summary_df['warnings'] = summary_df['warnings'] + summary_df['kraken2_warning']

# mash : top hit of potentially two different files
@@ -190,7 +209,7 @@
print("Adding results for " + file)
analysis = "mash"
new_df = pd.read_csv(file, dtype = str, index_col= False)
new_df.sort_values(by = ['P-value', 'mash-distance'], ascending = [True, True], inplace=True)
new_df = new_df.sort_values(by = ['P-value', 'mash-distance'], ascending = [True, True])
new_df = new_df.drop_duplicates(subset=['sample'], keep='first')
new_df = new_df.add_prefix(analysis + "_")
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
@@ -360,6 +379,7 @@
'amrfinder_genes_(per_cov/per_ident)',

# species
# TODO 'predicted_organism',
'mlst_matching_PubMLST_scheme',
'mlst_ST',
'mash_reference',
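
The blobtools and kraken2 additions above both lean on the same pandas pattern: build a per-hit string, collapse the hits into one list per sample with groupby/agg, flag samples that report more than one organism, prefix the columns with the analysis name, and left-merge onto the running summary. A minimal, self-contained sketch of that pattern with made-up data follows; the column names and values here are illustrative, not the exact ones in summary.py.

import pandas as pd

# Hypothetical per-organism hits, several rows per sample
hits = pd.DataFrame({
    "sample":   ["s1", "s1", "s2"],
    "name":     ["Escherichia coli", "Shigella flexneri", "Salmonella enterica"],
    "mapped_p": ["85.0", "10.0", "97.5"],
})
hits["organism (per mapped reads)"] = hits["name"] + " (" + hits["mapped_p"] + ")"

# Collapse to one row per sample, keeping every hit as a list
collapsed = (
    hits[["sample", "organism (per mapped reads)"]]
    .groupby("sample", as_index=False)
    .agg({"organism (per mapped reads)": list})
)

# Flag samples with more than one reported organism
collapsed["warning"] = collapsed["organism (per mapped reads)"].apply(
    lambda x: "multiple organisms," if len(x) > 1 else ""
)
collapsed = collapsed.add_prefix("blobtools_")

# Left-merge onto the running summary table and drop the duplicate key
summary = pd.DataFrame({"sample": ["s1", "s2", "s3"]})
summary = summary.merge(collapsed, left_on="sample", right_on="blobtools_sample", how="left")
summary = summary.drop("blobtools_sample", axis=1)
print(summary)

The same collapse/prefix/merge shape repeats for the kraken2 block, only keyed on "Sample" instead of "sample".
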
4 changes: 3 additions & 1 deletion grandeur.nf
@@ -146,7 +146,8 @@ include { test } from "./subworkflows/test"
// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

// Creating the summary files
summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file")
summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file")
snpmtrx_script = Channel.fromPath(workflow.projectDir + "/bin/HeatCluster.py", type: "file")

// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

@@ -349,6 +350,7 @@ workflow {
// optional subworkflow for comparing shared genes
if ( params.msa ) {
phylogenetic_analysis(
snpmtrx_script,
ch_contigs.ifEmpty([]),
ch_gffs.ifEmpty([]),
ch_top_hit.ifEmpty([]))
2 changes: 1 addition & 1 deletion modules/amrfinderplus.nf
@@ -2,7 +2,7 @@ process amrfinderplus {
tag "${sample}"
label "medcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/ncbi-amrfinderplus:3.11.11-2023-04-17.1'
container 'staphb/ncbi-amrfinderplus:3.11.14-2023-04-17.1'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA cpus 7
5 changes: 3 additions & 2 deletions modules/datasets.nf
@@ -13,7 +13,7 @@ process datasets_summary {
val(taxon)

output:
path "datasets/${taxon}_genomes.csv" , emit: genomes
path "datasets/*_genomes.csv" , emit: genomes
path "logs/${task.process}/${taxon}.${workflow.sessionId}.log", emit: log

shell:
@@ -28,7 +28,8 @@
echo "Nextflow command : " >> $log_file
cat .command.sh >> $log_file
taxon=$(echo !{taxon} | tr "_" " ")
taxon="$(echo !{taxon} | tr '_' ' ' | sed 's/[//g' | sed 's/]//g' )"
echo "the taxon is now $taxon"
datasets summary genome taxon "$taxon" --reference --limit !{params.datasets_max_genomes} --as-json-lines | \
dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
50 changes: 44 additions & 6 deletions modules/grandeur.nf
@@ -1,7 +1,7 @@
process species {
tag "Creating list of species"
publishDir params.outdir, mode: 'copy'
container 'quay.io/biocontainers/pandas:1.1.5'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -49,7 +49,7 @@ process species {
process decompression {
tag "Decompressing genome file"
publishDir params.outdir, mode: 'copy'
container 'quay.io/biocontainers/pandas:1.1.5'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
stageInMode 'copy'
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
@@ -90,7 +90,7 @@ process decompression {
process flag {
tag "${sample}"
publishDir params.outdir, mode: 'copy'
container 'quay.io/biocontainers/pandas:1.1.5'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -189,7 +189,7 @@ process flag {
process size {
tag "${sample}"
publishDir params.outdir, mode: 'copy'
container 'quay.io/biocontainers/pandas:1.1.5'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -332,7 +332,7 @@ process size {

process representative {
tag "${accession}"
container 'quay.io/biocontainers/pandas:1.1.5'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -364,4 +364,42 @@ process representative {
cp !{genomes}/*!{accession}* representative/.
'''
}
}

process snp_matrix_heatmap {
tag "heatmap"
publishDir params.outdir, mode: 'copy'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
//#UPHLICA memory 1.GB
//#UPHLICA cpus 3
//#UPHLICA time '10m'

input:
tuple file(snp_matrix), file(script)

output:
path "snp-dists/SNP_matrix*"
path "snp-dists/SNP_matrix_mqc.png" , emit: for_multiqc
path "logs/${task.process}/snp_matrix.${workflow.sessionId}.log" , emit: log_files

shell:
'''
mkdir -p snp-dists logs/!{task.process}
log_file=logs/!{task.process}/snp_matrix.!{workflow.sessionId}.log
# time stamp + capturing tool versions
date > $log_file
echo "container : !{task.container}" >> $log_file
echo "Nextflow command : " >> $log_file
cat .command.sh >> $log_file
python3 !{script}
mv SNP* snp-dists/.
'''
}


2 changes: 1 addition & 1 deletion modules/iqtree2.nf
@@ -2,7 +2,7 @@ process iqtree2 {
tag "Pylogenetic Analysis"
label "maxcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/iqtree2:2.1.2'
container 'staphb/iqtree2:2.2.2.6'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-xlarge'
4 changes: 2 additions & 2 deletions modules/kraken2.nf
@@ -2,7 +2,7 @@ process kraken2_fastq {
tag "${sample}"
label "maxcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/kraken2:2.1.2-no-db'
container 'staphb/kraken2:2.1.3'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-xlarge'
@@ -49,7 +49,7 @@ process kraken2_fasta {
tag "${sample}"
label "maxcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/kraken2:2.1.2-no-db'
container 'staphb/kraken2:2.1.3'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-large'
