Merge pull request #84 from UPHL-BioNGS/update-2023-07-07
Update
erinyoung authored Jul 12, 2023
2 parents e466edd + 1353ce0 commit 28da941
Showing 12 changed files with 243 additions and 25 deletions.
145 changes: 145 additions & 0 deletions bin/HeatCluster.py
@@ -0,0 +1,145 @@
#!/bin/python3

##########################################
# written by Stephen Beckstrom-Sternberg #
# for creating SNP heatmaps for Grandeur #
##########################################

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from io import StringIO

# Read the SNP matrix file
with open("snp_matrix.txt", "r") as infile:
    lines = infile.readlines()
numSamples = len(lines) - 1  # counts data lines

# Remove 'snp-dists 0.8.2', '_contigs' and '_genomic', & replace commas with tabs
cleaned_lines = [line.replace('snp-dists 0.8.2\t', '').replace('snp-dists 0.8.2,', '')
                     .replace(",", "\t").replace('_contigs', '').replace('_genomic', '')
                 for line in lines]

# Combine the cleaned lines into a single string instead of a file
snp_matrix_string = "\n".join(cleaned_lines)

# Read the tab-delimited string into a DataFrame
df = pd.read_csv(StringIO(snp_matrix_string), sep='\t')

#Define colormap for heatmap
cmap = 'Reds_r'

# Add Total_SNPs column for the sum of SNPs in each row
df['Total_SNPs'] = df.sum(numeric_only=True, axis="columns")

# Sort the DataFrame by Total SNPs in each row
TotalRow_snps = df.sort_values(by='Total_SNPs', kind="mergesort", axis="rows")

#Remove Total_SNPs column
sorted_cluster_matrix = TotalRow_snps.drop(['Total_SNPs'], axis="columns")

# Reorder the columns to mirror the row order
sorted_cluster_matrix=sorted_cluster_matrix.reindex(columns=sorted_cluster_matrix.index)

#Change output figure size tuple based on number of samples
if numSamples <= 20:
    figureSize = (10, 8)
elif numSamples <= 40:
    figureSize = (20, 16)
elif numSamples <= 60:
    figureSize = (30, 24)
else:
    figureSize = (40, 32)
print("\n\nNumber of samples: ", numSamples, "\nFigure size: ", figureSize)

# Compute clusters
clusters = sch.linkage(sorted_cluster_matrix.values, method='complete', metric='euclidean')

# Create clustermap and get the order of rows and columns based on clustering
clustergrid = sns.clustermap(
    sorted_cluster_matrix,
    xticklabels=True,
    vmin=0,
    vmax=50,
    center=20,
    annot=True,
    annot_kws={'size': 6},
    cbar_kws={"pad": 0.5},
    cmap=cmap,
    linecolor="white",
    linewidths=.2,
    fmt='d',
    dendrogram_ratio=0.1,
    col_cluster=True,
    row_cluster=True,
    figsize=figureSize
)
plt.setp(clustergrid.ax_heatmap.get_xticklabels(), rotation=45, ha='right')

# Suppress printing of dendrogram along the y-axis
clustergrid.ax_row_dendrogram.set_visible(False)
clustergrid.ax_col_dendrogram.set_visible(False)

row_order = clustergrid.dendrogram_row.reordered_ind
col_order = row_order
# Sort the DataFrame based on the cluster order
sorted_df = sorted_cluster_matrix.iloc[row_order, col_order]

# Compute the number of SNPs within the cluster per row
within_cluster_snps = sorted_df.apply(lambda row: row[row < 500].sum(), axis=1)

# Add 'Within_Cluster_SNPs' column to the sorted DataFrame
sorted_df['Within_Cluster_SNPs'] = within_cluster_snps.values

# Calculate silhouette scores for different numbers of clusters
silhouette_scores = []

if numSamples < 11:
    upper_range = numSamples
else:
    upper_range = 11

for n_clusters in range(2, upper_range):
    kmeans = KMeans(n_clusters=n_clusters, n_init=10)
    cluster_labels = kmeans.fit_predict(sorted_df.values)
    silhouette_scores.append(silhouette_score(sorted_df.values, cluster_labels))

# Find the optimal number of clusters with the highest silhouette score
optimal_num_clusters = silhouette_scores.index(max(silhouette_scores)) + 2

# Use the optimal number of clusters to assign cluster labels and sort the DataFrame
kmeans = KMeans(n_clusters=optimal_num_clusters, n_init=10)
cluster_labels = kmeans.fit_predict(sorted_df.values)
sorted_df['Cluster'] = cluster_labels

# Sort the DataFrame first by 'Cluster' and then by 'Within_Cluster_SNPs'
sorted_df = sorted_df.sort_values(by=['Cluster', 'Within_Cluster_SNPs'], ascending=[True, True], kind="mergesort")

# Drop 'Cluster' and 'Within_Cluster_SNPs' columns
sorted_df = sorted_df.drop(['Cluster', 'Within_Cluster_SNPs'], axis='columns')
sorted_df = sorted_df.reindex(columns=sorted_df.index)

# Save the finally sorted, tab-delimited SNP matrix
sorted_df.to_csv('Final_snp_matrix.tsv', sep='\t', header=True, index=True)

# Create the reordered heatmap with correct values
heatmap = sns.clustermap(
    sorted_df, xticklabels=True, yticklabels=True, vmin=0, vmax=50, center=20,
    annot=True, annot_kws={'size': 6}, cbar_kws={"orientation": "vertical", "pad": 0.5},
    cmap=cmap, linecolor="white", linewidths=.1, fmt='d', dendrogram_ratio=0.1,
    col_cluster=False, row_cluster=False, figsize=figureSize
)
plt.setp(heatmap.ax_heatmap.get_xticklabels(), rotation=45, ha='right')

# Save the reordered heatmap as a PDF
heatmap.savefig('SNP_matrix.pdf')
heatmap.savefig('SNP_matrix.png')
heatmap.savefig('SNP_matrix_mqc.png')

plt.show()
plt.close()

print("Saved heatmap as Heatmap.{pdf,png}")
22 changes: 21 additions & 1 deletion bin/summary.py
@@ -111,6 +111,7 @@
print("Adding results for " + file)
analysis = "amrfinder"
new_df = pd.read_table(file, dtype = str, index_col= False)
new_df = new_df.sort_values('Gene symbol')
new_df['genes (per cov/per ident)'] = new_df['Gene symbol'] + ' (' + new_df['% Coverage of reference sequence'] + '/' + new_df['% Identity to reference sequence'] + ')'
new_df = new_df[['Name', 'genes (per cov/per ident)']]
new_df = new_df.groupby('Name', as_index=False).agg({'genes (per cov/per ident)': lambda x: list(x)})
@@ -125,13 +126,22 @@
analysis = "blobtools"
new_df = pd.read_table(file, dtype = str, index_col= False)
new_df = new_df[new_df['name'] != 'all']
new_df = new_df.sort_values('bam0_read_map_p', ascending = False)

tmp_df = new_df.drop_duplicates(subset='sample', keep="first").copy()
tmp_df = new_df['sample', 'name']
tmp_df['top_organism'] = tmp_df['name']
tmp_df = tmp_df.add_prefix(analysis + '_')

new_df['organism (per mapped reads)'] = new_df['name'] + ' (' + new_df['bam0_read_map_p'] + ')'
new_df = new_df[['sample', 'organism (per mapped reads)']]
new_df = new_df.groupby('sample', as_index=False).agg({'organism (per mapped reads)': lambda x: list(x)})
new_df['warning'] = new_df['organism (per mapped reads)'].apply(lambda x: "Blobtools multiple organisms," if ','.join(x).count(',') >= 2 else "")
new_df = new_df.add_prefix(analysis + '_')

summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df.drop(analysis + "_sample", axis=1, inplace=True)
summary_df = pd.merge(summary_df, tmp_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
summary_df['warnings'] = summary_df['warnings'] + summary_df['blobtools_warning']

# fastani : merging relevant rows into one
@@ -175,13 +185,22 @@
print("Adding results for " + file)
analysis = "kraken2"
new_df = pd.read_csv(file, dtype = str, index_col= False)
new_df = new_df.sort_values('Sample', 'Percentage of fragments', ascending= False)
new_df['top_organism'] = new_df['Scientific name']

tmp_df = new_df['Sample', 'top_organism'].copy
tmp_df = tmp_df.drop_duplicates(subset=['Sample'], keep="first")
tmp_df = tmp_df.add_prefix(analysis + '_')

new_df['organism (per fragment)'] = new_df['Scientific name'] + " (" + new_df['Percentage of fragments'] + ' ' + new_df['Type'] + ")"
new_df = new_df[['Sample', 'organism (per fragment)']]
new_df = new_df.groupby('Sample', as_index=False).agg({'organism (per fragment)': lambda x: list(x)})
new_df['warning'] = new_df['organism (per fragment)'].apply(lambda x: "Kraken2 multiple organisms," if ','.join(x).count(',') >= 2 else "")
new_df = new_df.add_prefix(analysis + '_')
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_Sample", how = 'left')
summary_df.drop(analysis + "_Sample", axis=1, inplace=True)
summary_df = pd.merge(summary_df, tmp_df, left_on="sample", right_on=analysis + "_Sample", how = 'left')
summary_df.drop(analysis + "_Sample", axis=1, inplace=True)
summary_df['warnings'] = summary_df['warnings'] + summary_df['kraken2_warning']

# mash : top hit of potentially two different files
@@ -190,7 +209,7 @@
print("Adding results for " + file)
analysis = "mash"
new_df = pd.read_csv(file, dtype = str, index_col= False)
new_df.sort_values(by = ['P-value', 'mash-distance'], ascending = [True, True], inplace=True)
new_df = new_df.sort_values(by = ['P-value', 'mash-distance'], ascending = [True, True])
new_df = new_df.drop_duplicates(subset=['sample'], keep='first')
new_df = new_df.add_prefix(analysis + "_")
summary_df = pd.merge(summary_df, new_df, left_on="sample", right_on=analysis + "_sample", how = 'left')
@@ -360,6 +379,7 @@
'amrfinder_genes_(per_cov/per_ident)',

# species
# TODO 'predicted_organism',
'mlst_matching_PubMLST_scheme',
'mlst_ST',
'mash_reference',
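
The blobtools and kraken2 additions above both lean on the same pandas pattern: build a per-hit string, collapse the hits into one list per sample with groupby/agg, flag samples that report more than one organism, prefix the columns with the analysis name, and left-merge onto the running summary. A minimal, self-contained sketch of that pattern with made-up data follows; the column names and values here are illustrative, not the exact ones in summary.py.

import pandas as pd

# Hypothetical per-organism hits, several rows per sample
hits = pd.DataFrame({
    "sample":   ["s1", "s1", "s2"],
    "name":     ["Escherichia coli", "Shigella flexneri", "Salmonella enterica"],
    "mapped_p": ["85.0", "10.0", "97.5"],
})
hits["organism (per mapped reads)"] = hits["name"] + " (" + hits["mapped_p"] + ")"

# Collapse to one row per sample, keeping every hit as a list
collapsed = (
    hits[["sample", "organism (per mapped reads)"]]
    .groupby("sample", as_index=False)
    .agg({"organism (per mapped reads)": list})
)

# Flag samples with more than one reported organism
collapsed["warning"] = collapsed["organism (per mapped reads)"].apply(
    lambda x: "multiple organisms," if len(x) > 1 else ""
)
collapsed = collapsed.add_prefix("blobtools_")

# Left-merge onto the running summary table and drop the duplicate key
summary = pd.DataFrame({"sample": ["s1", "s2", "s3"]})
summary = summary.merge(collapsed, left_on="sample", right_on="blobtools_sample", how="left")
summary = summary.drop("blobtools_sample", axis=1)
print(summary)

The same collapse/prefix/merge shape repeats for the kraken2 block, only keyed on "Sample" instead of "sample".
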
4 changes: 3 additions & 1 deletion grandeur.nf
@@ -146,7 +146,8 @@ include { test } from "./subworkflows/test"
// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

// Creating the summary files
summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file")
summary_script = Channel.fromPath(workflow.projectDir + "/bin/summary.py", type: "file")
snpmtrx_script = Channel.fromPath(workflow.projectDir + "/bin/HeatCluster.py", type: "file")

// ##### ##### ##### ##### ##### ##### ##### ##### ##### #####

@@ -349,6 +350,7 @@ workflow {
// optional subworkflow for comparing shared genes
if ( params.msa ) {
phylogenetic_analysis(
snpmtrx_script,
ch_contigs.ifEmpty([]),
ch_gffs.ifEmpty([]),
ch_top_hit.ifEmpty([]))
2 changes: 1 addition & 1 deletion modules/amrfinderplus.nf
@@ -2,7 +2,7 @@ process amrfinderplus {
tag "${sample}"
label "medcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/ncbi-amrfinderplus:3.11.11-2023-04-17.1'
container 'staphb/ncbi-amrfinderplus:3.11.14-2023-04-17.1'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA cpus 7
5 changes: 3 additions & 2 deletions modules/datasets.nf
@@ -13,7 +13,7 @@ process datasets_summary {
val(taxon)

output:
path "datasets/${taxon}_genomes.csv" , emit: genomes
path "datasets/*_genomes.csv" , emit: genomes
path "logs/${task.process}/${taxon}.${workflow.sessionId}.log", emit: log

shell:
@@ -28,7 +28,8 @@
echo "Nextflow command : " >> $log_file
cat .command.sh >> $log_file
taxon=$(echo !{taxon} | tr "_" " ")
taxon="$(echo !{taxon} | tr '_' ' ' | sed 's/[//g' | sed 's/]//g' )"
echo "the taxon is now $taxon"
datasets summary genome taxon "$taxon" --reference --limit !{params.datasets_max_genomes} --as-json-lines | \
dataformat tsv genome --fields accession,assminfo-refseq-category,assminfo-level,organism-name,assmstats-total-ungapped-len | \
50 changes: 44 additions & 6 deletions modules/grandeur.nf
@@ -1,7 +1,7 @@
process species {
tag "Creating list of species"
publishDir params.outdir, mode: 'copy'
container 'quay.io/biocontainers/pandas:1.1.5'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -49,7 +49,7 @@ process species {
process decompression {
tag "Decompressing genome file"
publishDir params.outdir, mode: 'copy'
container 'quay.io/biocontainers/pandas:1.1.5'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
stageInMode 'copy'
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
@@ -90,7 +90,7 @@ process decompression {
process flag {
tag "${sample}"
publishDir params.outdir, mode: 'copy'
container 'quay.io/biocontainers/pandas:1.1.5'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -189,7 +189,7 @@ process flag {
process size {
tag "${sample}"
publishDir params.outdir, mode: 'copy'
container 'quay.io/biocontainers/pandas:1.1.5'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -332,7 +332,7 @@ process size {

process representative {
tag "${accession}"
container 'quay.io/biocontainers/pandas:1.1.5'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
@@ -364,4 +364,42 @@ process representative {
cp !{genomes}/*!{accession}* representative/.
'''
}
}

process snp_matrix_heatmap {
tag "heatmap"
publishDir params.outdir, mode: 'copy'
container 'quay.io/uphl/seaborn:0.12.2-2'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-medium'
//#UPHLICA memory 1.GB
//#UPHLICA cpus 3
//#UPHLICA time '10m'

input:
tuple file(snp_matrix), file(script)

output:
path "snp-dists/SNP_matrix*"
path "snp-dists/SNP_matrix_mqc.png" , emit: for_multiqc
path "logs/${task.process}/snp_matrix.${workflow.sessionId}.log" , emit: log_files

shell:
'''
mkdir -p snp-dists logs/!{task.process}
log_file=logs/!{task.process}/snp_matrix.!{workflow.sessionId}.log
# time stamp + capturing tool versions
date > $log_file
echo "container : !{task.container}" >> $log_file
echo "Nextflow command : " >> $log_file
cat .command.sh >> $log_file
python3 !{script}
mv SNP* snp-dists/.
'''
}


2 changes: 1 addition & 1 deletion modules/iqtree2.nf
@@ -2,7 +2,7 @@ process iqtree2 {
tag "Pylogenetic Analysis"
label "maxcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/iqtree2:2.1.2'
container 'staphb/iqtree2:2.2.2.6'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-xlarge'
4 changes: 2 additions & 2 deletions modules/kraken2.nf
@@ -2,7 +2,7 @@ process kraken2_fastq {
tag "${sample}"
label "maxcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/kraken2:2.1.2-no-db'
container 'staphb/kraken2:2.1.3'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-xlarge'
@@ -49,7 +49,7 @@ process kraken2_fasta {
tag "${sample}"
label "maxcpus"
publishDir params.outdir, mode: 'copy'
container 'staphb/kraken2:2.1.2-no-db'
container 'staphb/kraken2:2.1.3'
maxForks 10
//#UPHLICA errorStrategy { task.attempt < 2 ? 'retry' : 'ignore'}
//#UPHLICA pod annotation: 'scheduler.illumina.com/presetSize', value: 'standard-large'
