feat: generate snakemake report (#80)

* feat: report generation * feat: add report files to de_analysis output * feat: draft de_analysis report * fix: output pathing * style: snakefmt formatting * style: reworked report.rst files * feat: added NanoPlot output to report + style changes * fix: Using NanoPlot report in snakemake report * feat: added volcano plot to report * style: linting * style: clarified code
snakemake-workflows · Sep 13, 2024 · cd25504 · cd25504
1 parent a2e45bd
commit cd25504
Show file tree

Hide file tree

Showing 12 changed files with 75 additions and 12 deletions.
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -11,6 +11,9 @@ min_version("8.10.7")
 configfile: "config/config.yml"
 
 
+report: "report/workflow.rst"
+
+
 include: "rules/commons.smk"
 include: "rules/qc.smk"
 include: "rules/utils.smk"

diff --git a/workflow/report/correlation_matrix.rst b/workflow/report/correlation_matrix.rst
@@ -0,0 +1,3 @@
+ The correlation matrix is computed by calculating the pairwise Pearson correlations between genes across all samples.
+ The matrix is then used to create a heatmap that visualizes the strength of correlation, with hierarchical clustering applied to group samples with similar expression patterns into dendograms.
+ This can be used to identify clusters of samples that share similar gene expression patterns, which might suggest shared regulatory mechanisms or functional pathways.
diff --git a/workflow/report/dispersion_graph.rst b/workflow/report/dispersion_graph.rst
@@ -0,0 +1,3 @@
+ The dispersion plot is employed to assess the quality of the initial sequencing data, on the assumption that genes with analogous average expression strength will exhibit a comparable degree of dispersion.
+ The dispersion is initially estimated for each gene (black dots) and the idealized location paramter (red curve) is used to calculate the final dispersion values (blue dots). 
+ In an optimal scenario, the final dispersion values should scatter in close proximity to the location parameter. 
diff --git a/workflow/report/heatmap.rst b/workflow/report/heatmap.rst
@@ -0,0 +1,4 @@
+ Heatmap of normalized gene expression data across all samples, highlighting pattern expression levels.
+ The heatmap is generated using Seaborn's clustermap function which organizes data by performing hierarchical clustering on the gene row and sample columns based on their expression profiles.
+ This clustering groups together genes and samples with similar expression patterns, making it easier to identify clusters of co-expressed genes and similar samples.
+ The color intensity represents the level of gene expression.
diff --git a/workflow/report/heatmap_top.rst b/workflow/report/heatmap_top.rst
@@ -0,0 +1,2 @@
+ Top_heatmap of normalized gene expression counts.
+ The top_heatmap is based on the heatmap but focuses on the top {{ snakemake.config["threshold_plot"] }} genes with the most significant expression changes.
diff --git a/workflow/report/ma_graph.rst b/workflow/report/ma_graph.rst
@@ -0,0 +1,3 @@
+ The `MA plot, <https://en.wikipedia.org/wiki/MA_plot>`_ is created by computing p-values using Wald-tests.
+ The plot compares for each transcript the mean abundance across samples (x-axis) and the log2 foldchange as a ratio of expression between the two conditions (y-axis).
+ Genes with significant changes in expression that fall outside of the significance criteria lfc_null = {{ snakemake.config["lfc_null"] }} are highlighted in red.
diff --git a/workflow/report/nanoplot_all_samples_report.rst b/workflow/report/nanoplot_all_samples_report.rst
@@ -0,0 +1 @@
+Full `NanoPlot, <https://github.com/wdecoster/NanoPlot>`_ sequencing quality report for total samples .
diff --git a/workflow/report/nanoplot_sample_report.rst b/workflow/report/nanoplot_sample_report.rst
@@ -0,0 +1 @@
+Full `NanoPlot, <https://github.com/wdecoster/NanoPlot>`_ sequencing quality report for sample {{ snakemake.wildcards }}.
diff --git a/workflow/report/volcano_plot.rst b/workflow/report/volcano_plot.rst
@@ -0,0 +1,3 @@
+ Volcano plot of all genes with their relative gene expression as log2FoldChange (x-axis) and their significance with adjusted p-values (y-axis). 
+ The expression strength criteria (dotted lines) lfc_null = {{ snakemake.config["lfc_null"] }} and the significance threshold (grey line) alpha = {{ snakemake.config["alpha"] }} determine differentially expressed genes.
+ Genes that exceed both are coloured green for overexpressed genes and red for underexpressed genes, other genes are not considered to be differentially expressed between conditions and are coloured grey.
diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst
@@ -0,0 +1,3 @@
+ This workflow performs differential expression analysis of RNA-seq data obtained from Oxford Nanopore long-read sequencing technology. 
+ First a transcriptome FASTA is constructed using `gffread <https://github.com/gpertea/gffread>`_. Reads are then mapped to the transcriptome with the long-read optimized alignment tool `minimap2 <https://github.com/lh3/minimap2>`_. Next quantification is performed using `salmon <https://github.com/COMBINE-lab/salmon>`_ before normalization and differential expression analysis are conducted by `PyDESeq2 <https://github.com/owkin/PyDESeq2>`_.
+ Additionaly, `NanoPlot <https://github.com/wdecoster/NanoPlot>`_ is employed to analyze initial sequencing data and `QualiMap <https://github.com/EagleGenomics-cookbooks/QualiMap>`_ is used to evaluate mapping results.
diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk
@@ -2,14 +2,38 @@ rule de_analysis:
     input:
         all_counts=rules.merge_counts.output,
     output:
-        dispersion_graph="de_analysis/dispersion_graph.svg",
-        ma_graph="de_analysis/ma_graph.svg",
-        de_heatmap="de_analysis/heatmap.svg",
-        correlation_matrix="de_analysis/correlation_matrix.svg",
+        dispersion_graph=report(
+            "de_analysis/dispersion_graph.svg",
+            category="Results",
+            caption="../report/dispersion_graph.rst",
+        ),
+        ma_graph=report(
+            "de_analysis/ma_graph.svg",
+            category="Results",
+            caption="../report/ma_graph.rst",
+        ),
+        de_heatmap=report(
+            "de_analysis/heatmap.svg",
+            category="Results",
+            caption="../report/heatmap.rst",
+        ),
+        correlation_matrix=report(
+            "de_analysis/correlation_matrix.svg",
+            category="Results",
+            caption="../report/correlation_matrix.rst",
+        ),
         normalized_counts="de_analysis/normalized_counts.csv",
-        de_top_heatmap="de_analysis/heatmap_top.svg",
+        de_top_heatmap=report(
+            "de_analysis/heatmap_top.svg",
+            category="Results",
+            caption="../report/heatmap_top.rst",
+        ),
         lfc_analysis="de_analysis/lfc_analysis.csv",
-        volcano_plot="de_analysis/volcano_plot.svg",
+        volcano_plot=report(
+            "de_analysis/volcano_plot.svg",
+            category="Results",
+            caption="../report/volcano_plot.rst",
+        ),
     params:
         samples=samples,
     log:

diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
@@ -17,30 +17,43 @@ rule plot_samples:
             samples["sample"][wildcards.sample]
         ),
     output:
-        directory("NanoPlot/{sample}"),
+        scatter=report(
+            "NanoPlot/{sample}/NanoPlot-report.html",
+            category="Quality control",
+            caption="../report/nanoplot_sample_report.rst",
+        ),
+    params:
+        outdir=lambda wildcards: f"NanoPlot/{wildcards.sample}",
     log:
         "logs/NanoPlot/{sample}.log",
     resources:
         cpus_per_task=min(len({input}), config["max_cpus"]),  #problem with max(len(input.fastq),39)
     conda:
         "../envs/nanoplot.yml"
     shell:
-        "NanoPlot -t {resources.cpus_per_task} --tsv_stats -f svg "
-        "--fastq {input.fastq} -o {output} 2> {log}"
+        "NanoPlot --threads {resources.cpus_per_task} --tsv_stats --format svg "
+        "--fastq {input.fastq} --outdir {params.outdir} 2> {log}"
 
 
 rule plot_all_samples:
     input:
         aggregate_input(samples["sample"]),
     output:
-        directory("NanoPlot/all_samples"),
+        scatter=report(
+            "NanoPlot/all_samples/NanoPlot-report.html",
+            category="Quality control",
+            caption="../report/nanoplot_all_samples_report.rst",
+        ),
+    # This parameter is in line with the Snakemake docs 8.20.3 guideline on how to avoid having parameters as output prefixes
+    params:
+        outdir=lambda wildcards, output: output[0][:-21],
     log:
         "logs/NanoPlot/all_samples.log",
     conda:
         "../envs/nanoplot.yml"
     shell:
-        "NanoPlot -t {resources.cpus_per_task} --tsv_stats -f svg "
-        "--fastq {input} -o {output} 2> {log}"
+        "NanoPlot --threads {resources.cpus_per_task} --tsv_stats --format svg "
+        "--fastq {input} --outdir {params.outdir} 2> {log}"
 
 
 rule compress_nplot: