Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add resort bam #1331

Draft
wants to merge 12 commits into
base: master
Choose a base branch
from
4 changes: 2 additions & 2 deletions BALSAMIC/constants/cluster_analysis.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
},
"picard_markduplicates": {
"time": "06:00:00",
"n": 8
"n": 36
},
"bwa_mem": {
"time": "08:00:00",
Expand Down Expand Up @@ -129,7 +129,7 @@
"n": 8
},
"samtools_sort_index": {
"time": "01:30:00",
"time": "02:30:00",
"n": 16
},
"sentieon_DNAscope": {
Expand Down
15 changes: 14 additions & 1 deletion BALSAMIC/constants/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
"snakemake_rules/quality_control/samtools_qc.rule",
],
"align": [
"snakemake_rules/align/sentieon_alignment.rule",
"snakemake_rules/align/bam_compress.rule",
],
"varcall": [
Expand Down Expand Up @@ -66,7 +65,11 @@
"snakemake_rules/umi/mergetype_tumor_umi.rule",
"snakemake_rules/umi/generate_AF_tables.rule",
],
"concatenation": [
"snakemake_rules/concatenation/concatenation.rule",
],
"align": [
"snakemake_rules/align/bwa_mem_picard.rule",
"snakemake_rules/umi/sentieon_umiextract.rule",
"snakemake_rules/umi/sentieon_consensuscall.rule",
],
Expand Down Expand Up @@ -97,7 +100,11 @@
"snakemake_rules/quality_control/contest.rule",
"snakemake_rules/umi/generate_AF_tables.rule",
],
"concatenation": [
"snakemake_rules/concatenation/concatenation.rule",
],
"align": [
"snakemake_rules/align/bwa_mem_picard.rule",
"snakemake_rules/umi/sentieon_umiextract.rule",
"snakemake_rules/umi/sentieon_consensuscall.rule",
],
Expand All @@ -116,6 +123,9 @@
],
},
"single_wgs": {
"align": [
"snakemake_rules/align/sentieon_alignment.rule"
],
"qc": [
"snakemake_rules/quality_control/fastp_wgs.rule",
"snakemake_rules/quality_control/sentieon_qc_metrics.rule",
Expand All @@ -134,6 +144,9 @@
],
},
"paired_wgs": {
"align": [
"snakemake_rules/align/sentieon_alignment.rule"
],
"qc": [
"snakemake_rules/quality_control/fastp_wgs.rule",
"snakemake_rules/quality_control/sentieon_qc_metrics.rule",
Expand Down
2 changes: 1 addition & 1 deletion BALSAMIC/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ def get_final_bam_name(
final_bam_suffix = "dedup"
elif self.analysis.sequencing_type == SequencingType.TARGETED:
# Only dedup is necessary for TGA
final_bam_suffix = "dedup"
final_bam_suffix = "dedup_sorted"
else:
# For WGS the bamfiles are realigned
final_bam_suffix = "dedup.realign"
Expand Down
106 changes: 106 additions & 0 deletions BALSAMIC/snakemake_rules/align/bwa_mem_picard.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# vim: syntax=python tabstop=4 expandtab
# coding: utf-8

# The following rule takes the trimmed fastq files, aligns them with bwa mem,
# and pipes the output through samtools sort to produce an indexed, coordinate-sorted BAM


rule bwa_mem:
    # Align one sample's trimmed paired-end FASTQs against the reference with
    # bwa mem, pipe straight into samtools sort, and index the resulting BAM.
    # Produces the "{sample}_align_sort.bam" consumed by picard_markduplicates.
    input:
        fa = config["reference"]["reference_genome"],
        fastq_r1=fastq_dir + "{sample}_1.fp.fastq.gz",
        fastq_r2=fastq_dir + "{sample}_2.fp.fastq.gz",
        # Declaring the bwa index companion files as inputs makes Snakemake
        # wait for (and re-run on changes to) the reference index.
        refidx = expand(config["reference"]["reference_genome"] + ".{prefix}", prefix=["amb","ann","bwt","pac","sa"])
    output:
        bam_out=Path(bam_dir,"{sample}_align_sort.bam").as_posix()
    benchmark:
        Path(benchmark_dir, "bwa_mem_{sample}.tsv").as_posix()
    singularity:
        Path(singularity_image, config["bioinfo_tools"].get("bwa") + ".sif").as_posix()
    params:
        # Read-group header string injected via bwa mem -R.
        bam_header = params.common.align_header,
        # NOTE(review): tempfile.mkdtemp runs at workflow-parse time (once per
        # rule evaluation), not at job runtime — confirm this matches the
        # repo-wide convention used by the other rules in this package.
        tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
        sample_id = "{sample}"
    threads:
        get_threads(cluster_config, "bwa_mem")
    message:
        ("Align fastq files with bwa-mem to reference genome and "
        "sort using samtools for sample: {params.sample_id}")
    shell:
        """
        mkdir -p {params.tmpdir};
        export TMPDIR={params.tmpdir};

        bwa mem \
        -t {threads} \
        -R {params.bam_header} \
        -M \
        -v 1 \
        {input.fa} {input.fastq_r1} {input.fastq_r2} \
        | samtools sort -T {params.tmpdir} \
        --threads {threads} \
        --output-fmt BAM \
        -o {output.bam_out} - ;

        samtools index -@ {threads} {output.bam_out};
        rm -rf {params.tmpdir};
        """


rule picard_markduplicates:
    """Produce the final deduplicated, indexed BAM for a TGA sample.

    Pipeline: Picard MarkDuplicates (duplicates flagged, not removed)
    -> FixMateInformation -> split mapped/unmapped reads with samtools
    -> AddOrReplaceReadGroups on the mapped reads -> index final BAM.
    Also emits the Picard duplication metrics file used downstream by QC.
    """
    input:
        Path(bam_dir, "{sample}_align_sort.bam").as_posix()
    output:
        mrkdup = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted.bam").as_posix(),
        picard_stats = Path(qc_dir, "{sample_type}.{sample}.dedup.metrics").as_posix(),
    benchmark:
        Path(benchmark_dir,"picard_{sample_type}_markduplicates_{sample}.tsv").as_posix()
    singularity:
        Path(singularity_image,config["bioinfo_tools"].get("picard") + ".sif").as_posix()
    params:
        # Java heap for MarkDuplicates.
        mem = "16g",
        # Java heap for FixMateInformation / AddOrReplaceReadGroups.
        # Was hardcoded as -Xmx150g inline, bypassing params entirely;
        # NOTE(review): 150g far exceeds the MarkDuplicates heap — confirm the
        # intended value against the cluster memory allocation for this rule.
        fix_mem = "150g",
        # NOTE(review): mkdtemp runs at workflow-parse time — matches the
        # convention of the sibling rules; confirm before changing.
        tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
        sample_id = "{sample}"
    threads:
        get_threads(cluster_config, "picard_markduplicates")
    message:
        "Picard marking duplicates and samtool indexing for sample: {params.sample_id}"
    shell:
        """
        mkdir -p {params.tmpdir};
        export TMPDIR={params.tmpdir};

        # Intermediate markdup BAM lives in tmpdir so the final rm -rf cleans
        # it up; previously it was written next to the output BAM and left behind.
        picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \
        MarkDuplicates \
        INPUT={input} \
        OUTPUT={params.tmpdir}/{wildcards.sample_type}.markdup.bam \
        VALIDATION_STRINGENCY=SILENT \
        MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \
        REMOVE_DUPLICATES=FALSE \
        METRICS_FILE='{output.picard_stats}';

        samtools index {params.tmpdir}/{wildcards.sample_type}.markdup.bam;

        picard -Xmx{params.fix_mem} FixMateInformation -ADD_MATE_CIGAR true -MAX_RECORDS_IN_RAM 10000000 -CREATE_INDEX true -CREATE_MD5_FILE true \
        -TMP_DIR {params.tmpdir} \
        -INPUT {params.tmpdir}/{wildcards.sample_type}.markdup.bam \
        -OUTPUT {params.tmpdir}/{wildcards.sample_type}.fixed.bam;

        # NOTE: the unmapped-read (-f 4) BAM below is produced and indexed but
        # never consumed before tmpdir is deleted — kept for parity with the
        # draft; drop it if no downstream use is planned.
        samtools view --threads {threads} -O BAM -f 4 {params.tmpdir}/{wildcards.sample_type}.fixed.bam \
        -o {params.tmpdir}/{wildcards.sample_type}.fixed.um.bam;

        samtools index {params.tmpdir}/{wildcards.sample_type}.fixed.um.bam;

        # Mapped reads (-F 4) carry on through the rest of the pipeline.
        samtools view --threads {threads} -O BAM -F 4 {params.tmpdir}/{wildcards.sample_type}.fixed.bam \
        -o {params.tmpdir}/{wildcards.sample_type}.fixed.m.bam;

        samtools index {params.tmpdir}/{wildcards.sample_type}.fixed.m.bam;

        picard -Xmx{params.fix_mem} AddOrReplaceReadGroups -RGPU ILLUMINAi -RGID {wildcards.sample_type} -RGSM {wildcards.sample_type} -RGPL ILLUMINAi -RGLB ILLUMINAi -MAX_RECORDS_IN_RAM 1000000 -CREATE_INDEX true -CREATE_MD5_FILE true \
        -TMP_DIR {params.tmpdir} \
        -INPUT {params.tmpdir}/{wildcards.sample_type}.fixed.m.bam \
        -OUTPUT {output.mrkdup};

        samtools index {output.mrkdup};

        rm -rf {params.tmpdir};
        """
52 changes: 37 additions & 15 deletions BALSAMIC/snakemake_rules/align/sentieon_alignment.rule
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,37 @@ shell_bam_files=$(echo {input.bam_files} | sed 's/ / -i /g') ;
--metrics {output.metrics} \
{output.bam};


sed 's/^LIBRARY/\\n## METRICS CLASS\tpicard\.sam\.DuplicationMetrics\\nLIBRARY/' -i {output.metrics}
"""

rule samtools_sort_index:
    """Coordinate-sort the deduplicated BAM and index the result.

    Bridges sentieon Dedup output (.dedup.bam) to the .dedup_sorted.bam
    name expected by sentieon_realign and get_final_bam_name.
    """
    input:
        bam = Path(bam_dir,"{sample_type}.{sample}.dedup.bam").as_posix(),
    output:
        bam = Path(bam_dir,"{sample_type}.{sample}.dedup_sorted.bam").as_posix(),
    benchmark:
        Path(benchmark_dir,"samtools_sort_index_{sample_type}_{sample}.tsv").as_posix()
    singularity:
        Path(singularity_image,config["bioinfo_tools"].get("samtools") + ".sif").as_posix()
    params:
        sample_id="{sample}"
    threads:
        # Fixed cluster key: was "samtools_qc", so the dedicated
        # "samtools_sort_index" entry (02:30:00 walltime, n=16) added to
        # cluster_analysis.json in this change was never picked up.
        get_threads(cluster_config,"samtools_sort_index")
    message:
        # Fixed copy-pasted message ("Calculating alignment stats" came from
        # the samtools_qc rule and misdescribed this step).
        "Sorting and indexing deduplicated BAM for sample: {params.sample_id}"
    shell:
        """
        samtools sort --threads {threads} -o {output.bam} {input.bam};
        samtools index -@ {threads} {output.bam};
        """


rule sentieon_realign:
input:
ref = config["reference"]["reference_genome"],
mills = config["reference"]["mills_1kg"],
bam = Path(bam_dir, "{sample_type}.{sample}.dedup.bam").as_posix(),
bam = Path(bam_dir, "{sample_type}.{sample}.dedup_sorted.bam").as_posix(),
indel_1kg = config["reference"]["known_indel_1kg"]
output:
bam = Path(bam_dir, "{sample_type}.{sample}.dedup.realign.bam").as_posix()
Expand All @@ -110,18 +132,18 @@ rule sentieon_realign:
"INDEL realignment using sentieon realigner for sample: {params.sample_id}"
shell:
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export SENTIEON_TMPDIR={params.tmpdir};
export SENTIEON_LICENSE={params.sentieon_lic};

{params.sentieon_exec} driver \
-r {input.ref} \
-t {threads} \
-i {input.bam} \
--algo Realigner \
-k {input.mills} \
-k {input.indel_1kg} \
{output};
"""
mkdir -p {params.tmpdir};
export TMPDIR={params.tmpdir};
export SENTIEON_TMPDIR={params.tmpdir};
export SENTIEON_LICENSE={params.sentieon_lic};
{params.sentieon_exec} driver \
-r {input.ref} \
-t {threads} \
-i {input.bam} \
--algo Realigner \
-k {input.mills} \
-k {input.indel_1kg} \
{output};
"""

17 changes: 9 additions & 8 deletions BALSAMIC/snakemake_rules/concatenation/concatenation.rule
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
rule concatenate:
"""Merge fastq files per lane into a single forward and reverse fastq."""
input:
fwd_fastqs = lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name = wildcards.sample, fastq_types = [FastqName.FWD]),
rev_fastqs = lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name = wildcards.sample, fastq_types = [FastqName.REV])
fastqs_fwd=lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name=wildcards.sample, fastq_types=[FastqName.FWD]
),
fastqs_rev=lambda wildcards: config_model.get_all_fastqs_for_sample(
sample_name=wildcards.sample, fastq_types=[FastqName.REV]
),
output:
concat_fwd = fastq_dir + "{sample}_concat_R_1.fp.fastq.gz",
concat_rev = fastq_dir + "{sample}_concat_R_2.fp.fastq.gz"
Expand All @@ -16,13 +18,12 @@ rule concatenate:
params:
fastq_dir = fastq_dir,
sample = "{sample}",
read = "{read}"
threads:
get_threads(cluster_config, "concatenate")
message:
"Sample {params.sample} and read {params.read} FASTQ concatenation"
"Sample {params.sample} FASTQ concatenation"
shell:
"""
cat {input.fwd_fastqs} > {output.concat_fwd}
cat {input.rev_fastqs} > {output.concat_rev}
cat {input.fastqs_fwd} > {output.concat_fwd}
cat {input.fastqs_rev} > {output.concat_rev}
"""
40 changes: 20 additions & 20 deletions BALSAMIC/snakemake_rules/quality_control/fastp_tga.rule
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,33 @@
rule fastp_umi_trim:
"""Fastq TGA data pre-processing to remove UMIs."""
input:
fastq_r1 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.FWD),
fastq_r2 = lambda wildcards: config_model.get_fastq_by_fastq_pattern(wildcards.fastq_pattern, FastqName.REV)
concat_fwd=fastq_dir + "{sample}_concat_R_1.fp.fastq.gz",
concat_rev=fastq_dir + "{sample}_concat_R_2.fp.fastq.gz"
output:
fastq_r1 = temp(fastq_dir + "{fastq_pattern}_1.umi_removed.fastq.gz"),
fastq_r2 = temp(fastq_dir + "{fastq_pattern}_2.umi_removed.fastq.gz"),
json = qc_dir + "fastp/{fastq_pattern}_umi_removed_fastp.json",
html = qc_dir + "fastp/{fastq_pattern}_umi_removed_fastp.html",
fastq_r1 = temp(fastq_dir + "{sample}_1.umi_removed.fastq.gz"),
fastq_r2 = temp(fastq_dir + "{sample}_2.umi_removed.fastq.gz"),
json = qc_dir + "fastp/{sample}_umi_removed_fastp.json",
html = qc_dir + "fastp/{sample}_umi_removed_fastp.html",
benchmark:
Path(benchmark_dir, "fastp_remove_umi" + "{fastq_pattern}.tsv").as_posix()
Path(benchmark_dir, "fastp_remove_umi" + "{sample}.tsv").as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix()
params:
tmpdir = tmp_dir,
fastp_trim_umi = " ".join(fastp_parameters["fastp_trim_umi"]),
fastq_pattern = "{fastq_pattern}",
sample = "{sample}",
threads:
get_threads(cluster_config, 'fastp_remove_umi')
message:
"Trimming away UMI sequences for fastqs in fastq pattern: {params.fastq_pattern}"
"Trimming away UMI sequences for fastqs in sample: {params.sample}"
shell:
"""
export TMPDIR={params.tmpdir};

fastp \
--thread {threads} \
--in1 {input.fastq_r1} \
--in2 {input.fastq_r2} \
--in1 {input.concat_fwd} \
--in2 {input.concat_rev} \
--out1 {output.fastq_r1} \
--out2 {output.fastq_r2} \
--json {output.json} \
Expand All @@ -45,26 +45,26 @@ fastp \
rule fastp_quality_trim_tga:
"""Fastq data pre-processing after removal of UMIs."""
input:
fastq_r1 = fastq_dir + "{fastq_pattern}_1.umi_removed.fastq.gz",
fastq_r2 = fastq_dir + "{fastq_pattern}_2.umi_removed.fastq.gz"
fastq_r1 = fastq_dir + "{sample}_1.umi_removed.fastq.gz",
fastq_r2 = fastq_dir + "{sample}_2.umi_removed.fastq.gz"
output:
fastq_r1 = temp(fastq_dir + "{fastq_pattern}_1.fp.fastq.gz"),
fastq_r2 = temp(fastq_dir + "{fastq_pattern}_2.fp.fastq.gz"),
json = qc_dir + "fastp/{fastq_pattern}_fastp.json",
html = qc_dir + "fastp/{fastq_pattern}_fastp.html"
fastq_r1 = temp(fastq_dir + "{sample}_1.fp.fastq.gz"),
fastq_r2 = temp(fastq_dir + "{sample}_2.fp.fastq.gz"),
json = qc_dir + "fastp/{sample}_fastp.json",
html = qc_dir + "fastp/{sample}_fastp.html"
benchmark:
Path(benchmark_dir, "fastp_quality_trim" + "{fastq_pattern}.tsv").as_posix()
Path(benchmark_dir, "fastp_quality_trim" + "{sample}.tsv").as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("fastp") + ".sif").as_posix()
params:
tmpdir = tmp_dir,
quality_trim = " ".join(fastp_parameters["fastp_trim_qual"]),
adapter_trim = " ".join(fastp_parameters["fastp_trim_adapter"]),
fastq_pattern = "{fastq_pattern}"
sample = "{sample}"
threads:
get_threads(cluster_config, 'fastp_quality_trim')
message:
"Quality and adapter trimming for fastqs for fastq pattern: {params.fastq_pattern}"
"Quality and adapter trimming for fastqs for fastq pattern: {params.sample}"
shell:
"""
export TMPDIR={params.tmpdir};
Expand Down
7 changes: 3 additions & 4 deletions BALSAMIC/snakemake_rules/quality_control/multiqc.rule
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,9 @@ if config["analysis"]["sequencing_type"] == 'wgs':

else:
# fastp metrics
multiqc_input.extend(expand(qc_dir + "fastp/{fastq_pattern}_fastp.json",
fastq_pattern=config_model.get_fastq_patterns_by_sample(sample_names = sample_names)))
multiqc_input.extend(expand(qc_dir + "fastp/{fastq_pattern}_fastp.html",
fastq_pattern=config_model.get_fastq_patterns_by_sample(sample_names = sample_names)))
multiqc_input.extend(expand(qc_dir + "fastp/{sample}_fastp.json", sample=sample_names))
multiqc_input.extend(expand(qc_dir + "fastp/{sample}_fastp.html", sample=sample_names))


# picard metrics
multiqc_input.extend(expand(bam_dir + "{sample}.dedup.insertsizemetric.txt", sample=tumor_sample))
Expand Down
2 changes: 1 addition & 1 deletion tests/models/test_config_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ def test_get_final_bam_name(balsamic_model: ConfigModel):
)

# Then retrieved final bam names should match the expected format and be identical regardless of request parameter
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.bam"
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup_sorted.bam"
assert expected_final_bam_name == bam_name_sample_name
assert bam_name_sample_name == bam_name_sample_type

Expand Down
Loading