Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: extend bedfiles for CNV analysis #1469

Merged
merged 12 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions BALSAMIC/assets/scripts/extend_bedfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import click


@click.command()
@click.argument("input_bedfile", type=click.Path(exists=True))
@click.argument("output_bedfile", type=click.Path())
@click.option("--min-region-size", default=20, help="Minimum region size to enforce.")
def extend_bedfile(input_bedfile: str, output_bedfile: str, min_region_size: int):
"""
Process a BED file to ensure regions are at least a minimum size.

Args:
input_bedfile (str): Input BED file path.
output_bedfile (str): Output BED file path.
min_region_size (int): Minimum region size to enforce.
"""
with open(input_bedfile, "r") as infile, open(output_bedfile, "w") as outfile:
for line in infile:
fields = line.strip().split("\t")

chrom: str = fields[0]
start = int(fields[1])
end = int(fields[2])

region_length: int = end - start
if region_length < min_region_size:
center = (start + end) // 2
half_size = min_region_size // 2
start = max(0, center - half_size)
end = center + half_size
if min_region_size % 2 != 0:
end += 1

outfile.write(f"{chrom}\t{start}\t{end}\n")


if __name__ == "__main__":
extend_bedfile()
8 changes: 8 additions & 0 deletions BALSAMIC/constants/cluster_analysis.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
"time": "05:00:00",
"n": 10
},
"extend_short_bedregions": {
"time": "01:00:00",
"n": 1
},
"cnvkit_create_coverage": {
"time": "6:00:00",
"n": 18
Expand Down Expand Up @@ -424,6 +428,10 @@
"time": "01:00:00",
"n": 1
},
"bedtools_merge": {
"time": "01:00:00",
"n": 1
},
"bam_compress": {
"time": "04:00:00",
"n": 20
Expand Down
6 changes: 6 additions & 0 deletions BALSAMIC/constants/cluster_cache.json
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@
"mail_type": "FAIL",
"partition": "core"
},
"expand_short_bedregions": {
"n": 1,
"time": "00:15:00",
"mail_type": "FAIL",
"partition": "core"
},
"fasta_index_reference_genome": {
"n": 24,
"time": "01:00:00",
Expand Down
6 changes: 4 additions & 2 deletions BALSAMIC/constants/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@
"snakemake_rules/umi/sentieon_consensuscall.rule",
],
"varcall": [
"snakemake_rules/variant_calling/cnvkit_preprocess.rule"
"snakemake_rules/variant_calling/extend_bed.rule",
"snakemake_rules/variant_calling/cnvkit_preprocess.rule",
"snakemake_rules/variant_calling/germline_tga.rule",
"snakemake_rules/variant_calling/split_bed.rule",
"snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule",
Expand Down Expand Up @@ -108,7 +109,8 @@
"snakemake_rules/umi/umi_sentieon_alignment.rule",
],
"varcall": [
"snakemake_rules/variant_calling/cnvkit_preprocess.rule"
"snakemake_rules/variant_calling/extend_bed.rule",
"snakemake_rules/variant_calling/cnvkit_preprocess.rule",
"snakemake_rules/variant_calling/germline_tga.rule",
"snakemake_rules/variant_calling/split_bed.rule",
"snakemake_rules/variant_calling/somatic_tumor_normal.rule",
Expand Down
3 changes: 3 additions & 0 deletions BALSAMIC/constants/workflow_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@
"bam_post_processing": {
"manta_max_base_quality": 70,
},
"bed_pre_processing": {
"minimum_region_size": 100,
},
"common": {
"header_per_lane": "'@RG\\tID:{fastq_pattern}\\tSM:{sample_type}\\tPL:ILLUMINAi'",
"header_per_sample": "'@RG\\tID:{sample}\\tSM:{sample_type}\\tPL:ILLUMINAi'",
Expand Down
11 changes: 11 additions & 0 deletions BALSAMIC/models/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,16 @@ class BAMPostProcessingParams(BaseModel):
manta_max_base_quality: int


class BEDPreProcessingParams(BaseModel):
"""This class defines the params settings used as constants in bed pre-processing rules

Attributes:
minimum_region_size: int (required); the minimum region size in input bedfiles for CNV analysis
"""

minimum_region_size: int


class BalsamicWorkflowConfig(BaseModel):
"""Defines set of rules in balsamic workflow

Expand All @@ -216,6 +226,7 @@ class BalsamicWorkflowConfig(BaseModel):
"""

bam_post_processing: BAMPostProcessingParams
bed_pre_processing: BEDPreProcessingParams
common: ParamsCommon
insert_size_metrics: ParamsInsertSizeMetrics
manta: ParamsManta
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
rule create_target:
input:
target_bait = target_bed,
refgene_flat = refgene_flat,
access_bed = access_5kb_hg19,
bed_expanded_merged = Path(cnv_dir + "capture_kit_expanded_merged.bed").as_posix(),
refgene_flat = config_model.reference["refgene_flat"],
access_bed = config_model.reference["access_regions"],
wake_up = result_dir + "start_analysis",
output:
targets = cnv_dir + "targets.bed",
Expand All @@ -15,8 +15,8 @@ rule create_target:
Path(benchmark_dir, "cnvkit.targets.tsv").as_posix()
shell:
"""
cnvkit.py target {input.target_bait} --annotate {input.refgene_flat} --split -o {output.targets};
cnvkit.py antitarget {input.target_bait} -g {input.access_bed} -o {output.antitargets};
cnvkit.py target {input.bed_expanded_merged} --annotate {input.refgene_flat} --split -o {output.targets};
cnvkit.py antitarget {input.bed_expanded_merged} -g {input.access_bed} -o {output.antitargets};
"""

rule create_coverage:
Expand Down
42 changes: 42 additions & 0 deletions BALSAMIC/snakemake_rules/variant_calling/extend_bed.rule
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

rule extend_short_bedregions:
input:
baits_bed = config_model.panel.capture_kit,
wake_up= result_dir + "start_analysis",
output:
baits_bed_expanded=Path(cnv_dir + "capture_kit_expanded.bed").as_posix(),
benchmark:
Path(benchmark_dir,"extend_short_bedregions.tsv").as_posix()
singularity:
Path(singularity_image,config["bioinfo_tools"].get("pysam") + ".sif").as_posix()
params:
bedfile_extend = get_script_path("extend_bedfile.py"),
minimum_region_size = params.bed_pre_processing.minimum_region_size
threads:
get_threads(cluster_config, "extend_short_bedregions")
message:
"Extending regions in bedfiel to a minimum size of {params.minimum_region_size}."
shell:
"""
python {params.bedfile_extend} --min-region-size {params.minimum_region_size} {input.baits_bed} {output.baits_bed_expanded} ;
"""


rule bedtools_sort_and_merge:
input:
bed_expanded = Path(cnv_dir + "capture_kit_expanded.bed").as_posix(),
output:
bed_expanded_merged = Path(cnv_dir + "capture_kit_expanded_merged.bed").as_posix(),
benchmark:
Path(benchmark_dir, 'bedtools_merge_expanded_bedfile.tsv').as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix()
threads:
get_threads(cluster_config, "bedtools_merge")
message:
"Running bedtools sort and merge to merge potentially overlapping regions."
shell:
"""
bedtools sort -i {input.bed_expanded} > {input.bed_expanded}_sorted.bed
bedtools merge {input.bed_expanded}_sorted.bed {output.bed_expanded_merged}
"""
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

rule gatk_collectreadcounts:
input:
bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample),
bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample, specified_suffix="dedup"),
genome_interval = config["reference"]["genome_interval"]
output:
readcounts_hdf5 = cnv_dir + "{sample}.collectreadcounts.hdf5"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ rm -rf {params.tmpdir};
rule cnvkit_segment_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
baits_bed = config["panel"]["capture_kit"],
fasta = config["reference"]["reference_genome"],
refgene_flat = config["reference"]["refgene_flat"],
tumor_target_cnn=expand(cnv_dir + "{sample}.targetcoverage.cnn", sample=tumor_sample),
Expand Down Expand Up @@ -78,21 +77,21 @@ if [[ ! -f "{params.pon}" ]]; then

# Compile a coverage reference from the given list of files
cnvkit.py reference \
{output.normal_target_coverage} \
{output.normal_antitarget_coverage} \
{input.normal_target_cnn} \
{input.normal_antitarget_cnn} \
--fasta {input.fasta} \
--output {params.normal_reference_cnn};

cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.normal_reference_cnn} \
--output {output.cnr};

else

echo "PON reference exists- Using it for coverage correction"
cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.pon} \
--output {output.cnr};

Expand Down Expand Up @@ -203,14 +202,11 @@ rule cnvkit_call_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
fasta = config["reference"]["reference_genome"],
baits_bed = config["panel"]["capture_kit"],
refgene_flat = config["reference"]["refgene_flat"],
purity_ploidy = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv",
cns_initial = cnv_dir + "tumor.initial.cns",
cnr = cnv_dir + "tumor.merged.cnr",
snv_merged = vcf_dir + "SNV.germline.merged.dnascope.vcf.gz",
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample),
output:
cns = cnv_dir + "tumor.merged.cns",
gene_breaks = cnv_dir + config["analysis"]["case_id"] + ".gene_breaks",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
rule cnvkit_segment_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
baits_bed = config["panel"]["capture_kit"],
fasta = config["reference"]["reference_genome"],
refgene_flat = config["reference"]["refgene_flat"],
tumor_target_cnn=expand(cnv_dir + "{sample}.targetcoverage.cnn",sample=tumor_sample),
Expand Down Expand Up @@ -34,19 +33,19 @@ rule cnvkit_segment_CNV_research:
if [[ ! -f "{params.pon}" ]]; then
cnvkit.py reference --output {params.flat_reference_cnn} \
--fasta {input.fasta} \
--targets {output.targets} \
--antitargets {output.antitargets};
--targets {input.tumor_target_cnn} \
--antitargets {input.tumor_antitarget_cnn};

cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.flat_reference_cnn} \
--output {output.cnr};

else

echo "PON reference exists- Using it for coverage correction"
cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.pon} \
--output {output.cnr};

Expand Down Expand Up @@ -153,7 +152,6 @@ rule cnvkit_call_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
fasta = config["reference"]["reference_genome"],
baits_bed = config["panel"]["capture_kit"],
refgene_flat = config["reference"]["refgene_flat"],
purity_ploidy = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv",
cns_initial= cnv_dir + "tumor.initial.cns",
Expand Down
3 changes: 2 additions & 1 deletion BALSAMIC/workflows/PON.smk
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ if sequence_type == SequencingType.TARGETED:
rules_to_include.append("snakemake_rules/quality_control/fastp_tga.rule")
rules_to_include.append("snakemake_rules/align/tga_sentieon_alignment.rule")
rules_to_include.append("snakemake_rules/align/tga_bam_postprocess.rule")
rules_to_include.append("snakemake_rules/variant_calling/extend_bed.rule")
rules_to_include.append("snakemake_rules/variant_calling/cnvkit_preprocess.rule")
else:
rules_to_include.append("snakemake_rules/quality_control/fastp_wgs.rule")
Expand All @@ -91,7 +92,7 @@ if pon_workflow in [PONWorkflow.GENS_MALE, PONWorkflow.GENS_FEMALE]:
rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule")
rules_to_include.append("snakemake_rules/pon/gens_create_pon.rule")

pon_finish = expand(analysis_dir + "analysis_PON_finish")
pon_finish = Path(analysis_dir + "analysis_PON_finish").as_posix()

for r in rules_to_include:
include: Path(BALSAMIC_DIR, r).as_posix()
Expand Down
1 change: 0 additions & 1 deletion BALSAMIC/workflows/balsamic.smk
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ delivery_dir: str = Path(result_dir, "delivery").as_posix() + "/"
umi_dir: str = Path(result_dir, "umi").as_posix() + "/"
umi_qc_dir: str = Path(qc_dir, "umi_qc").as_posix() + "/"


# Annotations
research_annotations = []
clinical_annotations = []
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Added:
* MSI analysis to the tumor-normal workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1454
* Sentieon install directory path to case config arguments https://github.com/Clinical-Genomics/BALSAMIC/pull/1461
* UMI extraction and deduplication to TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1358
* Padding of bed-regions for CNVkit to minimum 100 bases https://github.com/Clinical-Genomics/BALSAMIC/pull/1469
* Added min mapq 20 to CNVkit PON workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1465

Changed:
Expand Down
2 changes: 2 additions & 0 deletions docs/balsamic_pon.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ When creating a new PON reference file, the next steps have to be followed:
/path/analysis/analysis_PON_finish
/path/analysis/cnv/<panel_name>_CNVkit_PON_reference_<version>.cnn

``Note: The bedfile from the bait-set will be padded in the generation of the PON according to the minimum bed region size set in Balsamic as well as during the analysis with CNVkit, this to avoid CNVkit filtering out short regions.``

Using the PON during analysis
-----------------------------

Expand Down
6 changes: 6 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,12 @@ def references_dir(test_data_dir) -> Path:
return Path(test_data_dir, "references")


@pytest.fixture(scope="session")
def bedfile_path(test_data_dir) -> Path:
"""Return bedfile path."""
return Path(test_data_dir, "bedfiles", "test_bedfile.bed")


@pytest.fixture(scope="session")
def purity_csv_path(test_data_dir) -> Path:
"""Return pureCN purity CSV path."""
Expand Down
2 changes: 1 addition & 1 deletion tests/models/test_config_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,5 +418,5 @@ def test_get_final_bam_name_pon(balsamic_pon_model: ConfigModel):
)

# Then retrieved final bam names should match the expected format and be identical regardless of request parameter
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.bam"
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.fixmate.bam"
assert expected_final_bam_name == bam_name_sample_name
Loading
Loading