diff --git a/.gitignore b/.gitignore index ddffda13e..0dc625f86 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,4 @@ BALSAMIC_reference/ BALSAMIC/workflows/umi_workflow BALSAMIC/workflows/umi_workflow/.* .idea/ +.vscode/ diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index 0c4444bbc..67a257ee3 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -1,4 +1,5 @@ """Balsamic config case CLI.""" + import logging from datetime import datetime from pathlib import Path @@ -36,6 +37,7 @@ OPTION_SWEGEN_SV, OPTION_TUMOR_SAMPLE_NAME, OPTION_UMI, + OPTION_UMI_MIN_READS, OPTION_UMI_TRIM_LENGTH, ) from BALSAMIC.constants.analysis import BIOINFO_TOOL_ENV, AnalysisWorkflow, Gender @@ -87,6 +89,7 @@ @OPTION_TUMOR_SAMPLE_NAME @OPTION_UMI @OPTION_UMI_TRIM_LENGTH +@OPTION_UMI_MIN_READS @click.pass_context def case_config( context: click.Context, @@ -119,6 +122,7 @@ def case_config( tumor_sample_name: str, umi: bool, umi_trim_length: int, + umi_min_reads: str | None, ): references_path: Path = Path(balsamic_cache, cache_version, genome_version) references: Dict[str, Path] = get_absolute_paths_dict( @@ -205,6 +209,7 @@ def case_config( "analysis_workflow": analysis_workflow, "config_creation_date": datetime.now().strftime("%Y-%m-%d %H:%M"), }, + custom_filters={"umi_min_reads": umi_min_reads if umi_min_reads else None}, reference=references, singularity={ "image": Path(balsamic_cache, cache_version, "containers").as_posix() @@ -221,14 +226,16 @@ def case_config( bioinfo_tools=BIOINFO_TOOL_ENV, container_conda_env_path=CONTAINERS_DIR, ), - panel={ - "exome": exome, - "capture_kit": panel_bed, - "chrom": get_panel_chrom(panel_bed), - "pon_cnn": pon_cnn, - } - if panel_bed - else None, + panel=( + { + "exome": exome, + "capture_kit": panel_bed, + "chrom": get_panel_chrom(panel_bed), + "pon_cnn": pon_cnn, + } + if panel_bed + else None + ), ).model_dump(by_alias=True, exclude_none=True) LOG.info("Balsamic config model 
instantiated successfully") diff --git a/BALSAMIC/commands/options.py b/BALSAMIC/commands/options.py index 41a4bdec1..3f2f86eb2 100644 --- a/BALSAMIC/commands/options.py +++ b/BALSAMIC/commands/options.py @@ -1,4 +1,5 @@ """Balsamic command options.""" + import click from BALSAMIC import __version__ as balsamic_version @@ -22,7 +23,11 @@ from BALSAMIC.constants.constants import LOG_LEVELS, LogLevel from BALSAMIC.constants.rules import DELIVERY_RULES from BALSAMIC.constants.workflow_params import VCF_DICT -from BALSAMIC.utils.cli import validate_cache_version, validate_exome_option +from BALSAMIC.utils.cli import ( + validate_cache_version, + validate_exome_option, + validate_umi_min_reads, +) OPTION_ADAPTER_TRIM = click.option( "--adapter-trim/--no-adapter-trim", @@ -423,3 +428,10 @@ type=click.INT, help="Trim N bases from reads in FASTQ file", ) + +OPTION_UMI_MIN_READS = click.option( + "--umi-min-reads", + type=click.STRING, + callback=validate_umi_min_reads, + help="Minimum raw reads supporting each UMI group. Format: 'x,y,z'.", +) diff --git a/BALSAMIC/models/config.py b/BALSAMIC/models/config.py index 1c17c8ec3..ba413cac9 100644 --- a/BALSAMIC/models/config.py +++ b/BALSAMIC/models/config.py @@ -1,4 +1,5 @@ """Balsamic analysis config case models.""" + import re from glob import glob from pathlib import Path @@ -171,6 +172,12 @@ def validate_pon_version(cls, pon_version: Optional[str]): return pon_version +class CustomFilters(BaseModel): + """Variant calling custom filters.""" + + umi_min_reads: str | None = None + + class ConfigModel(BaseModel): """ Class providing common functions and variables for different balsamic workflows. 
@@ -186,6 +193,7 @@ class ConfigModel(BaseModel): vcf : Field(VCFmodel); variables relevant for variant calling pipeline background_variants: Field(Path(optional)); path to BACKGROUND VARIANTS for UMI analysis: Field(AnalysisModel); Pydantic model containing workflow variables + custom_filters: Field(CustomFilters); custom parameters for variant filtering This class also contains functions that help retrieve sample and file information, facilitating BALSAMIC run operations in Snakemake. @@ -211,6 +219,7 @@ class ConfigModel(BaseModel): vcf: Optional[VCFModel] = None background_variants: Optional[str] = None analysis: AnalysisModel + custom_filters: CustomFilters | None = None @field_validator("reference") def abspath_as_str(cls, reference: Dict[str, Path]): diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 6b66c70e8..72c202539 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -296,12 +296,16 @@ def get_fastq_info(sample_name: str, fastq_path: str) -> Dict[str, FastqInfoMode "fwd": fwd_fastq, "rev": rev_fastq, } - fastq_dict[fastq_pair_pattern].update( - { - "fwd_resolved": Path(fwd_fastq).resolve().as_posix(), - "rev_resolved": Path(rev_fastq).resolve().as_posix(), - } - ) if Path(fwd_fastq).is_symlink() or Path(rev_fastq).is_symlink() else None + ( + fastq_dict[fastq_pair_pattern].update( + { + "fwd_resolved": Path(fwd_fastq).resolve().as_posix(), + "rev_resolved": Path(rev_fastq).resolve().as_posix(), + } + ) + if Path(fwd_fastq).is_symlink() or Path(rev_fastq).is_symlink() + else None + ) if not fastq_dict: error_message = f"No fastqs found for: {sample_name} in {fastq_path}" @@ -511,3 +515,16 @@ def validate_cache_version( raise click.BadParameter( f"Invalid cache version format. Use '{CacheVersion.DEVELOP}' or 'X.X.X'." 
) + + +def validate_umi_min_reads( + _ctx: click.Context, _param: click.Parameter, umi_min_reads: str | None +) -> str: + """Validate the provided UMI minimum reads option.""" + if umi_min_reads: + umi_min_reads_parts: List[str] = umi_min_reads.split(",") + if len(umi_min_reads_parts) == 3 and all( + part.isdigit() for part in umi_min_reads_parts + ): + return umi_min_reads + raise click.BadParameter("Invalid UMI minimum reads format. Use 'x,y,z'.") diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 49a5f34e0..8219ba128 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -10,7 +10,11 @@ from typing import Dict, List from BALSAMIC.constants.constants import FileType from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType -from BALSAMIC.constants.paths import BALSAMIC_DIR, SENTIEON_DNASCOPE_DIR, SENTIEON_TNSCOPE_DIR +from BALSAMIC.constants.paths import ( + BALSAMIC_DIR, + SENTIEON_DNASCOPE_DIR, + SENTIEON_TNSCOPE_DIR, +) from BALSAMIC.constants.rules import SNAKEMAKE_RULES from BALSAMIC.constants.variant_filters import ( COMMON_SETTINGS, @@ -21,7 +25,11 @@ from BALSAMIC.constants.variant_filters import ( VARDICT_SETTINGS_COMMON, MANTA_FILTER_SETTINGS, ) -from BALSAMIC.constants.workflow_params import VARCALL_PARAMS, WORKFLOW_PARAMS, SLEEP_BEFORE_START +from BALSAMIC.constants.workflow_params import ( + VARCALL_PARAMS, + WORKFLOW_PARAMS, + SLEEP_BEFORE_START, +) from BALSAMIC.models.config import ConfigModel from BALSAMIC.models.params import BalsamicWorkflowConfig, VarCallerFilter from BALSAMIC.utils.cli import check_executable, generate_h5 @@ -101,7 +109,7 @@ if config["analysis"]["sequencing_type"] != "wgs": pon_cnn: str = get_pon_cnn(config) # Run information -singularity_image: str = config_model.singularity['image'] +singularity_image: str = config_model.singularity["image"] sample_names: List[str] = config_model.get_all_sample_names() tumor_sample: str = 
config_model.get_sample_name_by_type(SampleType.TUMOR) sequencing_type = config_model.analysis.sequencing_type @@ -110,7 +118,9 @@ if config_model.analysis.analysis_type == "paired": # Sample status to sampleID namemap if config_model.analysis.analysis_type == "paired": - status_to_sample_id = "TUMOR" + "\\\\t" + tumor_sample + "\\\\n" + "NORMAL" + "\\\\t" + normal_sample + status_to_sample_id = ( + "TUMOR" + "\\\\t" + tumor_sample + "\\\\n" + "NORMAL" + "\\\\t" + normal_sample + ) else: status_to_sample_id = "TUMOR" + "\\\\t" + tumor_sample @@ -134,82 +144,134 @@ fastp_parameters: Dict = get_fastp_parameters(config_model) # parse parameters as constants to workflows params = BalsamicWorkflowConfig.model_validate(WORKFLOW_PARAMS) +# Custom filter to set the minimum number of reads required to support each UMI tag group +if config_model.custom_filters and config_model.custom_filters.umi_min_reads: + params.umiconsensuscall.filter_minreads = config_model.custom_filters.umi_min_reads + # vcfanno annotations -research_annotations.append( { - 'annotation': [{ - 'file': Path(config["reference"]["gnomad_variant"]).as_posix(), - 'fields': ["AF", "AF_popmax"], - 'ops': ["self", "self"], - 'names': ["GNOMADAF", "GNOMADAF_popmax"] - }] -} +research_annotations.append( + { + "annotation": [ + { + "file": Path(config["reference"]["gnomad_variant"]).as_posix(), + "fields": ["AF", "AF_popmax"], + "ops": ["self", "self"], + "names": ["GNOMADAF", "GNOMADAF_popmax"], + } + ] + } ) -research_annotations.append( { - 'annotation': [{ - 'file': Path(config["reference"]["clinvar"]).as_posix(), - 'fields': ["CLNACC", "CLNREVSTAT", "CLNSIG", "ORIGIN", "CLNVC", "CLNVCSO"], - 'ops': ["self", "self", "self", "self", "self", "self"], - 'names': ["CLNACC", "CLNREVSTAT", "CLNSIG", "ORIGIN", "CLNVC", "CLNVCSO"] - }] -} +research_annotations.append( + { + "annotation": [ + { + "file": Path(config["reference"]["clinvar"]).as_posix(), + "fields": [ + "CLNACC", + "CLNREVSTAT", + "CLNSIG", + "ORIGIN", 
+ "CLNVC", + "CLNVCSO", + ], + "ops": ["self", "self", "self", "self", "self", "self"], + "names": [ + "CLNACC", + "CLNREVSTAT", + "CLNSIG", + "ORIGIN", + "CLNVC", + "CLNVCSO", + ], + } + ] + } ) -research_annotations.append( { - 'annotation': [{ - 'file': Path(config["reference"]["cadd_snv"]).as_posix(), - 'names': ["CADD"], - 'ops': ["mean"], - 'columns': [6] - }] -} +research_annotations.append( + { + "annotation": [ + { + "file": Path(config["reference"]["cadd_snv"]).as_posix(), + "names": ["CADD"], + "ops": ["mean"], + "columns": [6], + } + ] + } ) if "swegen_snv_frequency" in config["reference"]: - research_annotations.append( { - 'annotation': [{ - 'file': get_swegen_snv(config), - 'fields': ["AF", "AC_Hom", "AC_Het", "AC_Hemi"], - 'ops': ["self", "self", "self","self"], - 'names': ["SWEGENAF", "SWEGENAAC_Hom", "SWEGENAAC_Het", "SWEGENAAC_Hemi"] - }] - } + research_annotations.append( + { + "annotation": [ + { + "file": get_swegen_snv(config), + "fields": ["AF", "AC_Hom", "AC_Het", "AC_Hemi"], + "ops": ["self", "self", "self", "self"], + "names": [ + "SWEGENAF", + "SWEGENAAC_Hom", + "SWEGENAAC_Het", + "SWEGENAAC_Hemi", + ], + } + ] + } ) if "clinical_snv_observations" in config["reference"]: - clinical_annotations.append( { - 'annotation': [{ - 'file': get_clinical_snv_observations(config), - 'fields': ["Frq", "Obs", "Hom"], - 'ops': ["self", "self", "self"], - 'names': ["Frq", "Obs", "Hom"] - }] - } + clinical_annotations.append( + { + "annotation": [ + { + "file": get_clinical_snv_observations(config), + "fields": ["Frq", "Obs", "Hom"], + "ops": ["self", "self", "self"], + "names": ["Frq", "Obs", "Hom"], + } + ] + } ) clinical_snv_obs: str = get_clinical_snv_observations(config) if "cancer_germline_snv_observations" in config["reference"]: - clinical_annotations.append( { - 'annotation': [{ - 'file': get_cancer_germline_snv_observations(config), - 'fields': ["Frq", "Obs", "Hom"], - 'ops': ["self", "self", "self"], - 'names': ["Cancer_Germline_Frq", 
"Cancer_Germline_Obs", "Cancer_Germline_Hom"] - }] - } + clinical_annotations.append( + { + "annotation": [ + { + "file": get_cancer_germline_snv_observations(config), + "fields": ["Frq", "Obs", "Hom"], + "ops": ["self", "self", "self"], + "names": [ + "Cancer_Germline_Frq", + "Cancer_Germline_Obs", + "Cancer_Germline_Hom", + ], + } + ] + } ) cancer_germline_snv_obs: str = get_cancer_germline_snv_observations(config) if "cancer_somatic_snv_observations" in config["reference"]: - clinical_annotations.append( { - 'annotation': [{ - 'file': get_cancer_somatic_snv_observations(config), - 'fields': ["Frq", "Obs", "Hom"], - 'ops': ["self", "self", "self"], - 'names': ["Cancer_Somatic_Frq", "Cancer_Somatic_Obs", "Cancer_Somatic_Hom"] - }] - } + clinical_annotations.append( + { + "annotation": [ + { + "file": get_cancer_somatic_snv_observations(config), + "fields": ["Frq", "Obs", "Hom"], + "ops": ["self", "self", "self"], + "names": [ + "Cancer_Somatic_Frq", + "Cancer_Somatic_Obs", + "Cancer_Somatic_Hom", + ], + } + ] + } ) cancer_somatic_snv_obs: str = get_cancer_somatic_snv_observations(config) @@ -239,36 +301,46 @@ try: if os.getenv("SENTIEON_EXEC") is not None: config["SENTIEON_EXEC"] = os.environ["SENTIEON_EXEC"] else: - config["SENTIEON_EXEC"] = Path(os.environ["SENTIEON_INSTALL_DIR"], "bin", "sentieon").as_posix() + config["SENTIEON_EXEC"] = Path( + os.environ["SENTIEON_INSTALL_DIR"], "bin", "sentieon" + ).as_posix() config["SENTIEON_TNSCOPE"] = SENTIEON_TNSCOPE_DIR.as_posix() config["SENTIEON_DNASCOPE"] = SENTIEON_DNASCOPE_DIR.as_posix() except KeyError as error: - LOG.error("Set environment variables SENTIEON_LICENSE, SENTIEON_INSTALL_DIR, SENTIEON_EXEC " - "to run SENTIEON variant callers") + LOG.error( + "Set environment variables SENTIEON_LICENSE, SENTIEON_INSTALL_DIR, SENTIEON_EXEC " + "to run SENTIEON variant callers" + ) raise BalsamicError if not Path(config["SENTIEON_EXEC"]).exists(): - LOG.error("Sentieon executable not found 
{}".format(Path(config["SENTIEON_EXEC"]).as_posix())) + LOG.error( + "Sentieon executable not found {}".format( + Path(config["SENTIEON_EXEC"]).as_posix() + ) + ) raise BalsamicError if "hg38" in config["reference"]["reference_genome"]: config["reference"]["genome_version"] = "hg38" elif "canfam3" in config["reference"]["reference_genome"]: config["reference"]["genome_version"] = "canfam3" - LOG.error("The main BALSAMIC workflow is not compatible with the canfam3 genome version " - "use '--analysis-workflow balsamic-qc' instead") + LOG.error( + "The main BALSAMIC workflow is not compatible with the canfam3 genome version " + "use '--analysis-workflow balsamic-qc' instead" + ) raise BalsamicError else: config["reference"]["genome_version"] = "hg19" -LOG.info('Genome version set to %s', config["reference"]["genome_version"]) +LOG.info("Genome version set to %s", config["reference"]["genome_version"]) # Add normal sample if analysis is paired germline_call_samples = ["tumor"] -if config['analysis']['analysis_type'] == "paired": +if config["analysis"]["analysis_type"] == "paired": germline_call_samples.append("normal") # Create list of chromosomes in panel for panel only variant calling to be used in rules @@ -281,25 +353,42 @@ if "background_variants" in config: # Set temporary dir environment variable os.environ["SENTIEON_TMPDIR"] = result_dir -os.environ['TMPDIR'] = get_result_dir(config) +os.environ["TMPDIR"] = get_result_dir(config) # CNV report input files cnv_report_paths = [] if config["analysis"]["sequencing_type"] == "wgs": - if config['analysis']['analysis_type'] == "paired": - cnv_report_paths.append(f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.ascat.samplestatistics.txt.pdf") - cnv_report_paths.extend(expand( - f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.ascat.{{output_suffix}}.png.pdf", - output_suffix=["ascatprofile", "rawprofile", "ASPCF", "tumor", "germline", "sunrise"] - )) + if config["analysis"]["analysis_type"] == "paired": + 
cnv_report_paths.append( + f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.ascat.samplestatistics.txt.pdf" + ) + cnv_report_paths.extend( + expand( + f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.ascat.{{output_suffix}}.png.pdf", + output_suffix=[ + "ascatprofile", + "rawprofile", + "ASPCF", + "tumor", + "germline", + "sunrise", + ], + ) + ) else: - cnv_report_paths.extend(expand( - f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.cnvpytor.{{output_suffix}}.png.pdf", - output_suffix=["circular", "scatter"] - )) + cnv_report_paths.extend( + expand( + f"{vcf_dir}CNV.somatic.{config['analysis']['case_id']}.cnvpytor.{{output_suffix}}.png.pdf", + output_suffix=["circular", "scatter"], + ) + ) else: - cnv_report_paths.extend(expand(f"{cnv_dir}tumor.merged-{{plot}}.pdf",plot=["diagram", "scatter"])) - cnv_report_paths.append(f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.purecn.purity.csv.pdf") + cnv_report_paths.extend( + expand(f"{cnv_dir}tumor.merged-{{plot}}.pdf", plot=["diagram", "scatter"]) + ) + cnv_report_paths.append( + f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.purecn.purity.csv.pdf" + ) # Extract variant callers for the workflow germline_caller = [] @@ -307,58 +396,78 @@ somatic_caller = [] somatic_caller_cnv = [] somatic_caller_sv = [] for m in set(MutationType): - germline_caller_balsamic = get_variant_callers(config=config, - analysis_type=config['analysis']['analysis_type'], - workflow_solution="BALSAMIC", - mutation_type=m, - sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="germline") - - germline_caller_sentieon = get_variant_callers(config=config, - analysis_type=config['analysis']['analysis_type'], - workflow_solution="Sentieon", - mutation_type=m, - sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="germline") - - germline_caller = germline_caller + germline_caller_balsamic + germline_caller_sentieon - - - somatic_caller_balsamic = get_variant_callers(config=config, 
- analysis_type=config['analysis']['analysis_type'], - workflow_solution="BALSAMIC", - mutation_type=m, - sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic") - - somatic_caller_sentieon = get_variant_callers(config=config, - analysis_type=config['analysis']['analysis_type'], - workflow_solution="Sentieon", - mutation_type=m, - sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic") - - somatic_caller_sentieon_umi = get_variant_callers(config=config, - analysis_type=config['analysis']['analysis_type'], - workflow_solution="Sentieon_umi", - mutation_type=m, - sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic") - somatic_caller = somatic_caller + somatic_caller_sentieon_umi + somatic_caller_balsamic + somatic_caller_sentieon - -somatic_caller_sv = get_variant_callers(config=config, - analysis_type=config['analysis']['analysis_type'], - workflow_solution="BALSAMIC", - mutation_type="SV", - sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic") - -somatic_caller_cnv = get_variant_callers(config=config, - analysis_type=config['analysis']['analysis_type'], - workflow_solution="BALSAMIC", - mutation_type="CNV", - sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic") + germline_caller_balsamic = get_variant_callers( + config=config, + analysis_type=config["analysis"]["analysis_type"], + workflow_solution="BALSAMIC", + mutation_type=m, + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="germline", + ) + + germline_caller_sentieon = get_variant_callers( + config=config, + analysis_type=config["analysis"]["analysis_type"], + workflow_solution="Sentieon", + mutation_type=m, + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="germline", + ) + + germline_caller = ( + germline_caller + germline_caller_balsamic + germline_caller_sentieon + ) + + somatic_caller_balsamic = get_variant_callers( + 
config=config, + analysis_type=config["analysis"]["analysis_type"], + workflow_solution="BALSAMIC", + mutation_type=m, + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="somatic", + ) + + somatic_caller_sentieon = get_variant_callers( + config=config, + analysis_type=config["analysis"]["analysis_type"], + workflow_solution="Sentieon", + mutation_type=m, + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="somatic", + ) + + somatic_caller_sentieon_umi = get_variant_callers( + config=config, + analysis_type=config["analysis"]["analysis_type"], + workflow_solution="Sentieon_umi", + mutation_type=m, + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="somatic", + ) + somatic_caller = ( + somatic_caller + + somatic_caller_sentieon_umi + + somatic_caller_balsamic + + somatic_caller_sentieon + ) + +somatic_caller_sv = get_variant_callers( + config=config, + analysis_type=config["analysis"]["analysis_type"], + workflow_solution="BALSAMIC", + mutation_type="SV", + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="somatic", +) + +somatic_caller_cnv = get_variant_callers( + config=config, + analysis_type=config["analysis"]["analysis_type"], + workflow_solution="BALSAMIC", + mutation_type="CNV", + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="somatic", +) somatic_caller_sv.remove("svdb") svdb_callers_prio = somatic_caller_sv + somatic_caller_cnv @@ -368,14 +477,16 @@ for var_caller in svdb_callers_prio: # Collect only snv callers for calculating tmb somatic_caller_tmb = [] -for ws in ["BALSAMIC","Sentieon","Sentieon_umi"]: - somatic_caller_snv = get_variant_callers(config=config, - analysis_type=config['analysis']['analysis_type'], - workflow_solution=ws, - mutation_type="SNV", - sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic") - somatic_caller_tmb += somatic_caller_snv +for ws in ["BALSAMIC", "Sentieon", "Sentieon_umi"]: + 
somatic_caller_snv = get_variant_callers( + config=config, + analysis_type=config["analysis"]["analysis_type"], + workflow_solution=ws, + mutation_type="SNV", + sequencing_type=config["analysis"]["sequencing_type"], + mutation_class="somatic", + ) + somatic_caller_tmb += somatic_caller_snv # Remove variant callers from list of callers @@ -388,18 +499,22 @@ if "disable_variant_caller" in config: germline_caller.remove(var_caller) rules_to_include = [] -analysis_type = config['analysis']["analysis_type"] -sequence_type = config['analysis']["sequencing_type"] +analysis_type = config["analysis"]["analysis_type"] +sequence_type = config["analysis"]["sequencing_type"] -for sub,value in SNAKEMAKE_RULES.items(): - if sub in ["common", analysis_type + "_" + sequence_type]: - for module_name,module_rules in value.items(): - rules_to_include.extend(module_rules) +for sub, value in SNAKEMAKE_RULES.items(): + if sub in ["common", analysis_type + "_" + sequence_type]: + for module_name, module_rules in value.items(): + rules_to_include.extend(module_rules) if config["analysis"]["analysis_workflow"] == "balsamic": rules_to_include = [rule for rule in rules_to_include if "umi" not in rule] - somatic_caller = [var_caller for var_caller in somatic_caller if "umi" not in var_caller] - somatic_caller_tmb = [var_caller for var_caller in somatic_caller_tmb if "umi" not in var_caller] + somatic_caller = [ + var_caller for var_caller in somatic_caller if "umi" not in var_caller + ] + somatic_caller_tmb = [ + var_caller for var_caller in somatic_caller_tmb if "umi" not in var_caller + ] # Add rule for DRAGEN if "dragen" in config: @@ -411,18 +526,24 @@ if "gens_coverage_pon" in config["reference"]: rules_to_include.append("snakemake_rules/variant_calling/gens_preprocessing.rule") LOG.info(f"The following rules will be included in the workflow: {rules_to_include}") -LOG.info(f"The following Germline variant callers will be included in the workflow: {germline_caller}") -LOG.info(f"The 
following somatic variant callers will be included in the workflow: {somatic_caller}") +LOG.info( + f"The following Germline variant callers will be included in the workflow: {germline_caller}" +) +LOG.info( + f"The following somatic variant callers will be included in the workflow: {somatic_caller}" +) for r in rules_to_include: + include: Path(BALSAMIC_DIR, r).as_posix() + # Define common and analysis specific outputs quality_control_results = [ Path(qc_dir, case_id + "_metrics_deliverables.yaml").as_posix(), Path(qc_dir, "multiqc_report.html").as_posix(), - Path(qc_dir, "multiqc_data/multiqc_data.json").as_posix() + Path(qc_dir, "multiqc_data/multiqc_data.json").as_posix(), ] # Analysis results @@ -430,7 +551,10 @@ analysis_specific_results = [] # Germline SNVs/SVs analysis_specific_results.extend( - expand(vep_dir + "{vcf}.vcf.gz", vcf=get_vcf(config, germline_caller, germline_call_samples)) + expand( + vep_dir + "{vcf}.vcf.gz", + vcf=get_vcf(config, germline_caller, germline_call_samples), + ) ) # Germline SNVs specifically for genotype @@ -439,23 +563,35 @@ if config["analysis"]["analysis_type"] == "paired": # Raw VCFs analysis_specific_results.extend( - expand(vcf_dir + "{vcf}.research.vcf.gz", vcf=get_vcf(config, somatic_caller, [case_id])) + expand( + vcf_dir + "{vcf}.research.vcf.gz", + vcf=get_vcf(config, somatic_caller, [case_id]), + ) ) # Filtered and passed post annotation research VCFs analysis_specific_results.extend( - expand(vep_dir + "{vcf}.research.filtered.pass.vcf.gz", vcf=get_vcf(config, somatic_caller, [case_id])) + expand( + vep_dir + "{vcf}.research.filtered.pass.vcf.gz", + vcf=get_vcf(config, somatic_caller, [case_id]), + ) ) # Filtered and passed post annotation clinical VCFs analysis_specific_results.extend( - expand(vep_dir + "{vcf}.clinical.filtered.pass.vcf.gz", vcf=get_vcf(config, somatic_caller, [case_id])) + expand( + vep_dir + "{vcf}.clinical.filtered.pass.vcf.gz", + vcf=get_vcf(config, somatic_caller, [case_id]), + ) ) # TMB 
analysis_specific_results.extend( - expand(vep_dir + "{vcf}.balsamic_stat", vcf=get_vcf(config, somatic_caller_tmb, [case_id])) + expand( + vep_dir + "{vcf}.balsamic_stat", + vcf=get_vcf(config, somatic_caller_tmb, [case_id]), + ) ) # CNV report @@ -465,79 +601,127 @@ analysis_specific_results.append(cnv_dir + "CNV.somatic." + case_id + ".report.p if config["analysis"]["sequencing_type"] != "wgs": # CNVkit analysis_specific_results.append(cnv_dir + "tumor.merged.cns") - analysis_specific_results.extend(expand(cnv_dir + "tumor.merged-{plot}", plot=["diagram.pdf", "scatter.pdf"])) - analysis_specific_results.append(cnv_dir + case_id +".gene_metrics") + analysis_specific_results.extend( + expand(cnv_dir + "tumor.merged-{plot}", plot=["diagram.pdf", "scatter.pdf"]) + ) + analysis_specific_results.append(cnv_dir + case_id + ".gene_metrics") # vcf2cytosure - analysis_specific_results.extend(expand( - vcf_dir + "CNV.somatic.{case_name}.{var_caller}.vcf2cytosure.cgh", - case_name=case_id, - var_caller=["cnvkit"] - )) + analysis_specific_results.extend( + expand( + vcf_dir + "CNV.somatic.{case_name}.{var_caller}.vcf2cytosure.cgh", + case_name=case_id, + var_caller=["cnvkit"], + ) + ) # VarDict analysis_specific_results.extend( - expand(vep_dir + "{vcf}.research.filtered.pass.ranked.vcf.gz", vcf=get_vcf(config, ["vardict"], [case_id])) + expand( + vep_dir + "{vcf}.research.filtered.pass.ranked.vcf.gz", + vcf=get_vcf(config, ["vardict"], [case_id]), + ) ) # UMI if config["analysis"]["analysis_workflow"] == "balsamic-umi": - analysis_specific_results.extend(expand(umi_qc_dir + "{sample}.umi.mean_family_depth", sample=config_model.get_all_sample_names())) + analysis_specific_results.extend( + expand( + umi_qc_dir + "{sample}.umi.mean_family_depth", + sample=config_model.get_all_sample_names(), + ) + ) if background_variant_file: analysis_specific_results.extend( - expand(umi_qc_dir + "{case_name}.{var_caller}.AFtable.txt", case_name=case_id, var_caller=["tnscope_umi"]) + 
expand( + umi_qc_dir + "{case_name}.{var_caller}.AFtable.txt", + case_name=case_id, + var_caller=["tnscope_umi"], + ) + ) + + +if ( + config["analysis"]["sequencing_type"] == "wgs" + and config["analysis"]["analysis_type"] == "paired" +): + analysis_specific_results.extend( + expand( + vcf_dir + "{vcf}.copynumber.txt.gz", + vcf=get_vcf(config, ["ascat"], [case_id]), ) + ) + analysis_specific_results.extend( + expand(vcf_dir + "{vcf}.cov.gz", vcf=get_vcf(config, ["dellycnv"], [case_id])) + ) + analysis_specific_results.extend( + expand( + vcf_dir + "SV.somatic.{case_name}.{sample_type}.tiddit_cov.bed", + case_name=case_id, + sample_type=["tumor", "normal"], + ) + ) + analysis_specific_results.extend( + expand( + vcf_dir + "CNV.somatic.{case_name}.{sample_type}.vcf2cytosure.cgh", + case_name=case_id, + sample_type=["tumor", "normal"], + ) + ) - - -if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "paired": +if ( + config["analysis"]["sequencing_type"] == "wgs" + and config["analysis"]["analysis_type"] == "single" +): analysis_specific_results.extend( - expand(vcf_dir + "{vcf}.copynumber.txt.gz", vcf=get_vcf(config, ["ascat"], [case_id])) + expand( + vcf_dir + "CNV.somatic.{case_name}.{sample_type}.vcf2cytosure.cgh", + case_name=case_id, + sample_type=["tumor"], + ) ) analysis_specific_results.extend( - expand(vcf_dir + "{vcf}.cov.gz", vcf=get_vcf(config,["dellycnv"],[case_id])) - ) - analysis_specific_results.extend(expand( - vcf_dir + "SV.somatic.{case_name}.{sample_type}.tiddit_cov.bed", - case_name=case_id, - sample_type=["tumor", "normal"] - )) - analysis_specific_results.extend(expand( - vcf_dir + "CNV.somatic.{case_name}.{sample_type}.vcf2cytosure.cgh", - case_name=case_id, - sample_type=["tumor","normal"] - )) - -if config['analysis']['sequencing_type'] == "wgs" and config['analysis']['analysis_type'] == 'single': - analysis_specific_results.extend(expand( - vcf_dir + 
"CNV.somatic.{case_name}.{sample_type}.vcf2cytosure.cgh", - case_name=case_id, - sample_type=["tumor"] - )) - analysis_specific_results.extend(expand( - vcf_dir + "SV.somatic.{case_name}.tumor.tiddit_cov.bed", - case_name=case_id, - )) - -if config['analysis']['analysis_type'] == "single": + expand( + vcf_dir + "SV.somatic.{case_name}.tumor.tiddit_cov.bed", + case_name=case_id, + ) + ) + +if config["analysis"]["analysis_type"] == "single": analysis_specific_results.extend( - expand(vcf_dir + "{vcf}.cov.gz",vcf=get_vcf(config,["dellycnv"],[case_id])) + expand(vcf_dir + "{vcf}.cov.gz", vcf=get_vcf(config, ["dellycnv"], [case_id])) ) # GENS Outputs -if config["analysis"]["sequencing_type"] == "wgs" and "gens_coverage_pon" in config["reference"]: +if ( + config["analysis"]["sequencing_type"] == "wgs" + and "gens_coverage_pon" in config["reference"] +): analysis_specific_results.extend( - expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"]) + expand( + cnv_dir + "{sample}.{gens_input}.bed.gz", + sample=sample_names, + gens_input=["cov", "baf"], + ) ) # Dragen -if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "single": +if ( + config["analysis"]["sequencing_type"] == "wgs" + and config["analysis"]["analysis_type"] == "single" +): if "dragen" in config: - analysis_specific_results.extend([ - Path(result_dir, "dragen", "SNV.somatic." + case_id + ".dragen_tumor.bam").as_posix(), - Path(result_dir, "dragen", "SNV.somatic." + case_id + ".dragen.vcf.gz").as_posix() - ]) + analysis_specific_results.extend( + [ + Path( + result_dir, "dragen", "SNV.somatic." + case_id + ".dragen_tumor.bam" + ).as_posix(), + Path( + result_dir, "dragen", "SNV.somatic." 
+ case_id + ".dragen.vcf.gz" + ).as_posix(), + ] + ) LOG.info(f"Following outputs will be delivered {analysis_specific_results}") -if 'benchmark_plots' in config: +if "benchmark_plots" in config: log_dir = config["analysis"]["log"] if not check_executable("sh5util"): LOG.warning("sh5util executable does not exist. Won't be able to plot analysis") @@ -551,7 +735,11 @@ if 'benchmark_plots' in config: benchmark_plot = Path(benchmark_dir, job_name + ".pdf") log_file_plot = plot_analysis(log_file, h5_file, benchmark_plot) - logging.debug("Plot file for {} available at: {}".format(log_file.as_posix(), log_file_plot)) + logging.debug( + "Plot file for {} available at: {}".format( + log_file.as_posix(), log_file_plot + ) + ) # Merge plots into one based on rule name for my_rule in vars(rules).keys(): @@ -560,34 +748,36 @@ if 'benchmark_plots' in config: for plots in Path(benchmark_dir).glob(f"BALSAMIC*.{my_rule}.*.pdf"): my_rule_pdf.append(plots.as_posix()) my_rule_plots.append(plots) - my_rule_pdf.write(Path(benchmark_dir, my_rule+".pdf").as_posix()) + my_rule_pdf.write(Path(benchmark_dir, my_rule + ".pdf").as_posix()) my_rule_pdf.close() # Delete previous plots after merging for plots in my_rule_plots: plots.unlink() -if 'delivery' in config: +if "delivery" in config: wildcard_dict = { "sample": sample_names, "case_name": case_id, - "allow_missing": True + "allow_missing": True, } - if config['analysis']["analysis_type"] in ["paired", "single"]: - wildcard_dict.update({ - "var_type": ["CNV", "SNV", "SV"], - "var_class": ["somatic", "germline"], - "var_caller": somatic_caller + germline_caller, - "bedchrom": config["panel"]["chrom"] if "panel" in config else [], - }) + if config["analysis"]["analysis_type"] in ["paired", "single"]: + wildcard_dict.update( + { + "var_type": ["CNV", "SNV", "SV"], + "var_class": ["somatic", "germline"], + "var_caller": somatic_caller + germline_caller, + "bedchrom": config["panel"]["chrom"] if "panel" in config else [], + } + ) - if 
'rules_to_deliver' in config: - rules_to_deliver = config['rules_to_deliver'].split(",") + if "rules_to_deliver" in config: + rules_to_deliver = config["rules_to_deliver"].split(",") else: - rules_to_deliver = ['multiqc'] + rules_to_deliver = ["multiqc"] - output_files_ready = [('path', 'path_index', 'step', 'tag', 'id', 'format')] + output_files_ready = [("path", "path_index", "step", "tag", "id", "format")] for my_rule in set(rules_to_deliver): try: @@ -597,29 +787,36 @@ if 'delivery' in config: continue LOG.info("Delivering step (rule) {} {}.".format(my_rule, housekeeper_id)) - files_to_deliver = get_rule_output(rules=rules, rule_name=my_rule, output_file_wildcards=wildcard_dict) + files_to_deliver = get_rule_output( + rules=rules, rule_name=my_rule, output_file_wildcards=wildcard_dict + ) LOG.info("The following files added to delivery: {}".format(files_to_deliver)) output_files_ready.extend(files_to_deliver) - output_files_ready = [dict(zip(output_files_ready[0], value)) for value in output_files_ready[1:]] - delivery_ready = Path(get_result_dir(config), "delivery_report", case_id + "_delivery_ready.hk").as_posix() + output_files_ready = [ + dict(zip(output_files_ready[0], value)) for value in output_files_ready[1:] + ] + delivery_ready = Path( + get_result_dir(config), "delivery_report", case_id + "_delivery_ready.hk" + ).as_posix() write_json(output_files_ready, delivery_ready) FormatFile(delivery_ready) wildcard_constraints: - sample = "|".join(sample_names) + sample="|".join(sample_names), + rule all: input: - quality_control_results + analysis_specific_results + quality_control_results + analysis_specific_results, output: - finish_file = Path(get_result_dir(config), "analysis_finish").as_posix() + finish_file=Path(get_result_dir(config), "analysis_finish").as_posix(), params: - tmp_dir = tmp_dir, - case_name = config["analysis"]["case_id"], + tmp_dir=tmp_dir, + case_name=config["analysis"]["case_id"], message: - "Finalizing analysis for 
{params.case_name}", + "Finalizing analysis for {params.case_name}" run: import datetime import shutil @@ -633,11 +830,11 @@ rule all: LOG.error(val_exc) raise BalsamicError - # Remove temporary directory tree + # Remove temporary directory tree try: shutil.rmtree(params.tmp_dir) except OSError as e: - print ("Error: %s - %s." % (e.filename, e.strerror)) + print("Error: %s - %s." % (e.filename, e.strerror)) - # Finish timestamp file + # Finish timestamp file write_finish_file(file_path=output.finish_file) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4d840e100..81d664ca6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,10 @@ +[15.0.1] +------- + +Added: +^^^^^^ +* CLI option for the minimum raw reads supporting each UMI group filter + [15.0.0] ------- diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 785b428bb..754e2147f 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -1,10 +1,11 @@ """Test helper functions.""" + import json import logging import subprocess import sys from pathlib import Path -from typing import Dict, List, Any +from typing import Any, Dict, List from unittest import mock import click @@ -40,6 +41,7 @@ job_id_dump_to_yaml, validate_cache_version, validate_exome_option, + validate_umi_min_reads, ) from BALSAMIC.utils.exc import BalsamicError, WorkflowRunError from BALSAMIC.utils.io import ( @@ -1072,6 +1074,23 @@ def test_validate_cache_version_wrong_format(): validate_cache_version(click.Context, click.Parameter, cli_version) +@pytest.mark.parametrize( + "umi_min_reads, should_fail", + [("3,2,1", False), ("1,0,0", False), ("3,2", True), ("a,b,c", True), (None, False)], +) +def test_validate_umi_min_reads(umi_min_reads: str, should_fail: bool): + """Test UMI min reads option validation.""" + if should_fail: + with pytest.raises(click.BadParameter): + validate_umi_min_reads( + _ctx=click.Context, _param=click.Parameter, umi_min_reads=umi_min_reads + ) + else: + validate_umi_min_reads( + 
_ctx=click.Context, _param=click.Parameter, umi_min_reads=umi_min_reads + ) + + def test_read_vcf(vcf_file_path, vcf_file_gz_path): """Test correct reading of VCF files.""" # GIVEN a path to two identical VCF files, one gzipped and one not