From e6516d61405881affc561151fedff1fcbc3d0f16 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 14 Jun 2024 17:58:20 +0200 Subject: [PATCH 01/39] add gens functionality to tga --- .../assets/scripts/postprocess_gens_cnvkit.py | 36 +++ BALSAMIC/assets/scripts/preprocess_gens.py | 2 +- BALSAMIC/constants/rules.py | 2 + BALSAMIC/constants/tools.py | 3 + .../variant_calling/gens_preprocessing.rule | 235 +++++++++++------- 5 files changed, 191 insertions(+), 87 deletions(-) create mode 100644 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py new file mode 100644 index 000000000..8c95c496c --- /dev/null +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -0,0 +1,36 @@ +import click +import pandas as pd + +@click.command() +@click.option( + "-o", + "--output-file", + required=True, + type=click.Path(exists=False), + help="Name of output-file.", +) +@click.option( + "-c", + "--normalised-coverage-path", + required=True, + type=click.Path(exists=True), + help="Input CNVkit cnr result.", +) +def create_gens_cov_file(output_file, ormalised_coverage_path): + # Process CNVkit file + log2_data = [] + cnvkit_df = pd.read_csv(ormalised_coverage_path, sep='\t') + for index, row in cnvkit_df.iterrows(): + if row['gene'] == "Antitarget": + continue + midpoint = row['start'] + int((row['end'] - row['start']) / 2) + log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{row['log2']}") + + # Write log2 data to output file + with open(output_file, 'w') as log2_out: + for resolution in ['o', 'a', 'b', 'c', 'd']: + for line in log2_data: + log2_out.write(f"{resolution}_{line}\n") + +if __name__ == '__main__': + create_gens_cov_file() diff --git a/BALSAMIC/assets/scripts/preprocess_gens.py b/BALSAMIC/assets/scripts/preprocess_gens.py index 0e146a3f1..e3c07a5e0 100755 --- a/BALSAMIC/assets/scripts/preprocess_gens.py +++ b/BALSAMIC/assets/scripts/preprocess_gens.py @@ -26,7 +26,7 @@ "-s", "--sequencing-type", required=True, - type=click.Choice([SequencingType.WGS]), + type=click.Choice([SequencingType.WGS, SequencingType.TARGETED]), help="Sequencing type used.", ) @click.pass_context diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py index 947dbac72..59cdab9f2 100644 --- a/BALSAMIC/constants/rules.py +++ b/BALSAMIC/constants/rules.py @@ -77,6 +77,7 @@ "snakemake_rules/align/postprocess_bam.rule", ], "varcall": [ + "snakemake_rules/variant_calling/gens_preprocessing.rule", "snakemake_rules/variant_calling/germline.rule", "snakemake_rules/variant_calling/split_bed.rule", "snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule", @@ -108,6 +109,7 @@ "snakemake_rules/align/postprocess_bam.rule", ], "varcall": [ + "snakemake_rules/variant_calling/gens_preprocessing.rule", "snakemake_rules/variant_calling/germline.rule", "snakemake_rules/variant_calling/split_bed.rule", "snakemake_rules/variant_calling/somatic_tumor_normal.rule", diff --git a/BALSAMIC/constants/tools.py b/BALSAMIC/constants/tools.py index 197650df4..71ff8ab65 100644 --- a/BALSAMIC/constants/tools.py +++ b/BALSAMIC/constants/tools.py @@ -40,6 +40,9 @@ "c": 1000, "d": 100, }, + }, + SequencingType.TARGETED: { + "BAF_SKIP_N": {"o": 0, "a": 0, "b": 0, "c": 0, "d": 0}, } }, } diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule index e2b32534f..73f70f551 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule @@ -1,98 +1,161 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -rule sentieon_DNAscope_gnomad: - input: - ref = config["reference"]["reference_genome"], - gnomad_af5= config["reference"]["gnomad_min_af5"], - bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample) - output: - gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", - params: - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - pcr_model = params.common.pcr_model, - sentieon_exec = config["SENTIEON_EXEC"], - sentieon_lic = config["SENTIEON_LICENSE"], - sentieon_ml_dnascope = config["SENTIEON_DNASCOPE"], - sample = "{sample}" - benchmark: - Path(benchmark_dir, "sentieon_DNAscope_gnomad_{sample}.tsv").as_posix() - threads: - get_threads(cluster_config, "sentieon_DNAscope_gnomad") - message: - "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}" - shell: - """ -export TMPDIR={params.tmpdir}; -export SENTIEON_TMPDIR={params.tmpdir}; -export SENTIEON_LICENSE={params.sentieon_lic}; -export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope}; +if config["analysis"]["sequencing_type"] == 'wgs': + rule sentieon_DNAscope_gnomad: + input: + ref=config["reference"]["reference_genome"], + gnomad_af5=config["reference"]["gnomad_min_af5"], + bam=lambda wildcards: config_model.get_final_bam_name(bam_dir=bam_dir,sample_name=wildcards.sample) + output: + gnomad_af5_vcf=cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + params: + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + pcr_model=params.common.pcr_model, + sentieon_exec=config["SENTIEON_EXEC"], + sentieon_lic=config["SENTIEON_LICENSE"], + sample="{sample}" + benchmark: + Path(benchmark_dir,"sentieon_DNAscope_gnomad_{sample}.tsv").as_posix() + threads: + get_threads(cluster_config,"sentieon_DNAscope_gnomad") + message: + "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}" + shell: + """ + export TMPDIR={params.tmpdir}; + export SENTIEON_TMPDIR={params.tmpdir}; + export SENTIEON_LICENSE={params.sentieon_lic}; + export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope}; + + {params.sentieon_exec} driver \ + -t {threads} \ + -r {input.ref} \ + -i {input.bam} \ + --algo DNAscope \ + --pcr_indel_mode {params.pcr_model} \ + --given {input.gnomad_af5} {output.gnomad_af5_vcf}; + + rm -rf {params.tmpdir}; + """ -{params.sentieon_exec} driver \ --t {threads} \ --r {input.ref} \ --i {input.bam} \ ---algo DNAscope \ ---pcr_indel_mode {params.pcr_model} \ ---given {input.gnomad_af5} {output.gnomad_af5_vcf}; + rule gatk_denoisereadcounts: + input: + gens_pon=config["reference"]["gens_coverage_pon"], + readcounts_hdf5=cnv_dir + "{sample}.collectreadcounts.hdf5" + output: + denoised_cr=cnv_dir + "{sample}.denoisedCR.tsv", + standardized_cr=cnv_dir + "{sample}.standardizedCR.tsv" + params: + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + sample="{sample}" + benchmark: + Path(benchmark_dir,"gatk_denoisereadcounts_{sample}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix() + threads: + get_threads(cluster_config,"gatk_denoisereadcounts") + message: + "Running GATK DenoiseReadCounts on {params.sample} for GENS." + shell: + """ + export TMPDIR={params.tmpdir}; -rm -rf {params.tmpdir}; - """ + gatk --java-options "-Xmx60g" DenoiseReadCounts \ + -I {input.readcounts_hdf5} \ + --count-panel-of-normals {input.gens_pon} \ + --tmp-dir {params.tmpdir} \ + --standardized-copy-ratios {output.standardized_cr} \ + --denoised-copy-ratios {output.denoised_cr} -rule gatk_denoisereadcounts: - input: - gens_pon = config["reference"]["gens_coverage_pon"], - readcounts_hdf5 = cnv_dir + "{sample}.collectreadcounts.hdf5" - output: - denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv", - standardized_cr = cnv_dir + "{sample}.standardizedCR.tsv" - params: - tmpdir=tempfile.mkdtemp(prefix=tmp_dir), - sample="{sample}" - benchmark: - Path(benchmark_dir, "gatk_denoisereadcounts_{sample}.tsv").as_posix() - singularity: - Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix() - threads: - get_threads(cluster_config,"gatk_denoisereadcounts") - message: - "Running GATK DenoiseReadCounts on {params.sample} for GENS." - shell: - """ -export TMPDIR={params.tmpdir}; + rm -rf {params.tmpdir} + """ + + rule gens_preprocessing_wgs: + input: + denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv", + gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + output: + gens_baf_bed = cnv_dir + "{sample}.baf.bed", + gens_cov_bed = cnv_dir + "{sample}.cov.bed" + params: + gens_preprocessing = get_script_path("preprocess_gens.py"), + sequencing_type = sequencing_type, + sample="{sample}" + benchmark: + Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix() + threads: + get_threads(cluster_config, "gens_preprocessing") + message: + "Formatting output for GENS for sample: {params.sample}." + shell: + """ + python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} + python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr} + """ +else: + rule sentieon_DNAscope_gnomad_tga: + input: + bed=config["panel"]["capture_kit"], + ref=config["reference"]["reference_genome"], + gnomad_af5=config["reference"]["gnomad_min_af5"], + bam=lambda wildcards: config_model.get_final_bam_name(bam_dir=bam_dir,sample_name=wildcards.sample) + output: + gnomad_af5_vcf=cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + params: + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + pcr_model=params.common.pcr_model, + sentieon_exec=config["SENTIEON_EXEC"], + sentieon_lic=config["SENTIEON_LICENSE"], + sample="{sample}" + benchmark: + Path(benchmark_dir,"sentieon_DNAscope_gnomad_{sample}.tsv").as_posix() + threads: + get_threads(cluster_config,"sentieon_DNAscope_gnomad") + message: + "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}" + shell: + """ + export TMPDIR={params.tmpdir}; + export SENTIEON_TMPDIR={params.tmpdir}; + export SENTIEON_LICENSE={params.sentieon_lic}; + export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope}; -gatk --java-options "-Xmx60g" DenoiseReadCounts \ --I {input.readcounts_hdf5} \ ---count-panel-of-normals {input.gens_pon} \ ---tmp-dir {params.tmpdir} \ ---standardized-copy-ratios {output.standardized_cr} \ ---denoised-copy-ratios {output.denoised_cr} + {params.sentieon_exec} driver \ + -t {threads} \ + -r {input.ref} \ + -i {input.bam} \ + --interval {input.bed} \ + --interval_padding 100 \ + --algo DNAscope \ + --pcr_indel_mode {params.pcr_model} \ + --given {input.gnomad_af5} {output.gnomad_af5_vcf}; -rm -rf {params.tmpdir} - """ + rm -rf {params.tmpdir}; + """ -rule gens_preprocessing: - input: - denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv", - gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", - output: - gens_baf_bed = cnv_dir + "{sample}.baf.bed", - gens_cov_bed = cnv_dir + "{sample}.cov.bed" - params: - gens_preprocessing = get_script_path("preprocess_gens.py"), - sequencing_type = sequencing_type, - sample="{sample}" - benchmark: - Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix() - threads: - get_threads(cluster_config, "gens_preprocessing") - message: - "Formatting output for GENS for sample: {params.sample}." - shell: - """ -python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} -python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr} - """ + rule gens_preprocessing_tga: + input: + cnvkit_cnr = cnv_dir + "tumor.merged.cnr", + gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + output: + gens_baf_bed = cnv_dir + "{sample}.baf.bed", + gens_cov_bed = cnv_dir + "{sample}.cov.bed" + params: + gens_preprocessing = get_script_path("preprocess_gens.py"), + gens_preprocess_cnvkit = get_script_path("postprocess_gens_cnvkit.py"), + sample="{sample}" + benchmark: + Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix() + threads: + get_threads(cluster_config, "gens_preprocessing") + message: + "Formatting output for GENS for sample: {params.sample}." + shell: + """ + python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} + python {params.gens_preprocess_cnvkit} -o {output.gens_cov_bed} --normalised-coverage-path {input.cnvkit_cnr} + """ rule finalize_gens_outputfiles: input: From 7f6e521fc7fdc5235f9fb43891112b9ba18aaaff Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 14 Jun 2024 18:01:24 +0200 Subject: [PATCH 02/39] add gens inputs --- BALSAMIC/workflows/balsamic.smk | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 49a5f34e0..25226cc49 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -9,7 +9,7 @@ from pathlib import Path from typing import Dict, List from BALSAMIC.constants.constants import FileType -from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType +from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType, SequencingType from BALSAMIC.constants.paths import BALSAMIC_DIR, SENTIEON_DNASCOPE_DIR, SENTIEON_TNSCOPE_DIR from BALSAMIC.constants.rules import SNAKEMAKE_RULES from BALSAMIC.constants.variant_filters import ( @@ -522,10 +522,15 @@ if config['analysis']['analysis_type'] == "single": ) # GENS Outputs -if config["analysis"]["sequencing_type"] == "wgs" and "gens_coverage_pon" in config["reference"]: +if config["analysis"]["sequencing_type"] == SequencingType.WGS and "gens_coverage_pon" in config["reference"]: analysis_specific_results.extend( expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"]) ) +if config["analysis"]["sequencing_type"] == SequencingType.TARGETED: + analysis_specific_results.extend( + expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"]) + ) + # Dragen if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "single": From 24e41a9d33a37038b78a2f0064f869bbe3beee14 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 14 Jun 2024 18:56:08 +0200 Subject: [PATCH 03/39] add gens gnomad af to tga --- .../assets/scripts/postprocess_gens_cnvkit.py | 18 ++++++---- BALSAMIC/commands/config/case.py | 25 ++----------- BALSAMIC/constants/tools.py | 2 +- BALSAMIC/utils/cli.py | 36 +++++++++++++++++++ BALSAMIC/workflows/balsamic.smk | 2 +- 5 files changed, 51 insertions(+), 32 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 8c95c496c..cc387202f 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -1,6 +1,7 @@ import click import pandas as pd + @click.command() @click.option( "-o", @@ -19,18 +20,21 @@ def create_gens_cov_file(output_file, ormalised_coverage_path): # Process CNVkit file log2_data = [] - cnvkit_df = pd.read_csv(ormalised_coverage_path, sep='\t') + cnvkit_df = pd.read_csv(ormalised_coverage_path, sep="\t") for index, row in cnvkit_df.iterrows(): - if row['gene'] == "Antitarget": + if row["gene"] == "Antitarget": continue - midpoint = row['start'] + int((row['end'] - row['start']) / 2) - log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{row['log2']}") + midpoint = row["start"] + int((row["end"] - row["start"]) / 2) + log2_data.append( + f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{row['log2']}" + ) # Write log2 data to output file - with open(output_file, 'w') as log2_out: - for resolution in ['o', 'a', 'b', 'c', 'd']: + with open(output_file, "w") as log2_out: + for resolution in ["o", "a", "b", "c", "d"]: for line in log2_data: log2_out.write(f"{resolution}_{line}\n") -if __name__ == '__main__': + +if __name__ == "__main__": create_gens_cov_file() diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index 0c4444bbc..2e038762e 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -50,6 +50,7 @@ get_bioinfo_tools_version, get_panel_chrom, get_sample_list, + get_gens_references, ) from BALSAMIC.utils.io import read_json, write_json from BALSAMIC.utils.utils import get_absolute_paths_dict @@ -129,29 +130,7 @@ def case_config( if cadd_annotations: references.update(cadd_annotations_path) - if any([genome_interval, gens_coverage_pon, gnomad_min_af5]): - if panel_bed: - raise click.BadParameter( - "GENS is currently not compatible with TGA analysis, only WGS." - ) - if not all([genome_interval, gens_coverage_pon, gnomad_min_af5]): - raise click.BadParameter( - "All three arguments (genome_interval gens_coverage_pon, gnomad_min_af5) are required for GENS." - ) - - gens_ref_files = { - "genome_interval": genome_interval, - "gens_coverage_pon": gens_coverage_pon, - "gnomad_min_af5": gnomad_min_af5, - } - - references.update( - { - gens_file: path - for gens_file, path in gens_ref_files.items() - if path is not None - } - ) + references = get_gens_references(genome_interval, gens_coverage_pon, gnomad_min_af5, panel_bed, references) variants_observations = { "clinical_snv_observations": clinical_snv_observations, diff --git a/BALSAMIC/constants/tools.py b/BALSAMIC/constants/tools.py index 71ff8ab65..0fe61fc4a 100644 --- a/BALSAMIC/constants/tools.py +++ b/BALSAMIC/constants/tools.py @@ -43,6 +43,6 @@ }, SequencingType.TARGETED: { "BAF_SKIP_N": {"o": 0, "a": 0, "b": 0, "c": 0, "d": 0}, - } + }, }, } diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 1fd903ac5..c4580454e 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -215,6 +215,42 @@ def bioinfo_tool_version_conda( return conda_bioinfo_version +def get_gens_references( + genome_interval: Optional[str], + gens_coverage_pon: Optional[str], + gnomad_min_af5: Optional[str], + panel_bed: Optional[str], + references: Dict[str, str], +) -> Dict: + + gens_arguments = [genome_interval, gens_coverage_pon, gnomad_min_af5] + + # Check if any of the gens arguments are provided + if any(gens_arguments): + # Ensure all required arguments are provided or raise an error if panel_bed is not set + if not all(gens_arguments) and not panel_bed: + raise click.BadParameter( + "All three arguments (genome_interval, gens_coverage_pon, gnomad_min_af5) are required for GENS in WGS." + ) + + # Construct the gens_ref_files dictionary + gens_ref_files = ( + {"gnomad_min_af5": gnomad_min_af5} + if panel_bed + else { + "genome_interval": genome_interval, + "gens_coverage_pon": gens_coverage_pon, + "gnomad_min_af5": gnomad_min_af5, + } + ) + + # Update references dictionary with gens values + references.update( + {gens_file: path for gens_file, path in gens_ref_files.items() if path} + ) + return references + + def get_bioinfo_tools_version( bioinfo_tools: dict, container_conda_env_path: Path ) -> dict: diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 25226cc49..4014d8510 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -526,7 +526,7 @@ if config["analysis"]["sequencing_type"] == SequencingType.WGS and "gens_coverag analysis_specific_results.extend( expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"]) ) -if config["analysis"]["sequencing_type"] == SequencingType.TARGETED: +if config["analysis"]["sequencing_type"] == SequencingType.TARGETED and "gnomad_min_af5" in config["reference"]: analysis_specific_results.extend( expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"]) ) From cdbe43fce0404ba4964169dde0bbf85c73ec9f8b Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 14 Jun 2024 18:56:17 +0200 Subject: [PATCH 04/39] black --- BALSAMIC/commands/config/case.py | 4 +++- BALSAMIC/utils/cli.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index 2e038762e..081de6265 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -130,7 +130,9 @@ def case_config( if cadd_annotations: references.update(cadd_annotations_path) - references = get_gens_references(genome_interval, gens_coverage_pon, gnomad_min_af5, panel_bed, references) + references = get_gens_references( + genome_interval, gens_coverage_pon, gnomad_min_af5, panel_bed, references + ) variants_observations = { "clinical_snv_observations": clinical_snv_observations, diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index c4580454e..47ef70c29 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -222,7 +222,6 @@ def get_gens_references( panel_bed: Optional[str], references: Dict[str, str], ) -> Dict: - gens_arguments = [genome_interval, gens_coverage_pon, gnomad_min_af5] # Check if any of the gens arguments are provided From f262919814253facba4da62af034e07f612164fc Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 14 Jun 2024 19:07:39 +0200 Subject: [PATCH 05/39] fix --- BALSAMIC/utils/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 47ef70c29..2fdf20902 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -245,7 +245,7 @@ def get_gens_references( # Update references dictionary with gens values references.update( - {gens_file: path for gens_file, path in gens_ref_files.items() if path} + {gens_file: path for gens_file, path in gens_ref_files.items() if path is not None} ) return references From c2aafb9af6a0427d74d8c69baaf7295149f62920 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 14 Jun 2024 19:14:12 +0200 Subject: [PATCH 06/39] fix --- BALSAMIC/utils/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 2fdf20902..94ab873f7 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -247,7 +247,7 @@ def get_gens_references( references.update( {gens_file: path for gens_file, path in gens_ref_files.items() if path is not None} ) - return references + return references def get_bioinfo_tools_version( From cc9e5064eff4d92531b18ecae3339c1760fdf567 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 17 Jun 2024 07:47:01 +0200 Subject: [PATCH 07/39] fix --- BALSAMIC/constants/rules.py | 2 -- BALSAMIC/workflows/balsamic.smk | 8 ++------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py index 59cdab9f2..947dbac72 100644 --- a/BALSAMIC/constants/rules.py +++ b/BALSAMIC/constants/rules.py @@ -77,7 +77,6 @@ "snakemake_rules/align/postprocess_bam.rule", ], "varcall": [ - "snakemake_rules/variant_calling/gens_preprocessing.rule", "snakemake_rules/variant_calling/germline.rule", "snakemake_rules/variant_calling/split_bed.rule", "snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule", @@ -109,7 +108,6 @@ "snakemake_rules/align/postprocess_bam.rule", ], "varcall": [ - "snakemake_rules/variant_calling/gens_preprocessing.rule", "snakemake_rules/variant_calling/germline.rule", "snakemake_rules/variant_calling/split_bed.rule", "snakemake_rules/variant_calling/somatic_tumor_normal.rule", diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 4014d8510..63f4ccaa8 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -406,7 +406,7 @@ if "dragen" in config: rules_to_include.append("snakemake_rules/concatenation/concatenation.rule") # Add rule for GENS -if "gens_coverage_pon" in config["reference"]: +if "gnomad_min_af5" in config["reference"]: rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule") rules_to_include.append("snakemake_rules/variant_calling/gens_preprocessing.rule") @@ -522,11 +522,7 @@ if config['analysis']['analysis_type'] == "single": ) # GENS Outputs -if config["analysis"]["sequencing_type"] == SequencingType.WGS and "gens_coverage_pon" in config["reference"]: - analysis_specific_results.extend( - expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"]) - ) -if config["analysis"]["sequencing_type"] == SequencingType.TARGETED and "gnomad_min_af5" in config["reference"]: +if "gnomad_min_af5" in config["reference"]: analysis_specific_results.extend( expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"]) ) From d471395e900fbf1f8b77db898c3228dbd237c9f8 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 17 Jun 2024 07:57:47 +0200 Subject: [PATCH 08/39] fix --- BALSAMIC/workflows/balsamic.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 63f4ccaa8..c15af04e2 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -407,8 +407,9 @@ if "dragen" in config: # Add rule for GENS if "gnomad_min_af5" in config["reference"]: - rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule") rules_to_include.append("snakemake_rules/variant_calling/gens_preprocessing.rule") +if "gnomad_min_af5" in config["reference"] and sequence_type == SequencingType.WGS: + rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule") LOG.info(f"The following rules will be included in the workflow: {rules_to_include}") LOG.info(f"The following Germline variant callers will be included in the workflow: {germline_caller}") From 05bd3ae51bf78c09b027962ef10b384c4332a018 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 17 Jun 2024 08:48:36 +0200 Subject: [PATCH 09/39] fix --- .../snakemake_rules/variant_calling/gens_preprocessing.rule | 2 -- 1 file changed, 2 deletions(-) diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule index 73f70f551..3eb57cce0 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule @@ -26,7 +26,6 @@ if config["analysis"]["sequencing_type"] == 'wgs': export TMPDIR={params.tmpdir}; export SENTIEON_TMPDIR={params.tmpdir}; export SENTIEON_LICENSE={params.sentieon_lic}; - export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope}; {params.sentieon_exec} driver \ -t {threads} \ @@ -119,7 +118,6 @@ else: export TMPDIR={params.tmpdir}; export SENTIEON_TMPDIR={params.tmpdir}; export SENTIEON_LICENSE={params.sentieon_lic}; - export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope}; {params.sentieon_exec} driver \ -t {threads} \ From 686855630586b1faf8ba11f00b546b5e6ef78abb Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 17 Jun 2024 09:02:57 +0200 Subject: [PATCH 10/39] doc strings and named args --- BALSAMIC/commands/config/case.py | 6 +++++- BALSAMIC/utils/cli.py | 12 ++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index 081de6265..02bea71b2 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -131,7 +131,11 @@ def case_config( references.update(cadd_annotations_path) references = get_gens_references( - genome_interval, gens_coverage_pon, gnomad_min_af5, panel_bed, references + genome_interval=genome_interval, + gens_coverage_pon=gens_coverage_pon, + gnomad_min_af5=gnomad_min_af5, + panel_bed=panel_bed, + references=references ) variants_observations = { diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 94ab873f7..361082d94 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -222,6 +222,18 @@ def get_gens_references( panel_bed: Optional[str], references: Dict[str, str], ) -> Dict: + """ + Assigns reference-files required for GENS if they have been supplied. + + :param genome_interval: Coverage-regions. (required for WGS GENS) + :param gens_coverage_pon: PON for GATK CollectReadCounts. (required for WGS GENS) + :param gnomad_min_af5: gnomad VCF filtered to keep variants above 5% VAF. (required for WGS and TGA GENS) + :param panel_bed: Bedfile supplied for TGA analyses. + :param references: Reference dictionary to be updated. + + :return: references: Updated reference dictionary. + """ + gens_arguments = [genome_interval, gens_coverage_pon, gnomad_min_af5] # Check if any of the gens arguments are provided From b03a9f0cdc3772cc1d4cfa8609912db473b966a3 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 17 Jun 2024 09:03:46 +0200 Subject: [PATCH 11/39] black --- BALSAMIC/commands/config/case.py | 2 +- BALSAMIC/utils/cli.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index 02bea71b2..69e12533f 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -135,7 +135,7 @@ def case_config( gens_coverage_pon=gens_coverage_pon, gnomad_min_af5=gnomad_min_af5, panel_bed=panel_bed, - references=references + references=references, ) variants_observations = { diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 361082d94..7dce18211 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -257,7 +257,11 @@ def get_gens_references( # Update references dictionary with gens values references.update( - {gens_file: path for gens_file, path in gens_ref_files.items() if path is not None} + { + gens_file: path + for gens_file, path in gens_ref_files.items() + if path is not None + } ) return references From 582cfb07d62fd2bdc5758caea5bc56a123cf0eff Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 17 Jun 2024 09:14:03 +0200 Subject: [PATCH 12/39] changelog --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d9157bd1b..8b76deeb3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,7 @@ Added: ^^^^^^ * MSIsensor-pro container https://github.com/Clinical-Genomics/BALSAMIC/pull/1444 +* GENS input files for TGA https://github.com/Clinical-Genomics/BALSAMIC/pull/1448 Changed: ^^^^^^^^ From e9c4c7f0b610d0e075db387e4f870673d13a99b4 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 17 Jun 2024 10:09:24 +0200 Subject: [PATCH 13/39] bug fix --- BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule | 1 + 1 file changed, 1 insertion(+) diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule index 3eb57cce0..654d7139f 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule @@ -140,6 +140,7 @@ else: gens_baf_bed = cnv_dir + "{sample}.baf.bed", gens_cov_bed = cnv_dir + "{sample}.cov.bed" params: + sequencing_type = sequencing_type, gens_preprocessing = get_script_path("preprocess_gens.py"), gens_preprocess_cnvkit = get_script_path("postprocess_gens_cnvkit.py"), sample="{sample}" From 8f88da780fc283e4f51d5214a2ac7e2433dcca95 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 17 Jun 2024 10:43:25 +0200 Subject: [PATCH 14/39] fix --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index cc387202f..d06bfc3c2 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -17,10 +17,17 @@ type=click.Path(exists=True), help="Input CNVkit cnr result.", ) -def create_gens_cov_file(output_file, ormalised_coverage_path): +def create_gens_cov_file(output_file, normalised_coverage_path): + """ + Post-processes the CNVkit cnr output for upload to GENS. + Removing Antitarget regions and outputting the coverages in multiple resolution-formats. + + :param output_file: Path to GENS output.cov file + :param normalised_coverage_path: Path to input CNVkit cnr file. + """ # Process CNVkit file log2_data = [] - cnvkit_df = pd.read_csv(ormalised_coverage_path, sep="\t") + cnvkit_df = pd.read_csv(normalised_coverage_path, sep="\t") for index, row in cnvkit_df.iterrows(): if row["gene"] == "Antitarget": continue From 4a4671e5e4cb2848e2c2ab1ca221ef815fe085eb Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 17 Jun 2024 16:45:33 +0200 Subject: [PATCH 15/39] lower padding --- .../snakemake_rules/variant_calling/gens_preprocessing.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule index 654d7139f..f051200ac 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule @@ -124,7 +124,7 @@ else: -r {input.ref} \ -i {input.bam} \ --interval {input.bed} \ - --interval_padding 100 \ + --interval_padding 20 \ --algo DNAscope \ --pcr_indel_mode {params.pcr_model} \ --given {input.gnomad_af5} {output.gnomad_af5_vcf}; From 5bcf7687d7b92e365d1fa6c348d23139a161dd56 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 18 Jun 2024 09:17:38 +0200 Subject: [PATCH 16/39] add tumor purity adjustment to gens cov file --- .../assets/scripts/postprocess_gens_cnvkit.py | 22 +++++++++++++++++-- .../variant_calling/gens_preprocessing.rule | 11 +++++----- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index d06bfc3c2..b2003cc24 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -17,23 +17,41 @@ type=click.Path(exists=True), help="Input CNVkit cnr result.", ) -def create_gens_cov_file(output_file, normalised_coverage_path): +@click.option( + "-p", + "--tumor-purity-path", + required=False, + type=click.Path(exists=True), + help="Tumor purity file from PureCN", +) +def create_gens_cov_file(output_file, normalised_coverage_path, tumor_purity_path): """ Post-processes the CNVkit cnr output for upload to GENS. Removing Antitarget regions and outputting the coverages in multiple resolution-formats. :param output_file: Path to GENS output.cov file :param normalised_coverage_path: Path to input CNVkit cnr file. + :param tumor_purity_path: Path to PureCN purity estimate csv file """ # Process CNVkit file log2_data = [] cnvkit_df = pd.read_csv(normalised_coverage_path, sep="\t") + + # Process PureCN purity file + purity = None + if tumor_purity_path: + purecn_df = pd.read_csv(tumor_purity_path, sep=",") + purity = float(purecn_df.iloc[0]["Purity"]) + for index, row in cnvkit_df.iterrows(): if row["gene"] == "Antitarget": continue midpoint = row["start"] + int((row["end"] - row["start"]) / 2) + log2 = row['log2'] + if purity: + log2 = round(float(log2) / purity, 4) log2_data.append( - f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{row['log2']}" + f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}" ) # Write log2 data to output file diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule index f051200ac..18164d985 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule @@ -65,7 +65,7 @@ if config["analysis"]["sequencing_type"] == 'wgs': --count-panel-of-normals {input.gens_pon} \ --tmp-dir {params.tmpdir} \ --standardized-copy-ratios {output.standardized_cr} \ - --denoised-copy-ratios {output.denoised_cr} + --denoised-copy-ratios {output.denoised_cr} ; rm -rf {params.tmpdir} """ @@ -89,7 +89,7 @@ if config["analysis"]["sequencing_type"] == 'wgs': "Formatting output for GENS for sample: {params.sample}." shell: """ - python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} + python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ; python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr} """ else: @@ -136,6 +136,7 @@ else: input: cnvkit_cnr = cnv_dir + "tumor.merged.cnr", gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", + purecn_purity_csv = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv", output: gens_baf_bed = cnv_dir + "{sample}.baf.bed", gens_cov_bed = cnv_dir + "{sample}.cov.bed" @@ -152,8 +153,8 @@ else: "Formatting output for GENS for sample: {params.sample}." shell: """ - python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} - python {params.gens_preprocess_cnvkit} -o {output.gens_cov_bed} --normalised-coverage-path {input.cnvkit_cnr} + python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ; + python {params.gens_preprocess_cnvkit} -o {output.gens_cov_bed} --normalised-coverage-path {input.cnvkit_cnr} --tumor-purity-path {input.purecn_purity_csv} """ rule finalize_gens_outputfiles: @@ -175,7 +176,7 @@ rule finalize_gens_outputfiles: "Bgzip and index GENS output: {params.gens_input} for sample: {params.sample_id}." shell: """ -bgzip {input.gens_input} +bgzip {input.gens_input} ; tabix {input.gens_input}.gz """ From acde38f688c4f1aab24e3c039eabc273eef703e2 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 19 Jun 2024 15:33:36 +0200 Subject: [PATCH 17/39] typehints --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index b2003cc24..e0542b85c 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -24,7 +24,7 @@ type=click.Path(exists=True), help="Tumor purity file from PureCN", ) -def create_gens_cov_file(output_file, normalised_coverage_path, tumor_purity_path): +def create_gens_cov_file(output_file: str, normalised_coverage_path: str, tumor_purity_path: str | None): """ Post-processes the CNVkit cnr output for upload to GENS. Removing Antitarget regions and outputting the coverages in multiple resolution-formats. From ed4c018a4873160106f3f389e7bc580f64bfdac5 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 19 Jun 2024 15:40:37 +0200 Subject: [PATCH 18/39] black --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index e0542b85c..4ad870c12 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -24,7 +24,9 @@ type=click.Path(exists=True), help="Tumor purity file from PureCN", ) -def create_gens_cov_file(output_file: str, normalised_coverage_path: str, tumor_purity_path: str | None): +def create_gens_cov_file( + output_file: str, normalised_coverage_path: str, tumor_purity_path: str | None +): """ Post-processes the CNVkit cnr output for upload to GENS. Removing Antitarget regions and outputting the coverages in multiple resolution-formats. @@ -47,12 +49,10 @@ def create_gens_cov_file(output_file: str, normalised_coverage_path: str, tumor_ if row["gene"] == "Antitarget": continue midpoint = row["start"] + int((row["end"] - row["start"]) / 2) - log2 = row['log2'] + log2 = row["log2"] if purity: log2 = round(float(log2) / purity, 4) - log2_data.append( - f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}" - ) + log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}") # Write log2 data to output file with open(output_file, "w") as log2_out: From a108a0e2f406699f76418fb14a85fe8bf9cf60ea Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 19 Jun 2024 16:17:21 +0200 Subject: [PATCH 19/39] fix pytests --- tests/commands/config/test_config_sample.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py index 9185e7afa..1cc920fc0 100644 --- a/tests/commands/config/test_config_sample.py +++ b/tests/commands/config/test_config_sample.py @@ -367,7 +367,7 @@ def test_missing_required_gens_arguments( # THEN the CLI should exit code 2 and display an informative error message assert result.exit_code == 2 assert ( - "All three arguments (genome_interval gens_coverage_pon, gnomad_min_af5) are required for GENS." + "All three arguments (genome_interval, gens_coverage_pon, gnomad_min_af5) are required for GENS in WGS." in result.output ) @@ -448,21 +448,17 @@ def test_config_with_gens_arguments_for_tga( balsamic_cache, "--tumor-sample-name", tumor_sample_name, - "--gens-coverage-pon", - gens_cov_pon_file, "--gnomad-min-af5", gens_min_5_af_gnomad_file, - "--genome-interval", - gens_hg19_interval_list, "-p", panel_bed_file, ], ) - # THEN config should fail with error message - assert result.exit_code == 2 - assert ( - "GENS is currently not compatible with TGA analysis, only WGS." in result.output - ) + # THEN a config should be created and exist + assert result.exit_code == 0 + assert Path( + analysis_dir, case_id_tumor_only, f"{case_id_tumor_only}.{FileType.JSON}" + ).exists() def test_config_wgs_with_exome( From 99d3d57514f79f545dcf27f909dd32a0bfb418d5 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 28 Jun 2024 12:15:43 +0200 Subject: [PATCH 20/39] update purity adjustment formula --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 4ad870c12..a0ce25d37 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -44,6 +44,7 @@ def create_gens_cov_file( if tumor_purity_path: purecn_df = pd.read_csv(tumor_purity_path, sep=",") purity = float(purecn_df.iloc[0]["Purity"]) + ploidy = float(purecn_df.iloc[0]["Ploidy"]) for index, row in cnvkit_df.iterrows(): if row["gene"] == "Antitarget": @@ -51,7 +52,7 @@ def create_gens_cov_file( midpoint = row["start"] + int((row["end"] - row["start"]) / 2) log2 = row["log2"] if purity: - log2 = round(float(log2) / purity, 4) + log2 = round(float(1 - purity) + purity * (log2 / ploidy), 4) log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}") # Write log2 data to output file From 44f25e1c0fceeaa6d297ce2701152ce7c718d1a2 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 28 Jun 2024 12:39:09 +0200 Subject: [PATCH 21/39] m conflict --- BALSAMIC/workflows/balsamic.smk | 5 ----- 1 file changed, 5 deletions(-) diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index b4b67c8c6..0ecce72c4 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -9,17 +9,12 @@ from pathlib import Path from typing import Dict, List from BALSAMIC.constants.constants import FileType -<<<<<<< HEAD from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType, SequencingType -from BALSAMIC.constants.paths import BALSAMIC_DIR, SENTIEON_DNASCOPE_DIR, SENTIEON_TNSCOPE_DIR -======= -from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType from BALSAMIC.constants.paths import ( BALSAMIC_DIR, SENTIEON_DNASCOPE_DIR, SENTIEON_TNSCOPE_DIR, ) ->>>>>>> develop from BALSAMIC.constants.rules import SNAKEMAKE_RULES from BALSAMIC.constants.variant_filters import ( COMMON_SETTINGS, From 9335478a7d6917b63a937f772fc907e769d5d695 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 28 Jun 2024 12:44:41 +0200 Subject: [PATCH 22/39] m conflict --- BALSAMIC/workflows/balsamic.smk | 7 ------- 1 file changed, 7 deletions(-) diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index 0ecce72c4..960b83432 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -698,14 +698,7 @@ if config["analysis"]["analysis_type"] == "single": ) # GENS Outputs -<<<<<<< HEAD if "gnomad_min_af5" in config["reference"]: -======= -if ( - config["analysis"]["sequencing_type"] == "wgs" - and "gens_coverage_pon" in config["reference"] -): ->>>>>>> develop analysis_specific_results.extend( expand( cnv_dir + "{sample}.{gens_input}.bed.gz", From 576811644c121dcaa1f397d8a66c0ba5082975d4 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 15 Jul 2024 12:18:17 +0200 Subject: [PATCH 23/39] adjust formula --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index a0ce25d37..21e706c31 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -1,6 +1,9 @@ import click import pandas as pd +import numpy as np +def adjust_log2_ratio(log2_ratio, purity): + return np.log2((2**log2_ratio - (1 - purity)) / purity) @click.command() @click.option( @@ -52,7 +55,7 @@ def create_gens_cov_file( midpoint = row["start"] + int((row["end"] - row["start"]) / 2) log2 = row["log2"] if purity: - log2 = round(float(1 - purity) + purity * (log2 / ploidy), 4) + log2 = adjust_log2_ratio(log2, purity) log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}") # Write log2 data to output file From 64338874c78cf2e15fb08bbbd05eb88e01768e9a Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 15 Jul 2024 13:09:45 +0200 Subject: [PATCH 24/39] add round --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 21e706c31..f05f3ef35 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -55,7 +55,7 @@ def create_gens_cov_file( midpoint = row["start"] + int((row["end"] - row["start"]) / 2) log2 = row["log2"] if purity: - log2 = adjust_log2_ratio(log2, purity) + log2 = round(adjust_log2_ratio(log2, purity), 4) log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}") # Write log2 data to output file From f3a2c665ef3efcbf2696613f34426381a5b74fa5 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 23 Jul 2024 16:36:11 +0200 Subject: [PATCH 25/39] code review --- .../assets/scripts/postprocess_gens_cnvkit.py | 51 ++++++++++++------- BALSAMIC/utils/io.py | 7 +++ 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index f05f3ef35..1e56f4a77 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -1,9 +1,21 @@ import click -import pandas as pd +from BALSAMIC.utils.io import read_csv import numpy as np -def adjust_log2_ratio(log2_ratio, purity): - return np.log2((2**log2_ratio - (1 - purity)) / purity) +def calculate_log2_ratio(purity, log2_ratio, ploidy): + # Ensure that the inputs are within valid ranges + if not (0 <= purity <= 1): + raise ValueError("Purity must be between 0 and 1") + + if ploidy <= 0: + raise ValueError("Ploidy must be a positive number") + + # Calculate the log2 ratio + numerator = (1 - purity) + (purity * log2_ratio) + log2_ratio = np.log2(numerator / ploidy) + + return log2_ratio + @click.command() @click.option( @@ -18,7 +30,7 @@ def adjust_log2_ratio(log2_ratio, purity): "--normalised-coverage-path", required=True, type=click.Path(exists=True), - help="Input CNVkit cnr result.", + help="Input CNVkit cnr. result.", ) @click.option( "-p", @@ -30,33 +42,36 @@ def adjust_log2_ratio(log2_ratio, purity): def create_gens_cov_file( output_file: str, normalised_coverage_path: str, tumor_purity_path: str | None ): - """ - Post-processes the CNVkit cnr output for upload to GENS. - Removing Antitarget regions and outputting the coverages in multiple resolution-formats. + """Post-processes the CNVkit .cnr output for upload to GENS. - :param output_file: Path to GENS output.cov file - :param normalised_coverage_path: Path to input CNVkit cnr file. - :param tumor_purity_path: Path to PureCN purity estimate csv file + Removes Antitarget regions, adjusts for purity and ploidy and outputs the coverages in multiple resolution-formats. + + Args: + output_file: Path to GENS output.cov file + normalised_coverage_path: Path to input CNVkit cnr file. + tumor_purity_path: Path to PureCN purity estimate csv file """ - # Process CNVkit file log2_data = [] - cnvkit_df = pd.read_csv(normalised_coverage_path, sep="\t") + + # Process CNVkit file + cnr_dict_list: list[dict] = read_csv(csv_path=normalised_coverage_path, delimeter="\t") # Process PureCN purity file purity = None if tumor_purity_path: - purecn_df = pd.read_csv(tumor_purity_path, sep=",") - purity = float(purecn_df.iloc[0]["Purity"]) - ploidy = float(purecn_df.iloc[0]["Ploidy"]) + purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",") + purity = float(purecn_dict_list[0]["Purity"]) + ploidy = float(purecn_dict_list[0]["Ploidy"]) - for index, row in cnvkit_df.iterrows(): + for row in cnr_dict_list: if row["gene"] == "Antitarget": continue midpoint = row["start"] + int((row["end"] - row["start"]) / 2) log2 = row["log2"] if purity: - log2 = round(adjust_log2_ratio(log2, purity), 4) - log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}") + log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4) + log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") + # Write log2 data to output file with open(output_file, "w") as log2_out: diff --git a/BALSAMIC/utils/io.py b/BALSAMIC/utils/io.py index ccc4d8a20..d5aba9944 100644 --- a/BALSAMIC/utils/io.py +++ b/BALSAMIC/utils/io.py @@ -1,5 +1,6 @@ """Input/Output utility methods.""" +import csv import gzip import json import logging @@ -66,6 +67,12 @@ def write_json(json_obj: dict, path: str) -> None: raise OSError(f"Error while writing JSON file: {path}, error: {error}") +def read_csv(csv_path: str, delimeter: str) -> List[dict]: + """Read data from a csv file.""" + with open(csv_path, mode='r') as csv_file: + csv_reader = csv.DictReader(csv_file, delimiter=delimeter) + return [row for row in csv_reader] + def read_yaml(yaml_path: str) -> dict: """Read data from a yaml file.""" if Path(yaml_path).exists(): From 2b91865619e578218fae5f9513996a2f69f65b41 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 24 Jul 2024 08:37:22 +0200 Subject: [PATCH 26/39] fix bug --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 1e56f4a77..83a5c3a7a 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -66,7 +66,9 @@ def create_gens_cov_file( for row in cnr_dict_list: if row["gene"] == "Antitarget": continue - midpoint = row["start"] + int((row["end"] - row["start"]) / 2) + start = int(row["start"]) + end = int(row["end"]) + midpoint = start + int((end - start / 2)) log2 = row["log2"] if purity: log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4) From 821bbd3f082181a74ab09e796f8b4aeb722b6f12 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 24 Jul 2024 08:53:40 +0200 Subject: [PATCH 27/39] float --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 83a5c3a7a..29f874b42 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -69,7 +69,7 @@ def create_gens_cov_file( start = int(row["start"]) end = int(row["end"]) midpoint = start + int((end - start / 2)) - log2 = row["log2"] + log2 = float(row["log2"]) if purity: log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4) log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") From 6aa551065a16c59bb4c0ee899f3470eca1c05592 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 24 Jul 2024 10:51:45 +0200 Subject: [PATCH 28/39] comment out purity and ploidy adjustment --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 29f874b42..ef46ccfc3 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -70,8 +70,8 @@ def create_gens_cov_file( end = int(row["end"]) midpoint = start + int((end - start / 2)) log2 = float(row["log2"]) - if purity: - log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4) + #if purity: + # log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4) log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") From f70318973b43b5a93300652d71915651cf2426da Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Thu, 25 Jul 2024 13:58:24 +0200 Subject: [PATCH 29/39] adjust purity and ploidy --- .../assets/scripts/postprocess_gens_cnvkit.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index ef46ccfc3..b395158d3 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -1,6 +1,7 @@ import click from BALSAMIC.utils.io import read_csv import numpy as np +import warnings def calculate_log2_ratio(purity, log2_ratio, ploidy): # Ensure that the inputs are within valid ranges @@ -12,6 +13,10 @@ def calculate_log2_ratio(purity, log2_ratio, ploidy): # Calculate the log2 ratio numerator = (1 - purity) + (purity * log2_ratio) + + if numerator <= 0: + return None + log2_ratio = np.log2(numerator / ploidy) return log2_ratio @@ -63,6 +68,7 @@ def create_gens_cov_file( purity = float(purecn_dict_list[0]["Purity"]) ploidy = float(purecn_dict_list[0]["Ploidy"]) + count_none_log2 = 0 for row in cnr_dict_list: if row["gene"] == "Antitarget": continue @@ -70,10 +76,15 @@ def create_gens_cov_file( end = int(row["end"]) midpoint = start + int((end - start / 2)) log2 = float(row["log2"]) - #if purity: - # log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4) - log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") + if purity: + log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4) + if not log2: + count_none_log2 += 1 + warnings.warn("Numerator is less than or equal to 0, returning None for region.") + continue + log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") + warnings.warn(f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}") # Write log2 data to output file with open(output_file, "w") as log2_out: From 8994fc85fbf83fea840507697aba089dc30df466 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Thu, 25 Jul 2024 15:12:09 +0200 Subject: [PATCH 30/39] bug fix --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index b395158d3..560f949b1 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -77,12 +77,12 @@ def create_gens_cov_file( midpoint = start + int((end - start / 2)) log2 = float(row["log2"]) if purity: - log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4) + log2 = calculate_log2_ratio(purity, log2, ploidy) if not log2: count_none_log2 += 1 warnings.warn("Numerator is less than or equal to 0, returning None for region.") continue - + log2 = round(log2, 4) log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") warnings.warn(f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}") From 11be7bfe547bbf2bd0fd1c574f34421dbdc1f0e6 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Thu, 25 Jul 2024 16:10:56 +0200 Subject: [PATCH 31/39] remove purity and ploidy adjustment --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 560f949b1..05e72426f 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -76,13 +76,13 @@ def create_gens_cov_file( end = int(row["end"]) midpoint = start + int((end - start / 2)) log2 = float(row["log2"]) - if purity: - log2 = calculate_log2_ratio(purity, log2, ploidy) - if not log2: - count_none_log2 += 1 - warnings.warn("Numerator is less than or equal to 0, returning None for region.") - continue - log2 = round(log2, 4) + #if purity: + # log2 = calculate_log2_ratio(purity, log2, ploidy) + # if not log2: + # count_none_log2 += 1 + # warnings.warn("Numerator is less than or equal to 0, returning None for region.") + # continue + # log2 = round(log2, 4) log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") warnings.warn(f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}") From 3d44b03941eb50ef6c7be1a889535db9da914854 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Mon, 29 Jul 2024 08:17:01 +0200 Subject: [PATCH 32/39] new threads --- BALSAMIC/constants/cluster_analysis.json | 4 ++++ .../snakemake_rules/variant_calling/gens_preprocessing.rule | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json index d63b5e136..10828dfcf 100644 --- a/BALSAMIC/constants/cluster_analysis.json +++ b/BALSAMIC/constants/cluster_analysis.json @@ -156,6 +156,10 @@ "time": "24:00:00", "n": 36 }, + "sentieon_DNAscope_gnomad_tga": { + "time": "24:00:00", + "n": 12 + }, "sentieon_TNhaplotyper": { "time": "24:00:00", "n": 36 diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule index 0b5d35e78..466a8beb4 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule @@ -112,7 +112,7 @@ else: benchmark: Path(benchmark_dir,"sentieon_DNAscope_gnomad_{sample}.tsv").as_posix() threads: - get_threads(cluster_config,"sentieon_DNAscope_gnomad") + get_threads(cluster_config,"sentieon_DNAscope_gnomad_tga") message: "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}" shell: From b484670e9081cc4f4c1043b047d0ec43e39b63b3 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 31 Jul 2024 14:24:28 +0200 Subject: [PATCH 33/39] return purity adjustment --- .../assets/scripts/postprocess_gens_cnvkit.py | 25 ++++++++----------- .../containers/varcall_py3/varcall_py3.yaml | 1 - BALSAMIC/utils/io.py | 2 +- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 05e72426f..4cf0d0a5a 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -40,12 +40,12 @@ def calculate_log2_ratio(purity, log2_ratio, ploidy): @click.option( "-p", "--tumor-purity-path", - required=False, + required=True, type=click.Path(exists=True), help="Tumor purity file from PureCN", ) def create_gens_cov_file( - output_file: str, normalised_coverage_path: str, tumor_purity_path: str | None + output_file: str, normalised_coverage_path: str, tumor_purity_path: str ): """Post-processes the CNVkit .cnr output for upload to GENS. @@ -62,11 +62,9 @@ def create_gens_cov_file( cnr_dict_list: list[dict] = read_csv(csv_path=normalised_coverage_path, delimeter="\t") # Process PureCN purity file - purity = None - if tumor_purity_path: - purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",") - purity = float(purecn_dict_list[0]["Purity"]) - ploidy = float(purecn_dict_list[0]["Ploidy"]) + purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",") + purity = float(purecn_dict_list[0]["Purity"]) + ploidy = float(purecn_dict_list[0]["Ploidy"]) count_none_log2 = 0 for row in cnr_dict_list: @@ -76,13 +74,12 @@ def create_gens_cov_file( end = int(row["end"]) midpoint = start + int((end - start / 2)) log2 = float(row["log2"]) - #if purity: - # log2 = calculate_log2_ratio(purity, log2, ploidy) - # if not log2: - # count_none_log2 += 1 - # warnings.warn("Numerator is less than or equal to 0, returning None for region.") - # continue - # log2 = round(log2, 4) + log2 = calculate_log2_ratio(purity, log2, ploidy) + if not log2: + count_none_log2 += 1 + warnings.warn("Numerator is less than or equal to 0, returning None for region.") + continue + log2 = round(log2, 4) log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") warnings.warn(f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}") diff --git a/BALSAMIC/containers/varcall_py3/varcall_py3.yaml b/BALSAMIC/containers/varcall_py3/varcall_py3.yaml index 4f106103e..5c643a93a 100644 --- a/BALSAMIC/containers/varcall_py3/varcall_py3.yaml +++ b/BALSAMIC/containers/varcall_py3/varcall_py3.yaml @@ -163,7 +163,6 @@ dependencies: - svdb=2.8.1 - sysroot_linux-64=2.12 - tabix=1.11 - - tiddit=3.3.2 - tk=8.6.12 - tktable=2.10 - toolz=0.12.0 diff --git a/BALSAMIC/utils/io.py b/BALSAMIC/utils/io.py index d5aba9944..555597a30 100644 --- a/BALSAMIC/utils/io.py +++ b/BALSAMIC/utils/io.py @@ -67,7 +67,7 @@ def write_json(json_obj: dict, path: str) -> None: raise OSError(f"Error while writing JSON file: {path}, error: {error}") -def read_csv(csv_path: str, delimeter: str) -> List[dict]: +def read_csv(csv_path: str, delimeter: str = ",") -> List[dict]: """Read data from a csv file.""" with open(csv_path, mode='r') as csv_file: csv_reader = csv.DictReader(csv_file, delimiter=delimeter) From 1fac87657d100f9fab0b81e69116d85c7e782f07 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Wed, 31 Jul 2024 15:08:07 +0200 Subject: [PATCH 34/39] code review --- .../assets/scripts/postprocess_gens_cnvkit.py | 25 +++++--- BALSAMIC/commands/config/case.py | 11 +++- BALSAMIC/constants/cluster_analysis.json | 4 +- .../variant_calling/gens_preprocessing.rule | 56 ++++++++--------- BALSAMIC/utils/cli.py | 61 ++++++++----------- BALSAMIC/utils/io.py | 3 +- 6 files changed, 81 insertions(+), 79 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 4cf0d0a5a..24e54c108 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -3,6 +3,7 @@ import numpy as np import warnings + def calculate_log2_ratio(purity, log2_ratio, ploidy): # Ensure that the inputs are within valid ranges if not (0 <= purity <= 1): @@ -59,7 +60,9 @@ def create_gens_cov_file( log2_data = [] # Process CNVkit file - cnr_dict_list: list[dict] = read_csv(csv_path=normalised_coverage_path, delimeter="\t") + cnr_dict_list: list[dict] = read_csv( + csv_path=normalised_coverage_path, delimeter="\t" + ) # Process PureCN purity file purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",") @@ -70,18 +73,22 @@ def create_gens_cov_file( for row in cnr_dict_list: if row["gene"] == "Antitarget": continue - start = int(row["start"]) - end = int(row["end"]) - midpoint = start + int((end - start / 2)) - log2 = float(row["log2"]) - log2 = calculate_log2_ratio(purity, log2, ploidy) + start: int = int(row["start"]) + end: int = int(row["end"]) + midpoint: int = start + int((end - start / 2)) + log2: float = float(row["log2"]) + log2: float | None = calculate_log2_ratio(purity, log2, ploidy) if not log2: count_none_log2 += 1 - warnings.warn("Numerator is less than or equal to 0, returning None for region.") + warnings.warn( + "Numerator is less than or equal to 0, returning None for region." + ) continue - log2 = round(log2, 4) + log2: float = round(log2, 4) log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") - warnings.warn(f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}") + warnings.warn( + f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}" + ) # Write log2 data to output file with open(output_file, "w") as log2_out: diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index cf4277914..48c054ee5 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -132,12 +132,19 @@ def case_config( if cadd_annotations: references.update(cadd_annotations_path) - references = get_gens_references( + gens_references: dict[str, str] = get_gens_references( genome_interval=genome_interval, gens_coverage_pon=gens_coverage_pon, gnomad_min_af5=gnomad_min_af5, panel_bed=panel_bed, - references=references, + ) + # Update references dictionary with GENS reference-files + references.update( + { + gens_file: path + for gens_file, path in gens_references.items() + if path is not None + } ) variants_observations = { diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json index 10828dfcf..9a18e3381 100644 --- a/BALSAMIC/constants/cluster_analysis.json +++ b/BALSAMIC/constants/cluster_analysis.json @@ -11,7 +11,7 @@ "time": "00:15:00", "n": 1 }, - "gens_preprocessing": { + "gens_preprocess": { "time": "01:00:00", "n": 4 }, @@ -96,7 +96,7 @@ "time": "10:00:00", "n": 5 }, - "gatk_denoisereadcounts":{ + "gatk_denoise_read_counts":{ "time": "10:00:00", "n": 10 }, diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule index 466a8beb4..d5ee5dc80 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule @@ -1,7 +1,7 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -if config["analysis"]["sequencing_type"] == 'wgs': +if config["analysis"]["sequencing_type"] == SequencingType.WGS: rule sentieon_DNAscope_gnomad: input: ref = config["reference"]["reference_genome"], @@ -40,7 +40,7 @@ if config["analysis"]["sequencing_type"] == 'wgs': rm -rf {params.tmpdir}; """ - rule gatk_denoisereadcounts: + rule gatk_denoise_read_counts: input: gens_pon=config["reference"]["gens_coverage_pon"], readcounts_hdf5=cnv_dir + "{sample}.collectreadcounts.hdf5" @@ -51,11 +51,11 @@ if config["analysis"]["sequencing_type"] == 'wgs': tmpdir=tempfile.mkdtemp(prefix=tmp_dir), sample="{sample}" benchmark: - Path(benchmark_dir,"gatk_denoisereadcounts_{sample}.tsv").as_posix() + Path(benchmark_dir,"gatk_denoise_read_counts_{sample}.tsv").as_posix() singularity: Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix() threads: - get_threads(cluster_config,"gatk_denoisereadcounts") + get_threads(cluster_config,"gatk_denoise_read_counts") message: "Running GATK DenoiseReadCounts on {params.sample} for GENS." shell: @@ -72,7 +72,7 @@ if config["analysis"]["sequencing_type"] == 'wgs': rm -rf {params.tmpdir} """ - rule gens_preprocessing_wgs: + rule gens_preprocess_wgs: input: denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv", gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", @@ -80,19 +80,19 @@ if config["analysis"]["sequencing_type"] == 'wgs': gens_baf_bed = cnv_dir + "{sample}.baf.bed", gens_cov_bed = cnv_dir + "{sample}.cov.bed" params: - gens_preprocessing = get_script_path("preprocess_gens.py"), + gens_preprocess = get_script_path("preprocess_gens.py"), sequencing_type = sequencing_type, sample="{sample}" benchmark: - Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix() + Path(benchmark_dir, "gens_preprocess_wgs_{sample}.tsv").as_posix() threads: - get_threads(cluster_config, "gens_preprocessing") + get_threads(cluster_config, "gens_preprocess") message: "Formatting output for GENS for sample: {params.sample}." shell: """ - python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ; - python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr} + python {params.gens_preprocess} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ; + python {params.gens_preprocess} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr} """ else: rule sentieon_DNAscope_gnomad_tga: @@ -117,24 +117,24 @@ else: "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}" shell: """ - export TMPDIR={params.tmpdir}; - export SENTIEON_TMPDIR={params.tmpdir}; - export SENTIEON_LICENSE={params.sentieon_lic}; + export TMPDIR={params.tmpdir}; + export SENTIEON_TMPDIR={params.tmpdir}; + export SENTIEON_LICENSE={params.sentieon_lic}; - {params.sentieon_exec} driver \ - -t {threads} \ - -r {input.ref} \ - -i {input.bam} \ - --interval {input.bed} \ - --interval_padding 20 \ - --algo DNAscope \ - --pcr_indel_mode {params.pcr_model} \ - --given {input.gnomad_af5} {output.gnomad_af5_vcf}; + {params.sentieon_exec} driver \ + -t {threads} \ + -r {input.ref} \ + -i {input.bam} \ + --interval {input.bed} \ + --interval_padding 20 \ + --algo DNAscope \ + --pcr_indel_mode {params.pcr_model} \ + --given {input.gnomad_af5} {output.gnomad_af5_vcf}; - rm -rf {params.tmpdir}; + rm -rf {params.tmpdir}; """ - rule gens_preprocessing_tga: + rule gens_preprocess_tga: input: cnvkit_cnr = cnv_dir + "tumor.merged.cnr", gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz", @@ -144,18 +144,18 @@ else: gens_cov_bed = cnv_dir + "{sample}.cov.bed" params: sequencing_type = sequencing_type, - gens_preprocessing = get_script_path("preprocess_gens.py"), + gens_preprocess = get_script_path("preprocess_gens.py"), gens_preprocess_cnvkit = get_script_path("postprocess_gens_cnvkit.py"), sample="{sample}" benchmark: - Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix() + Path(benchmark_dir, "gens_preprocess_tga_{sample}.tsv").as_posix() threads: - get_threads(cluster_config, "gens_preprocessing") + get_threads(cluster_config, "gens_preprocess") message: "Formatting output for GENS for sample: {params.sample}." shell: """ - python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ; + python {params.gens_preprocess} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ; python {params.gens_preprocess_cnvkit} -o {output.gens_cov_bed} --normalised-coverage-path {input.cnvkit_cnr} --tumor-purity-path {input.purecn_purity_csv} """ diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index e27573019..1e0873dfb 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -220,50 +220,37 @@ def get_gens_references( gens_coverage_pon: Optional[str], gnomad_min_af5: Optional[str], panel_bed: Optional[str], - references: Dict[str, str], -) -> Dict: +) -> Dict[str, str]: """ - Assigns reference-files required for GENS if they have been supplied. + Assigns reference-files required for GENS if they have been supplied, else exists with error message. - :param genome_interval: Coverage-regions. (required for WGS GENS) - :param gens_coverage_pon: PON for GATK CollectReadCounts. (required for WGS GENS) - :param gnomad_min_af5: gnomad VCF filtered to keep variants above 5% VAF. (required for WGS and TGA GENS) - :param panel_bed: Bedfile supplied for TGA analyses. - :param references: Reference dictionary to be updated. + Args: + genome_interval: Optional[str]. Coverage-regions. (required for WGS GENS) + gens_coverage_pon: Optional[str]. PON for GATK CollectReadCounts. (required for WGS GENS) + gnomad_min_af5: Optional[str] gnomad VCF filtered to keep variants above 5% VAF (required for WGS and TGA GENS). + panel_bed: Optional[str] Bedfile supplied for TGA analyses. - :return: references: Updated reference dictionary. + Returns: + Dict[str, str] with paths to GENS reference files """ - gens_arguments = [genome_interval, gens_coverage_pon, gnomad_min_af5] - - # Check if any of the gens arguments are provided - if any(gens_arguments): - # Ensure all required arguments are provided or raise an error if panel_bed is not set - if not all(gens_arguments) and not panel_bed: - raise click.BadParameter( - "All three arguments (genome_interval, gens_coverage_pon, gnomad_min_af5) are required for GENS in WGS." - ) + if panel_bed and gnomad_min_af5: + return {"gnomad_min_af5": gnomad_min_af5} - # Construct the gens_ref_files dictionary - gens_ref_files = ( - {"gnomad_min_af5": gnomad_min_af5} - if panel_bed - else { - "genome_interval": genome_interval, - "gens_coverage_pon": gens_coverage_pon, - "gnomad_min_af5": gnomad_min_af5, - } - ) + if gnomad_min_af5 and genome_interval and gens_coverage_pon: + return { + "genome_interval": genome_interval, + "gens_coverage_pon": gens_coverage_pon, + "gnomad_min_af5": gnomad_min_af5, + } - # Update references dictionary with gens values - references.update( - { - gens_file: path - for gens_file, path in gens_ref_files.items() - if path is not None - } - ) - return references + error_message = ( + "GENS reference file is always required to run BALSAMIC." + "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5" + "TGA requires argument: gnomad_min_af5" + ) + LOG.error(error_message) + raise BalsamicError(error_message) def get_bioinfo_tools_version( diff --git a/BALSAMIC/utils/io.py b/BALSAMIC/utils/io.py index 555597a30..0b615265d 100644 --- a/BALSAMIC/utils/io.py +++ b/BALSAMIC/utils/io.py @@ -69,10 +69,11 @@ def write_json(json_obj: dict, path: str) -> None: def read_csv(csv_path: str, delimeter: str = ",") -> List[dict]: """Read data from a csv file.""" - with open(csv_path, mode='r') as csv_file: + with open(csv_path, mode="r") as csv_file: csv_reader = csv.DictReader(csv_file, delimiter=delimeter) return [row for row in csv_reader] + def read_yaml(yaml_path: str) -> dict: """Read data from a yaml file.""" if Path(yaml_path).exists(): From 9688e3df245c2d4b9a9c63d36b0946c871e8a3f4 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Thu, 1 Aug 2024 14:13:25 +0200 Subject: [PATCH 35/39] new purity and ploidy formula from PureCN --- .../assets/scripts/postprocess_gens_cnvkit.py | 35 ++++++++----------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 24e54c108..19b56331f 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -12,15 +12,10 @@ def calculate_log2_ratio(purity, log2_ratio, ploidy): if ploidy <= 0: raise ValueError("Ploidy must be a positive number") - # Calculate the log2 ratio - numerator = (1 - purity) + (purity * log2_ratio) + # Ploidy and purity adjustment formula reference to PureCN issue: https://github.com/lima1/PureCN/issues/40 + log2_adjusted = (purity*log2_ratio*ploidy + 2*(1-purity)*log2_ratio - 2*(1-purity)) / (purity*ploidy) - if numerator <= 0: - return None - - log2_ratio = np.log2(numerator / ploidy) - - return log2_ratio + return log2_adjusted @click.command() @@ -69,26 +64,24 @@ def create_gens_cov_file( purity = float(purecn_dict_list[0]["Purity"]) ploidy = float(purecn_dict_list[0]["Ploidy"]) - count_none_log2 = 0 for row in cnr_dict_list: if row["gene"] == "Antitarget": continue + + # find midpoint start: int = int(row["start"]) end: int = int(row["end"]) - midpoint: int = start + int((end - start / 2)) + region_size: int = end - start + midpoint: int = start + int(region_size / 2) + + # adjust log2 ratio log2: float = float(row["log2"]) - log2: float | None = calculate_log2_ratio(purity, log2, ploidy) - if not log2: - count_none_log2 += 1 - warnings.warn( - "Numerator is less than or equal to 0, returning None for region." - ) - continue - log2: float = round(log2, 4) + log2: float = calculate_log2_ratio(purity, log2, ploidy) + log2: float = round(log2, 4) + + # store values in list log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") - warnings.warn( - f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}" - ) + # Write log2 data to output file with open(output_file, "w") as log2_out: From 3abae1bf60a0f8ab69eae34f18d6172c82d6732d Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 2 Aug 2024 15:26:16 +0200 Subject: [PATCH 36/39] fix pytests --- BALSAMIC/commands/config/case.py | 29 ++--- BALSAMIC/utils/cli.py | 4 +- tests/commands/config/test_config_sample.py | 96 ++++++++++------ tests/conftest.py | 117 ++++++++++++++------ 4 files changed, 159 insertions(+), 87 deletions(-) diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index 48c054ee5..77839f68c 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -132,20 +132,21 @@ def case_config( if cadd_annotations: references.update(cadd_annotations_path) - gens_references: dict[str, str] = get_gens_references( - genome_interval=genome_interval, - gens_coverage_pon=gens_coverage_pon, - gnomad_min_af5=gnomad_min_af5, - panel_bed=panel_bed, - ) - # Update references dictionary with GENS reference-files - references.update( - { - gens_file: path - for gens_file, path in gens_references.items() - if path is not None - } - ) + if analysis_workflow is not AnalysisWorkflow.BALSAMIC_QC: + gens_references: dict[str, str] = get_gens_references( + genome_interval=genome_interval, + gens_coverage_pon=gens_coverage_pon, + gnomad_min_af5=gnomad_min_af5, + panel_bed=panel_bed, + ) + # Update references dictionary with GENS reference-files + references.update( + { + gens_file: path + for gens_file, path in gens_references.items() + if path is not None + } + ) variants_observations = { "clinical_snv_observations": clinical_snv_observations, diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index 1e0873dfb..cdd02e680 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -245,8 +245,8 @@ def get_gens_references( } error_message = ( - "GENS reference file is always required to run BALSAMIC." - "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5" + "GENS reference file is always required to run BALSAMIC.\n" + "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5\n" "TGA requires argument: gnomad_min_af5" ) LOG.error(error_message) diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py index 6296e5939..8e5456251 100644 --- a/tests/commands/config/test_config_sample.py +++ b/tests/commands/config/test_config_sample.py @@ -17,8 +17,7 @@ def test_tumor_normal_config( fastq_dir_tumor_normal_parameterize: str, balsamic_cache: str, panel_bed_file: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: list[str], ): """Test tumor normal balsamic config case command.""" @@ -44,11 +43,8 @@ def test_tumor_normal_config( tumor_sample_name, "--normal-sample-name", normal_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) # THEN a config should be created and exist @@ -66,8 +62,7 @@ def test_tumor_only_config( fastq_dir_tumor_only: str, balsamic_cache: str, panel_bed_file: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: list[str], ): """Test tumor only balsamic config case command.""" @@ -90,11 +85,8 @@ def test_tumor_only_config( balsamic_cache, "--tumor-sample-name", tumor_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) # THEN a config should be created and exist @@ -199,8 +191,7 @@ def test_tumor_only_umi_config_background_file( balsamic_cache: str, panel_bed_file: str, background_variant_file: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: list[str], ): """Test balsamic umi config case providing a background variants file.""" @@ -227,11 +218,8 @@ def test_tumor_only_umi_config_background_file( balsamic_cache, "--tumor-sample-name", tumor_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga ) # THEN program exits and checks for filepath @@ -250,6 +238,7 @@ def test_pon_cnn_file( case_id_tumor_only: str, sentieon_license: str, sentieon_install_dir: str, + config_case_cli_tga: list[str], ): """Test balsamic config case with a PON reference.""" @@ -274,11 +263,8 @@ def test_pon_cnn_file( balsamic_cache, "--tumor-sample-name", tumor_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) # THEN program exits and checks for filepath @@ -390,7 +376,7 @@ def test_config_graph_failed( assert case_result.exit_code == 1 -def test_missing_required_gens_arguments( +def test_missing_required_gens_arguments_wgs( invoke_cli, tumor_sample_name: str, analysis_dir: str, @@ -432,12 +418,54 @@ def test_missing_required_gens_arguments( ], ) # THEN the CLI should exit code 2 and display an informative error message - assert result.exit_code == 2 + assert result.exit_code == 1 assert ( - "All three arguments (genome_interval, gens_coverage_pon, gnomad_min_af5) are required for GENS in WGS." + "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5" in result.output ) +def test_missing_required_gens_arguments_tga( + invoke_cli, + tumor_sample_name: str, + analysis_dir: str, + balsamic_cache: str, + fastq_dir_tumor_only: str, + panel_bed_file: str, + case_id_tumor_only: str, + sentieon_license: str, + sentieon_install_dir: str, +): + """Test balsamic config case without required GENS arguments.""" + + # GIVEN CLI arguments including optional GENS input-files + + # WHEN invoking the config case command + result = invoke_cli( + [ + "config", + "case", + "--case-id", + case_id_tumor_only, + "--analysis-dir", + analysis_dir, + "--fastq-path", + fastq_dir_tumor_only, + "--balsamic-cache", + balsamic_cache, + "--tumor-sample-name", + tumor_sample_name, + "--sentieon-install-dir", + sentieon_install_dir, + "--sentieon-license", + sentieon_license, + ], + ) + # THEN the CLI should exit code 2 and display an informative error message + assert result.exit_code == 1 + assert ( + "TGA requires argument: gnomad_min_af5" + in result.output + ) def test_config_with_gens_arguments( invoke_cli, @@ -589,8 +617,7 @@ def test_config_tga_with_exome( fastq_dir_tumor_only: str, case_id_tumor_only: str, panel_bed_file: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: list[str], ): """Test balsamic config case with GENS arguments for TGA.""" @@ -614,11 +641,8 @@ def test_config_tga_with_exome( "-p", panel_bed_file, "--exome", - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) # THEN a config should be created and exist assert result.exit_code == 0 diff --git a/tests/conftest.py b/tests/conftest.py index f3141ae8d..59be9c902 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1232,7 +1232,7 @@ def balsamic_pon_model( @pytest.fixture(scope="session") -def config_case_cli( +def config_case_cli_wgs( balsamic_cache: str, background_variant_file: str, cadd_annotations: str, @@ -1245,6 +1245,9 @@ def config_case_cli( cancer_somatic_snv_observations_path: str, sentieon_license: str, sentieon_install_dir: str, + gens_hg19_interval_list: str, + gens_min_5_af_gnomad_file: str, + gens_cov_pon_file: str, ) -> List[str]: """Return common config case CLI.""" return [ @@ -1272,8 +1275,59 @@ def config_case_cli( sentieon_install_dir, "--sentieon-license", sentieon_license, + "--genome-interval", + gens_hg19_interval_list, + "--gnomad-min-af5", + gens_min_5_af_gnomad_file, + "--gens-coverage-pon", + gens_cov_pon_file, ] +@pytest.fixture(scope="session") +def config_case_cli_tga( + balsamic_cache: str, + background_variant_file: str, + cadd_annotations: str, + swegen_snv_frequency_path: str, + swegen_sv_frequency_path: str, + clinical_snv_observations_path: str, + clinical_sv_observations_path: str, + somatic_sv_observations_path: str, + cancer_germline_snv_observations_path: str, + cancer_somatic_snv_observations_path: str, + sentieon_license: str, + sentieon_install_dir: str, + gens_min_5_af_gnomad_file: str, +) -> List[str]: + """Return common config case CLI.""" + return [ + "--balsamic-cache", + balsamic_cache, + "--background-variants", + background_variant_file, + "--cadd-annotations", + cadd_annotations, + "--swegen-snv", + swegen_snv_frequency_path, + "--swegen-sv", + swegen_sv_frequency_path, + "--clinical-snv-observations", + clinical_snv_observations_path, + "--clinical-sv-observations", + clinical_sv_observations_path, + "--cancer-somatic-sv-observations", + somatic_sv_observations_path, + "--cancer-germline-snv-observations", + cancer_germline_snv_observations_path, + "--cancer-somatic-snv-observations", + cancer_somatic_snv_observations_path, + "--sentieon-install-dir", + sentieon_install_dir, + "--sentieon-license", + sentieon_license, + "--gnomad-min-af5", + gens_min_5_af_gnomad_file, + ] @pytest.fixture(scope="session") def tumor_only_config_qc( @@ -1282,7 +1336,7 @@ def tumor_only_config_qc( fastq_dir_tumor_only_qc: str, tumor_sample_name: str, panel_bed_file: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-only TGA.""" @@ -1305,7 +1359,7 @@ def tumor_only_config_qc( "-p", panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( @@ -1319,8 +1373,9 @@ def tumor_normal_config_qc( tumor_sample_name: str, normal_sample_name: str, analysis_dir: str, + panel_bed_file: str, fastq_dir_tumor_normal_qc: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA QC workflow.""" @@ -1342,8 +1397,10 @@ def tumor_normal_config_qc( tumor_sample_name, "--normal-sample-name", normal_sample_name, + "-p", + panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( @@ -1360,7 +1417,7 @@ def tumor_normal_config_qc_wgs( fastq_dir_tumor_normal_qc_wgs: str, tumor_sample_name: str, normal_sample_name: str, - config_case_cli: List[str], + config_case_cli_wgs: List[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-normal WGS QC workflow.""" @@ -1383,7 +1440,7 @@ def tumor_normal_config_qc_wgs( "--normal-sample-name", normal_sample_name, ] - + config_case_cli, + + config_case_cli_wgs, ) return Path( @@ -1400,7 +1457,7 @@ def tumor_only_config( analysis_dir: str, fastq_dir_tumor_only: str, panel_bed_file: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-only TGA.""" @@ -1421,7 +1478,7 @@ def tumor_only_config( "-p", panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( analysis_dir, @@ -1438,7 +1495,7 @@ def tumor_normal_config( analysis_dir: str, fastq_dir_tumor_normal: str, panel_bed_file: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA.""" @@ -1461,7 +1518,7 @@ def tumor_normal_config( "-p", panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( @@ -1478,7 +1535,7 @@ def tumor_only_umi_config( analysis_dir: str, fastq_dir_tumor_only: str, panel_bed_file: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-only TGA.""" @@ -1501,7 +1558,7 @@ def tumor_only_umi_config( "-p", panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( @@ -1519,7 +1576,7 @@ def tumor_normal_umi_config( analysis_dir: str, fastq_dir_tumor_normal: str, panel_bed_file: str, - config_case_cli: list[str], + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA.""" @@ -1546,7 +1603,7 @@ def tumor_normal_umi_config( "-p", panel_bed_file, ] - + config_case_cli, + + config_case_cli_tga, ) return Path( @@ -1562,7 +1619,7 @@ def tumor_only_wgs_config( analysis_dir: str, fastq_dir_tumor_only_wgs: str, tumor_sample_name: str, - config_case_cli: List[str], + config_case_cli_wgs: List[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-only WGS.""" @@ -1581,7 +1638,7 @@ def tumor_only_wgs_config( "--tumor-sample-name", tumor_sample_name, ] - + config_case_cli, + + config_case_cli_wgs, ) return Path( @@ -1598,7 +1655,7 @@ def tumor_normal_wgs_config( fastq_dir_tumor_normal_wgs: str, tumor_sample_name: str, normal_sample_name: str, - config_case_cli: str, + config_case_cli_wgs: List[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-normal WGS.""" @@ -1619,7 +1676,7 @@ def tumor_normal_wgs_config( "--normal-sample-name", normal_sample_name, ] - + config_case_cli, + + config_case_cli_wgs, ) return Path( @@ -1638,8 +1695,7 @@ def tumor_only_config_dummy_vep( fastq_dir_tumor_only_dummy_vep: str, panel_bed_file: str, background_variant_file: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: list[str], ) -> str: """Invoke balsamic config sample to create sample configuration file for tumor-only TGA with dummy VEP file.""" @@ -1663,11 +1719,8 @@ def tumor_only_config_dummy_vep( background_variant_file, "--tumor-sample-name", tumor_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga, ) return Path( analysis_dir, @@ -1685,8 +1738,7 @@ def tumor_only_pon_config( panel_bed_file: str, pon_cnn_path: str, balsamic_cache: str, - sentieon_license: str, - sentieon_install_dir: str, + config_case_cli_tga: List[str], ) -> str: """Invoke balsamic PON config sample to create sample configuration file for tumor-only TGA.""" @@ -1706,15 +1758,10 @@ def tumor_only_pon_config( panel_bed_file, "--pon-cnn", pon_cnn_path, - "--balsamic-cache", - balsamic_cache, "--tumor-sample-name", tumor_sample_name, - "--sentieon-install-dir", - sentieon_install_dir, - "--sentieon-license", - sentieon_license, - ], + ] + + config_case_cli_tga ) return Path( From ce9578685ad10f79eb340528700e8ffd5bbde059 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Fri, 2 Aug 2024 15:27:27 +0200 Subject: [PATCH 37/39] black --- BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 5 +++-- tests/commands/config/test_config_sample.py | 7 +++---- tests/conftest.py | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index 19b56331f..d266465c3 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -13,7 +13,9 @@ def calculate_log2_ratio(purity, log2_ratio, ploidy): raise ValueError("Ploidy must be a positive number") # Ploidy and purity adjustment formula reference to PureCN issue: https://github.com/lima1/PureCN/issues/40 - log2_adjusted = (purity*log2_ratio*ploidy + 2*(1-purity)*log2_ratio - 2*(1-purity)) / (purity*ploidy) + log2_adjusted = ( + purity * log2_ratio * ploidy + 2 * (1 - purity) * log2_ratio - 2 * (1 - purity) + ) / (purity * ploidy) return log2_adjusted @@ -82,7 +84,6 @@ def create_gens_cov_file( # store values in list log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") - # Write log2 data to output file with open(output_file, "w") as log2_out: for resolution in ["o", "a", "b", "c", "d"]: diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py index 8e5456251..c4abfed93 100644 --- a/tests/commands/config/test_config_sample.py +++ b/tests/commands/config/test_config_sample.py @@ -424,6 +424,7 @@ def test_missing_required_gens_arguments_wgs( in result.output ) + def test_missing_required_gens_arguments_tga( invoke_cli, tumor_sample_name: str, @@ -462,10 +463,8 @@ def test_missing_required_gens_arguments_tga( ) # THEN the CLI should exit code 2 and display an informative error message assert result.exit_code == 1 - assert ( - "TGA requires argument: gnomad_min_af5" - in result.output - ) + assert "TGA requires argument: gnomad_min_af5" in result.output + def test_config_with_gens_arguments( invoke_cli, diff --git a/tests/conftest.py b/tests/conftest.py index 59be9c902..711a1c61b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1283,6 +1283,7 @@ def config_case_cli_wgs( gens_cov_pon_file, ] + @pytest.fixture(scope="session") def config_case_cli_tga( balsamic_cache: str, @@ -1329,6 +1330,7 @@ def config_case_cli_tga( gens_min_5_af_gnomad_file, ] + @pytest.fixture(scope="session") def tumor_only_config_qc( case_id_tumor_only_qc: str, @@ -1761,7 +1763,7 @@ def tumor_only_pon_config( "--tumor-sample-name", tumor_sample_name, ] - + config_case_cli_tga + + config_case_cli_tga, ) return Path( From 6030b877bf7de313e35ade56908b6f351c11be03 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 13 Aug 2024 13:36:19 +0200 Subject: [PATCH 38/39] add io test --- .../assets/scripts/postprocess_gens_cnvkit.py | 3 --- BALSAMIC/commands/config/case.py | 17 ++++++++------- BALSAMIC/utils/cli.py | 21 ++++++++----------- tests/commands/config/test_config_sample.py | 9 ++++---- tests/utils/test_utils.py | 15 +++++++++++++ 5 files changed, 37 insertions(+), 28 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index d266465c3..a50e354d0 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -1,8 +1,5 @@ import click from BALSAMIC.utils.io import read_csv -import numpy as np -import warnings - def calculate_log2_ratio(purity, log2_ratio, ploidy): # Ensure that the inputs are within valid ranges diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py index 77839f68c..0c001b1a7 100644 --- a/BALSAMIC/commands/config/case.py +++ b/BALSAMIC/commands/config/case.py @@ -139,14 +139,15 @@ def case_config( gnomad_min_af5=gnomad_min_af5, panel_bed=panel_bed, ) - # Update references dictionary with GENS reference-files - references.update( - { - gens_file: path - for gens_file, path in gens_references.items() - if path is not None - } - ) + if gens_references: + # Update references dictionary with GENS reference-files + references.update( + { + gens_file: path + for gens_file, path in gens_references.items() + if path is not None + } + ) variants_observations = { "clinical_snv_observations": clinical_snv_observations, diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py index cdd02e680..a146fa4d7 100644 --- a/BALSAMIC/utils/cli.py +++ b/BALSAMIC/utils/cli.py @@ -220,18 +220,18 @@ def get_gens_references( gens_coverage_pon: Optional[str], gnomad_min_af5: Optional[str], panel_bed: Optional[str], -) -> Dict[str, str]: +) -> Dict[str, str] | None: """ Assigns reference-files required for GENS if they have been supplied, else exists with error message. Args: - genome_interval: Optional[str]. Coverage-regions. (required for WGS GENS) - gens_coverage_pon: Optional[str]. PON for GATK CollectReadCounts. (required for WGS GENS) + genome_interval: Optional[str] Coverage-regions. (required for WGS GENS) + gens_coverage_pon: Optional[str] PON for GATK CollectReadCounts. (required for WGS GENS) gnomad_min_af5: Optional[str] gnomad VCF filtered to keep variants above 5% VAF (required for WGS and TGA GENS). panel_bed: Optional[str] Bedfile supplied for TGA analyses. Returns: - Dict[str, str] with paths to GENS reference files + Dict[str, str] with paths to GENS reference files or None """ if panel_bed and gnomad_min_af5: @@ -243,14 +243,11 @@ def get_gens_references( "gens_coverage_pon": gens_coverage_pon, "gnomad_min_af5": gnomad_min_af5, } - - error_message = ( - "GENS reference file is always required to run BALSAMIC.\n" - "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5\n" - "TGA requires argument: gnomad_min_af5" - ) - LOG.error(error_message) - raise BalsamicError(error_message) + if any([gnomad_min_af5, genome_interval, gens_coverage_pon]): + error_message = "GENS for WGS requires all arguments: genome_interval, gens_coverage_pon, gnomad_min_af5" + LOG.error(error_message) + raise BalsamicError(error_message) + return None def get_bioinfo_tools_version( diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py index c4abfed93..aa76ca6a2 100644 --- a/tests/commands/config/test_config_sample.py +++ b/tests/commands/config/test_config_sample.py @@ -420,12 +420,12 @@ def test_missing_required_gens_arguments_wgs( # THEN the CLI should exit code 2 and display an informative error message assert result.exit_code == 1 assert ( - "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5" + "GENS for WGS requires all arguments: genome_interval, gens_coverage_pon, gnomad_min_af5" in result.output ) -def test_missing_required_gens_arguments_tga( +def test_missing_gens_arguments_tga( invoke_cli, tumor_sample_name: str, analysis_dir: str, @@ -461,9 +461,8 @@ def test_missing_required_gens_arguments_tga( sentieon_license, ], ) - # THEN the CLI should exit code 2 and display an informative error message - assert result.exit_code == 1 - assert "TGA requires argument: gnomad_min_af5" in result.output + # THEN the CLI should exit code 0 + assert result.exit_code == 0 def test_config_with_gens_arguments( diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index eb0d7fbe6..0cd721f78 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -46,6 +46,7 @@ from BALSAMIC.utils.exc import BalsamicError, WorkflowRunError from BALSAMIC.utils.io import ( read_json, + read_csv, read_vcf_file, read_yaml, write_finish_file, @@ -479,6 +480,20 @@ def test_read_json(config_path: str): assert type(config_dict) is dict +def test_read_csv(purity_csv_path: str): + """Test data extraction from a CSV file.""" + + # GIVEN a CSV path + + # WHEN calling the function + csv_list: list[Dict] = read_csv(purity_csv_path) + + # THEN the config.json file should be correctly parsed + assert len(csv_list) == 1 + assert csv_list[0]["Purity"] == "0.64" + assert csv_list[0]["Sampleid"] == "tumor.initial" + + def test_read_json_error(): """Test data extraction from a BALSAMIC config JSON file for an invalid path.""" From 604f0596ad111811595505349b3d8090454f7087 Mon Sep 17 00:00:00 2001 From: Mathias Johansson Date: Tue, 13 Aug 2024 14:50:24 +0200 Subject: [PATCH 39/39] add new test --- .../assets/scripts/postprocess_gens_cnvkit.py | 9 ++++----- BALSAMIC/utils/io.py | 5 +++++ tests/utils/test_utils.py | 20 +++++++++++++++++++ 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py index a50e354d0..2899091ff 100644 --- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py +++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py @@ -1,5 +1,5 @@ import click -from BALSAMIC.utils.io import read_csv +from BALSAMIC.utils.io import read_csv, write_list_of_strings def calculate_log2_ratio(purity, log2_ratio, ploidy): # Ensure that the inputs are within valid ranges @@ -82,10 +82,9 @@ def create_gens_cov_file( log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}") # Write log2 data to output file - with open(output_file, "w") as log2_out: - for resolution in ["o", "a", "b", "c", "d"]: - for line in log2_data: - log2_out.write(f"{resolution}_{line}\n") + resolutions = ["o", "a", "b", "c", "d"] + resolution_log2_lines = [f"{resolution}_{line}" for resolution in resolutions for line in log2_data] + write_list_of_strings(resolution_log2_lines, output_file) if __name__ == "__main__": diff --git a/BALSAMIC/utils/io.py b/BALSAMIC/utils/io.py index 0b615265d..315899f93 100644 --- a/BALSAMIC/utils/io.py +++ b/BALSAMIC/utils/io.py @@ -73,6 +73,11 @@ def read_csv(csv_path: str, delimeter: str = ",") -> List[dict]: csv_reader = csv.DictReader(csv_file, delimiter=delimeter) return [row for row in csv_reader] +def write_list_of_strings(list_of_strings: list[str], output_file: str): + """Writes a list of strings to a file, each on a new line.""" + with open(output_file, 'w') as file: + for string in list_of_strings: + file.write(f"{string}\n") def read_yaml(yaml_path: str) -> dict: """Read data from a yaml file.""" diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 0cd721f78..503134c2e 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -49,6 +49,7 @@ read_csv, read_vcf_file, read_yaml, + write_list_of_strings, write_finish_file, write_json, write_sacct_to_yaml, @@ -455,6 +456,25 @@ def test_write_json(tmp_path, reference): assert len(list(tmp.iterdir())) == 1 +def test_write_list_of_strings(tmp_path): + """Test writing list of strings to file""" + + # GIVEN a list of strings and an output file path + list_of_strings = [f"header1\theader2\theader3", f"row1_col1\trow1_col2\trow1_col3"] + tmp = tmp_path / "tmp" + tmp.mkdir() + output_file: Path = Path(tmp / "output.csv") + + # WHEN writing list of strings + write_list_of_strings(list_of_strings, output_file.as_posix()) + + # THEN file should have been written + assert output_file.exists() + + # AND contain the same information + read_written_file = read_csv(csv_path=output_file, delimeter="\t") + assert read_written_file == [{"header1": "row1_col1", "header2": "row1_col2", "header3": "row1_col3"}] + assert len(read_written_file) == 1 def test_write_json_error(tmp_path: Path): """Test JSON write error."""