From e6516d61405881affc561151fedff1fcbc3d0f16 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 14 Jun 2024 17:58:20 +0200
Subject: [PATCH 01/39] add gens functionality to tga

---
 .../assets/scripts/postprocess_gens_cnvkit.py |  36 +++
 BALSAMIC/assets/scripts/preprocess_gens.py    |   2 +-
 BALSAMIC/constants/rules.py                   |   2 +
 BALSAMIC/constants/tools.py                   |   3 +
 .../variant_calling/gens_preprocessing.rule   | 235 +++++++++++-------
 5 files changed, 191 insertions(+), 87 deletions(-)
 create mode 100644 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
new file mode 100644
index 000000000..8c95c496c
--- /dev/null
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -0,0 +1,36 @@
+import click
+import pandas as pd
+
+@click.command()
+@click.option(
+    "-o",
+    "--output-file",
+    required=True,
+    type=click.Path(exists=False),
+    help="Name of output-file.",
+)
+@click.option(
+    "-c",
+    "--normalised-coverage-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Input CNVkit cnr result.",
+)
+def create_gens_cov_file(output_file, ormalised_coverage_path):
+    # Process CNVkit file
+    log2_data = []
+    cnvkit_df = pd.read_csv(ormalised_coverage_path, sep='\t')
+    for index, row in cnvkit_df.iterrows():
+        if row['gene'] == "Antitarget":
+            continue
+        midpoint = row['start'] + int((row['end'] - row['start']) / 2)
+        log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{row['log2']}")
+
+    # Write log2 data to output file
+    with open(output_file, 'w') as log2_out:
+        for resolution in ['o', 'a', 'b', 'c', 'd']:
+            for line in log2_data:
+                log2_out.write(f"{resolution}_{line}\n")
+
+if __name__ == '__main__':
+    create_gens_cov_file()
diff --git a/BALSAMIC/assets/scripts/preprocess_gens.py b/BALSAMIC/assets/scripts/preprocess_gens.py
index 0e146a3f1..e3c07a5e0 100755
--- a/BALSAMIC/assets/scripts/preprocess_gens.py
+++ b/BALSAMIC/assets/scripts/preprocess_gens.py
@@ -26,7 +26,7 @@
     "-s",
     "--sequencing-type",
     required=True,
-    type=click.Choice([SequencingType.WGS]),
+    type=click.Choice([SequencingType.WGS, SequencingType.TARGETED]),
     help="Sequencing type used.",
 )
 @click.pass_context
diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py
index 947dbac72..59cdab9f2 100644
--- a/BALSAMIC/constants/rules.py
+++ b/BALSAMIC/constants/rules.py
@@ -77,6 +77,7 @@
             "snakemake_rules/align/postprocess_bam.rule",
         ],
         "varcall": [
+            "snakemake_rules/variant_calling/gens_preprocessing.rule",
             "snakemake_rules/variant_calling/germline.rule",
             "snakemake_rules/variant_calling/split_bed.rule",
             "snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule",
@@ -108,6 +109,7 @@
             "snakemake_rules/align/postprocess_bam.rule",
         ],
         "varcall": [
+            "snakemake_rules/variant_calling/gens_preprocessing.rule",
             "snakemake_rules/variant_calling/germline.rule",
             "snakemake_rules/variant_calling/split_bed.rule",
             "snakemake_rules/variant_calling/somatic_tumor_normal.rule",
diff --git a/BALSAMIC/constants/tools.py b/BALSAMIC/constants/tools.py
index 197650df4..71ff8ab65 100644
--- a/BALSAMIC/constants/tools.py
+++ b/BALSAMIC/constants/tools.py
@@ -40,6 +40,9 @@
                 "c": 1000,
                 "d": 100,
             },
+        },
+        SequencingType.TARGETED: {
+            "BAF_SKIP_N": {"o": 0, "a": 0, "b": 0, "c": 0, "d": 0},
         }
     },
 }
diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
index e2b32534f..73f70f551 100644
--- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
+++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
@@ -1,98 +1,161 @@
 # vim: syntax=python tabstop=4 expandtab
 # coding: utf-8
 
-rule sentieon_DNAscope_gnomad:
-    input:
-        ref = config["reference"]["reference_genome"],
-        gnomad_af5= config["reference"]["gnomad_min_af5"],
-        bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample)
-    output:
-        gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz",
-    params:
-        tmpdir = tempfile.mkdtemp(prefix=tmp_dir),
-        pcr_model = params.common.pcr_model,
-        sentieon_exec = config["SENTIEON_EXEC"],
-        sentieon_lic = config["SENTIEON_LICENSE"],
-        sentieon_ml_dnascope = config["SENTIEON_DNASCOPE"],
-        sample = "{sample}"
-    benchmark:
-        Path(benchmark_dir, "sentieon_DNAscope_gnomad_{sample}.tsv").as_posix()
-    threads:
-        get_threads(cluster_config, "sentieon_DNAscope_gnomad")
-    message:
-        "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}"
-    shell:
-      """
-export TMPDIR={params.tmpdir};
-export SENTIEON_TMPDIR={params.tmpdir};
-export SENTIEON_LICENSE={params.sentieon_lic};
-export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope};
+if config["analysis"]["sequencing_type"] == 'wgs':
+    rule sentieon_DNAscope_gnomad:
+        input:
+            ref=config["reference"]["reference_genome"],
+            gnomad_af5=config["reference"]["gnomad_min_af5"],
+            bam=lambda wildcards: config_model.get_final_bam_name(bam_dir=bam_dir,sample_name=wildcards.sample)
+        output:
+            gnomad_af5_vcf=cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz",
+        params:
+            tmpdir=tempfile.mkdtemp(prefix=tmp_dir),
+            pcr_model=params.common.pcr_model,
+            sentieon_exec=config["SENTIEON_EXEC"],
+            sentieon_lic=config["SENTIEON_LICENSE"],
+            sample="{sample}"
+        benchmark:
+            Path(benchmark_dir,"sentieon_DNAscope_gnomad_{sample}.tsv").as_posix()
+        threads:
+            get_threads(cluster_config,"sentieon_DNAscope_gnomad")
+        message:
+            "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}"
+        shell:
+            """
+      export TMPDIR={params.tmpdir};
+      export SENTIEON_TMPDIR={params.tmpdir};
+      export SENTIEON_LICENSE={params.sentieon_lic};
+      export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope};
+  
+      {params.sentieon_exec} driver \
+      -t {threads} \
+      -r {input.ref} \
+      -i {input.bam} \
+      --algo DNAscope \
+      --pcr_indel_mode {params.pcr_model} \
+      --given {input.gnomad_af5} {output.gnomad_af5_vcf};
+  
+      rm -rf {params.tmpdir};
+            """
 
-{params.sentieon_exec} driver \
--t {threads} \
--r {input.ref} \
--i {input.bam} \
---algo DNAscope \
---pcr_indel_mode {params.pcr_model} \
---given {input.gnomad_af5} {output.gnomad_af5_vcf};
+    rule gatk_denoisereadcounts:
+        input:
+            gens_pon=config["reference"]["gens_coverage_pon"],
+            readcounts_hdf5=cnv_dir + "{sample}.collectreadcounts.hdf5"
+        output:
+            denoised_cr=cnv_dir + "{sample}.denoisedCR.tsv",
+            standardized_cr=cnv_dir + "{sample}.standardizedCR.tsv"
+        params:
+            tmpdir=tempfile.mkdtemp(prefix=tmp_dir),
+            sample="{sample}"
+        benchmark:
+            Path(benchmark_dir,"gatk_denoisereadcounts_{sample}.tsv").as_posix()
+        singularity:
+            Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix()
+        threads:
+            get_threads(cluster_config,"gatk_denoisereadcounts")
+        message:
+            "Running GATK DenoiseReadCounts on {params.sample} for GENS."
+        shell:
+            """
+    export TMPDIR={params.tmpdir};  
 
-rm -rf {params.tmpdir};
-      """
+    gatk --java-options "-Xmx60g" DenoiseReadCounts  \
+    -I {input.readcounts_hdf5} \
+    --count-panel-of-normals {input.gens_pon} \
+    --tmp-dir {params.tmpdir} \
+    --standardized-copy-ratios {output.standardized_cr} \
+    --denoised-copy-ratios {output.denoised_cr}
 
-rule gatk_denoisereadcounts:
-    input:
-        gens_pon = config["reference"]["gens_coverage_pon"],
-        readcounts_hdf5 = cnv_dir + "{sample}.collectreadcounts.hdf5"
-    output:
-        denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv",
-        standardized_cr = cnv_dir + "{sample}.standardizedCR.tsv"
-    params:
-        tmpdir=tempfile.mkdtemp(prefix=tmp_dir),
-        sample="{sample}"
-    benchmark:
-        Path(benchmark_dir, "gatk_denoisereadcounts_{sample}.tsv").as_posix()
-    singularity:
-        Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix()
-    threads:
-        get_threads(cluster_config,"gatk_denoisereadcounts")
-    message:
-        "Running GATK DenoiseReadCounts on {params.sample} for GENS."
-    shell:
-        """
-export TMPDIR={params.tmpdir};  
+    rm -rf {params.tmpdir}
+            """
+
+    rule gens_preprocessing_wgs:
+        input:
+            denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv",
+            gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz",
+        output:
+            gens_baf_bed = cnv_dir + "{sample}.baf.bed",
+            gens_cov_bed = cnv_dir + "{sample}.cov.bed"
+        params:
+            gens_preprocessing = get_script_path("preprocess_gens.py"),
+            sequencing_type = sequencing_type,
+            sample="{sample}"
+        benchmark:
+            Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix()
+        threads:
+            get_threads(cluster_config, "gens_preprocessing")
+        message:
+            "Formatting output for GENS for sample: {params.sample}."
+        shell:
+            """
+    python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf}
+    python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr}
+            """
+else:
+    rule sentieon_DNAscope_gnomad_tga:
+        input:
+            bed=config["panel"]["capture_kit"],
+            ref=config["reference"]["reference_genome"],
+            gnomad_af5=config["reference"]["gnomad_min_af5"],
+            bam=lambda wildcards: config_model.get_final_bam_name(bam_dir=bam_dir,sample_name=wildcards.sample)
+        output:
+            gnomad_af5_vcf=cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz",
+        params:
+            tmpdir=tempfile.mkdtemp(prefix=tmp_dir),
+            pcr_model=params.common.pcr_model,
+            sentieon_exec=config["SENTIEON_EXEC"],
+            sentieon_lic=config["SENTIEON_LICENSE"],
+            sample="{sample}"
+        benchmark:
+            Path(benchmark_dir,"sentieon_DNAscope_gnomad_{sample}.tsv").as_posix()
+        threads:
+            get_threads(cluster_config,"sentieon_DNAscope_gnomad")
+        message:
+            "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}"
+        shell:
+            """
+      export TMPDIR={params.tmpdir};
+      export SENTIEON_TMPDIR={params.tmpdir};
+      export SENTIEON_LICENSE={params.sentieon_lic};
+      export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope};
   
-gatk --java-options "-Xmx60g" DenoiseReadCounts  \
--I {input.readcounts_hdf5} \
---count-panel-of-normals {input.gens_pon} \
---tmp-dir {params.tmpdir} \
---standardized-copy-ratios {output.standardized_cr} \
---denoised-copy-ratios {output.denoised_cr}
+      {params.sentieon_exec} driver \
+      -t {threads} \
+      -r {input.ref} \
+      -i {input.bam} \
+      --interval {input.bed} \
+      --interval_padding 100 \
+      --algo DNAscope \
+      --pcr_indel_mode {params.pcr_model} \
+      --given {input.gnomad_af5} {output.gnomad_af5_vcf};
   
-rm -rf {params.tmpdir}
-        """
+      rm -rf {params.tmpdir};
+            """
 
-rule gens_preprocessing:
-    input:
-        denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv",
-        gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz",
-    output:
-        gens_baf_bed = cnv_dir + "{sample}.baf.bed",
-        gens_cov_bed = cnv_dir + "{sample}.cov.bed"
-    params:
-        gens_preprocessing = get_script_path("preprocess_gens.py"),
-        sequencing_type = sequencing_type,
-        sample="{sample}"
-    benchmark:
-        Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix()
-    threads:
-        get_threads(cluster_config, "gens_preprocessing")
-    message:
-        "Formatting output for GENS for sample: {params.sample}."
-    shell:
-        """
-python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf}
-python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr}
-        """
+    rule gens_preprocessing_tga:
+        input:
+            cnvkit_cnr = cnv_dir + "tumor.merged.cnr",
+            gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz",
+        output:
+            gens_baf_bed = cnv_dir + "{sample}.baf.bed",
+            gens_cov_bed = cnv_dir + "{sample}.cov.bed"
+        params:
+            gens_preprocessing = get_script_path("preprocess_gens.py"),
+            gens_preprocess_cnvkit = get_script_path("postprocess_gens_cnvkit.py"),
+            sample="{sample}"
+        benchmark:
+            Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix()
+        threads:
+            get_threads(cluster_config, "gens_preprocessing")
+        message:
+            "Formatting output for GENS for sample: {params.sample}."
+        shell:
+            """
+    python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf}
+    python {params.gens_preprocess_cnvkit} -o {output.gens_cov_bed} --normalised-coverage-path {input.cnvkit_cnr}
+            """
 
 rule finalize_gens_outputfiles:
     input:

From 7f6e521fc7fdc5235f9fb43891112b9ba18aaaff Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 14 Jun 2024 18:01:24 +0200
Subject: [PATCH 02/39] add gens inputs

---
 BALSAMIC/workflows/balsamic.smk | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk
index 49a5f34e0..25226cc49 100644
--- a/BALSAMIC/workflows/balsamic.smk
+++ b/BALSAMIC/workflows/balsamic.smk
@@ -9,7 +9,7 @@ from pathlib import Path
 from typing import Dict, List
 
 from BALSAMIC.constants.constants import FileType
-from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType
+from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType, SequencingType
 from BALSAMIC.constants.paths import BALSAMIC_DIR, SENTIEON_DNASCOPE_DIR, SENTIEON_TNSCOPE_DIR
 from BALSAMIC.constants.rules import SNAKEMAKE_RULES
 from BALSAMIC.constants.variant_filters import (
@@ -522,10 +522,15 @@ if config['analysis']['analysis_type'] == "single":
     )
 
 # GENS Outputs
-if config["analysis"]["sequencing_type"] == "wgs" and "gens_coverage_pon" in config["reference"]:
+if config["analysis"]["sequencing_type"] == SequencingType.WGS and "gens_coverage_pon" in config["reference"]:
     analysis_specific_results.extend(
         expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"])
     )
+if config["analysis"]["sequencing_type"] == SequencingType.TARGETED:
+    analysis_specific_results.extend(
+        expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"])
+    )
+
 
 # Dragen
 if config["analysis"]["sequencing_type"] == "wgs" and config['analysis']['analysis_type'] == "single":

From 24e41a9d33a37038b78a2f0064f869bbe3beee14 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 14 Jun 2024 18:56:08 +0200
Subject: [PATCH 03/39] add gens gnomad af to tga

---
 .../assets/scripts/postprocess_gens_cnvkit.py | 18 ++++++----
 BALSAMIC/commands/config/case.py              | 25 ++-----------
 BALSAMIC/constants/tools.py                   |  2 +-
 BALSAMIC/utils/cli.py                         | 36 +++++++++++++++++++
 BALSAMIC/workflows/balsamic.smk               |  2 +-
 5 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 8c95c496c..cc387202f 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -1,6 +1,7 @@
 import click
 import pandas as pd
 
+
 @click.command()
 @click.option(
     "-o",
@@ -19,18 +20,21 @@
 def create_gens_cov_file(output_file, ormalised_coverage_path):
     # Process CNVkit file
     log2_data = []
-    cnvkit_df = pd.read_csv(ormalised_coverage_path, sep='\t')
+    cnvkit_df = pd.read_csv(ormalised_coverage_path, sep="\t")
     for index, row in cnvkit_df.iterrows():
-        if row['gene'] == "Antitarget":
+        if row["gene"] == "Antitarget":
             continue
-        midpoint = row['start'] + int((row['end'] - row['start']) / 2)
-        log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{row['log2']}")
+        midpoint = row["start"] + int((row["end"] - row["start"]) / 2)
+        log2_data.append(
+            f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{row['log2']}"
+        )
 
     # Write log2 data to output file
-    with open(output_file, 'w') as log2_out:
-        for resolution in ['o', 'a', 'b', 'c', 'd']:
+    with open(output_file, "w") as log2_out:
+        for resolution in ["o", "a", "b", "c", "d"]:
             for line in log2_data:
                 log2_out.write(f"{resolution}_{line}\n")
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     create_gens_cov_file()
diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py
index 0c4444bbc..2e038762e 100644
--- a/BALSAMIC/commands/config/case.py
+++ b/BALSAMIC/commands/config/case.py
@@ -50,6 +50,7 @@
     get_bioinfo_tools_version,
     get_panel_chrom,
     get_sample_list,
+    get_gens_references,
 )
 from BALSAMIC.utils.io import read_json, write_json
 from BALSAMIC.utils.utils import get_absolute_paths_dict
@@ -129,29 +130,7 @@ def case_config(
     if cadd_annotations:
         references.update(cadd_annotations_path)
 
-    if any([genome_interval, gens_coverage_pon, gnomad_min_af5]):
-        if panel_bed:
-            raise click.BadParameter(
-                "GENS is currently not compatible with TGA analysis, only WGS."
-            )
-        if not all([genome_interval, gens_coverage_pon, gnomad_min_af5]):
-            raise click.BadParameter(
-                "All three arguments (genome_interval gens_coverage_pon, gnomad_min_af5) are required for GENS."
-            )
-
-        gens_ref_files = {
-            "genome_interval": genome_interval,
-            "gens_coverage_pon": gens_coverage_pon,
-            "gnomad_min_af5": gnomad_min_af5,
-        }
-
-        references.update(
-            {
-                gens_file: path
-                for gens_file, path in gens_ref_files.items()
-                if path is not None
-            }
-        )
+    references = get_gens_references(genome_interval, gens_coverage_pon, gnomad_min_af5, panel_bed, references)
 
     variants_observations = {
         "clinical_snv_observations": clinical_snv_observations,
diff --git a/BALSAMIC/constants/tools.py b/BALSAMIC/constants/tools.py
index 71ff8ab65..0fe61fc4a 100644
--- a/BALSAMIC/constants/tools.py
+++ b/BALSAMIC/constants/tools.py
@@ -43,6 +43,6 @@
         },
         SequencingType.TARGETED: {
             "BAF_SKIP_N": {"o": 0, "a": 0, "b": 0, "c": 0, "d": 0},
-        }
+        },
     },
 }
diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py
index 1fd903ac5..c4580454e 100644
--- a/BALSAMIC/utils/cli.py
+++ b/BALSAMIC/utils/cli.py
@@ -215,6 +215,42 @@ def bioinfo_tool_version_conda(
     return conda_bioinfo_version
 
 
+def get_gens_references(
+    genome_interval: Optional[str],
+    gens_coverage_pon: Optional[str],
+    gnomad_min_af5: Optional[str],
+    panel_bed: Optional[str],
+    references: Dict[str, str],
+) -> Dict:
+
+    gens_arguments = [genome_interval, gens_coverage_pon, gnomad_min_af5]
+
+    # Check if any of the gens arguments are provided
+    if any(gens_arguments):
+        # Ensure all required arguments are provided or raise an error if panel_bed is not set
+        if not all(gens_arguments) and not panel_bed:
+            raise click.BadParameter(
+                "All three arguments (genome_interval, gens_coverage_pon, gnomad_min_af5) are required for GENS in WGS."
+            )
+
+        # Construct the gens_ref_files dictionary
+        gens_ref_files = (
+            {"gnomad_min_af5": gnomad_min_af5}
+            if panel_bed
+            else {
+                "genome_interval": genome_interval,
+                "gens_coverage_pon": gens_coverage_pon,
+                "gnomad_min_af5": gnomad_min_af5,
+            }
+        )
+
+        # Update references dictionary with gens values
+        references.update(
+            {gens_file: path for gens_file, path in gens_ref_files.items() if path}
+        )
+        return references
+
+
 def get_bioinfo_tools_version(
     bioinfo_tools: dict, container_conda_env_path: Path
 ) -> dict:
diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk
index 25226cc49..4014d8510 100644
--- a/BALSAMIC/workflows/balsamic.smk
+++ b/BALSAMIC/workflows/balsamic.smk
@@ -526,7 +526,7 @@ if config["analysis"]["sequencing_type"] == SequencingType.WGS and "gens_coverag
     analysis_specific_results.extend(
         expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"])
     )
-if config["analysis"]["sequencing_type"] == SequencingType.TARGETED:
+if config["analysis"]["sequencing_type"] == SequencingType.TARGETED and "gnomad_min_af5" in config["reference"]:
     analysis_specific_results.extend(
         expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"])
     )

From cdbe43fce0404ba4964169dde0bbf85c73ec9f8b Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 14 Jun 2024 18:56:17 +0200
Subject: [PATCH 04/39] black

---
 BALSAMIC/commands/config/case.py | 4 +++-
 BALSAMIC/utils/cli.py            | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py
index 2e038762e..081de6265 100644
--- a/BALSAMIC/commands/config/case.py
+++ b/BALSAMIC/commands/config/case.py
@@ -130,7 +130,9 @@ def case_config(
     if cadd_annotations:
         references.update(cadd_annotations_path)
 
-    references = get_gens_references(genome_interval, gens_coverage_pon, gnomad_min_af5, panel_bed, references)
+    references = get_gens_references(
+        genome_interval, gens_coverage_pon, gnomad_min_af5, panel_bed, references
+    )
 
     variants_observations = {
         "clinical_snv_observations": clinical_snv_observations,
diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py
index c4580454e..47ef70c29 100644
--- a/BALSAMIC/utils/cli.py
+++ b/BALSAMIC/utils/cli.py
@@ -222,7 +222,6 @@ def get_gens_references(
     panel_bed: Optional[str],
     references: Dict[str, str],
 ) -> Dict:
-
     gens_arguments = [genome_interval, gens_coverage_pon, gnomad_min_af5]
 
     # Check if any of the gens arguments are provided

From f262919814253facba4da62af034e07f612164fc Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 14 Jun 2024 19:07:39 +0200
Subject: [PATCH 05/39] fix

---
 BALSAMIC/utils/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py
index 47ef70c29..2fdf20902 100644
--- a/BALSAMIC/utils/cli.py
+++ b/BALSAMIC/utils/cli.py
@@ -245,7 +245,7 @@ def get_gens_references(
 
         # Update references dictionary with gens values
         references.update(
-            {gens_file: path for gens_file, path in gens_ref_files.items() if path}
+            {gens_file: path for gens_file, path in gens_ref_files.items() if path is not None}
         )
         return references
 

From c2aafb9af6a0427d74d8c69baaf7295149f62920 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 14 Jun 2024 19:14:12 +0200
Subject: [PATCH 06/39] fix

---
 BALSAMIC/utils/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py
index 2fdf20902..94ab873f7 100644
--- a/BALSAMIC/utils/cli.py
+++ b/BALSAMIC/utils/cli.py
@@ -247,7 +247,7 @@ def get_gens_references(
         references.update(
             {gens_file: path for gens_file, path in gens_ref_files.items() if path is not None}
         )
-        return references
+    return references
 
 
 def get_bioinfo_tools_version(

From cc9e5064eff4d92531b18ecae3339c1760fdf567 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 17 Jun 2024 07:47:01 +0200
Subject: [PATCH 07/39] fix

---
 BALSAMIC/constants/rules.py     | 2 --
 BALSAMIC/workflows/balsamic.smk | 8 ++------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py
index 59cdab9f2..947dbac72 100644
--- a/BALSAMIC/constants/rules.py
+++ b/BALSAMIC/constants/rules.py
@@ -77,7 +77,6 @@
             "snakemake_rules/align/postprocess_bam.rule",
         ],
         "varcall": [
-            "snakemake_rules/variant_calling/gens_preprocessing.rule",
             "snakemake_rules/variant_calling/germline.rule",
             "snakemake_rules/variant_calling/split_bed.rule",
             "snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule",
@@ -109,7 +108,6 @@
             "snakemake_rules/align/postprocess_bam.rule",
         ],
         "varcall": [
-            "snakemake_rules/variant_calling/gens_preprocessing.rule",
             "snakemake_rules/variant_calling/germline.rule",
             "snakemake_rules/variant_calling/split_bed.rule",
             "snakemake_rules/variant_calling/somatic_tumor_normal.rule",
diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk
index 4014d8510..63f4ccaa8 100644
--- a/BALSAMIC/workflows/balsamic.smk
+++ b/BALSAMIC/workflows/balsamic.smk
@@ -406,7 +406,7 @@ if "dragen" in config:
     rules_to_include.append("snakemake_rules/concatenation/concatenation.rule")
 
 # Add rule for GENS
-if "gens_coverage_pon" in config["reference"]:
+if "gnomad_min_af5" in config["reference"]:
     rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule")
     rules_to_include.append("snakemake_rules/variant_calling/gens_preprocessing.rule")
 
@@ -522,11 +522,7 @@ if config['analysis']['analysis_type'] == "single":
     )
 
 # GENS Outputs
-if config["analysis"]["sequencing_type"] == SequencingType.WGS and "gens_coverage_pon" in config["reference"]:
-    analysis_specific_results.extend(
-        expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"])
-    )
-if config["analysis"]["sequencing_type"] == SequencingType.TARGETED and "gnomad_min_af5" in config["reference"]:
+if "gnomad_min_af5" in config["reference"]:
     analysis_specific_results.extend(
         expand(cnv_dir + "{sample}.{gens_input}.bed.gz", sample=sample_names, gens_input=["cov", "baf"])
     )

From d471395e900fbf1f8b77db898c3228dbd237c9f8 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 17 Jun 2024 07:57:47 +0200
Subject: [PATCH 08/39] fix

---
 BALSAMIC/workflows/balsamic.smk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk
index 63f4ccaa8..c15af04e2 100644
--- a/BALSAMIC/workflows/balsamic.smk
+++ b/BALSAMIC/workflows/balsamic.smk
@@ -407,8 +407,9 @@ if "dragen" in config:
 
 # Add rule for GENS
 if "gnomad_min_af5" in config["reference"]:
-    rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule")
     rules_to_include.append("snakemake_rules/variant_calling/gens_preprocessing.rule")
+if "gnomad_min_af5" in config["reference"] and sequence_type == SequencingType.WGS:
+    rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule")
 
 LOG.info(f"The following rules will be included in the workflow: {rules_to_include}")
 LOG.info(f"The following Germline variant callers will be included in the workflow: {germline_caller}")

From 05bd3ae51bf78c09b027962ef10b384c4332a018 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 17 Jun 2024 08:48:36 +0200
Subject: [PATCH 09/39] fix

---
 .../snakemake_rules/variant_calling/gens_preprocessing.rule     | 2 --
 1 file changed, 2 deletions(-)

diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
index 73f70f551..3eb57cce0 100644
--- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
+++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
@@ -26,7 +26,6 @@ if config["analysis"]["sequencing_type"] == 'wgs':
       export TMPDIR={params.tmpdir};
       export SENTIEON_TMPDIR={params.tmpdir};
       export SENTIEON_LICENSE={params.sentieon_lic};
-      export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope};
   
       {params.sentieon_exec} driver \
       -t {threads} \
@@ -119,7 +118,6 @@ else:
       export TMPDIR={params.tmpdir};
       export SENTIEON_TMPDIR={params.tmpdir};
       export SENTIEON_LICENSE={params.sentieon_lic};
-      export SENTIEON_DNASCOPE={params.sentieon_ml_dnascope};
   
       {params.sentieon_exec} driver \
       -t {threads} \

From 686855630586b1faf8ba11f00b546b5e6ef78abb Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 17 Jun 2024 09:02:57 +0200
Subject: [PATCH 10/39] doc strings and named args

---
 BALSAMIC/commands/config/case.py |  6 +++++-
 BALSAMIC/utils/cli.py            | 12 ++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py
index 081de6265..02bea71b2 100644
--- a/BALSAMIC/commands/config/case.py
+++ b/BALSAMIC/commands/config/case.py
@@ -131,7 +131,11 @@ def case_config(
         references.update(cadd_annotations_path)
 
     references = get_gens_references(
-        genome_interval, gens_coverage_pon, gnomad_min_af5, panel_bed, references
+        genome_interval=genome_interval,
+        gens_coverage_pon=gens_coverage_pon,
+        gnomad_min_af5=gnomad_min_af5,
+        panel_bed=panel_bed,
+        references=references
     )
 
     variants_observations = {
diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py
index 94ab873f7..361082d94 100644
--- a/BALSAMIC/utils/cli.py
+++ b/BALSAMIC/utils/cli.py
@@ -222,6 +222,18 @@ def get_gens_references(
     panel_bed: Optional[str],
     references: Dict[str, str],
 ) -> Dict:
+    """
+    Assigns reference-files required for GENS if they have been supplied.
+
+    :param genome_interval: Coverage-regions. (required for WGS GENS)
+    :param gens_coverage_pon: PON for GATK CollectReadCounts. (required for WGS GENS)
+    :param gnomad_min_af5: gnomad VCF filtered to keep variants above 5% VAF. (required for WGS and TGA GENS)
+    :param panel_bed: Bedfile supplied for TGA analyses.
+    :param references: Reference dictionary to be updated.
+
+    :return: references: Updated reference dictionary.
+    """
+
     gens_arguments = [genome_interval, gens_coverage_pon, gnomad_min_af5]
 
     # Check if any of the gens arguments are provided

From b03a9f0cdc3772cc1d4cfa8609912db473b966a3 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 17 Jun 2024 09:03:46 +0200
Subject: [PATCH 11/39] black

---
 BALSAMIC/commands/config/case.py | 2 +-
 BALSAMIC/utils/cli.py            | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py
index 02bea71b2..69e12533f 100644
--- a/BALSAMIC/commands/config/case.py
+++ b/BALSAMIC/commands/config/case.py
@@ -135,7 +135,7 @@ def case_config(
         gens_coverage_pon=gens_coverage_pon,
         gnomad_min_af5=gnomad_min_af5,
         panel_bed=panel_bed,
-        references=references
+        references=references,
     )
 
     variants_observations = {
diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py
index 361082d94..7dce18211 100644
--- a/BALSAMIC/utils/cli.py
+++ b/BALSAMIC/utils/cli.py
@@ -257,7 +257,11 @@ def get_gens_references(
 
         # Update references dictionary with gens values
         references.update(
-            {gens_file: path for gens_file, path in gens_ref_files.items() if path is not None}
+            {
+                gens_file: path
+                for gens_file, path in gens_ref_files.items()
+                if path is not None
+            }
         )
     return references
 

From 582cfb07d62fd2bdc5758caea5bc56a123cf0eff Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 17 Jun 2024 09:14:03 +0200
Subject: [PATCH 12/39] changelog

---
 CHANGELOG.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index d9157bd1b..8b76deeb3 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -5,6 +5,7 @@
 Added:
 ^^^^^^
 * MSIsensor-pro container https://github.com/Clinical-Genomics/BALSAMIC/pull/1444
+* GENS input files for TGA https://github.com/Clinical-Genomics/BALSAMIC/pull/1448
 
 Changed:
 ^^^^^^^^

From e9c4c7f0b610d0e075db387e4f870673d13a99b4 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 17 Jun 2024 10:09:24 +0200
Subject: [PATCH 13/39] bug fix

---
 BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule | 1 +
 1 file changed, 1 insertion(+)

diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
index 3eb57cce0..654d7139f 100644
--- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
+++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
@@ -140,6 +140,7 @@ else:
             gens_baf_bed = cnv_dir + "{sample}.baf.bed",
             gens_cov_bed = cnv_dir + "{sample}.cov.bed"
         params:
+            sequencing_type = sequencing_type,
             gens_preprocessing = get_script_path("preprocess_gens.py"),
             gens_preprocess_cnvkit = get_script_path("postprocess_gens_cnvkit.py"),
             sample="{sample}"

From 8f88da780fc283e4f51d5214a2ac7e2433dcca95 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 17 Jun 2024 10:43:25 +0200
Subject: [PATCH 14/39] fix

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index cc387202f..d06bfc3c2 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -17,10 +17,17 @@
     type=click.Path(exists=True),
     help="Input CNVkit cnr result.",
 )
-def create_gens_cov_file(output_file, ormalised_coverage_path):
+def create_gens_cov_file(output_file, normalised_coverage_path):
+    """
+    Post-processes the CNVkit cnr output for upload to GENS.
+    Removing Antitarget regions and outputting the coverages in multiple resolution-formats.
+
+    :param output_file: Path to GENS output.cov file
+    :param normalised_coverage_path: Path to input CNVkit cnr file.
+    """
     # Process CNVkit file
     log2_data = []
-    cnvkit_df = pd.read_csv(ormalised_coverage_path, sep="\t")
+    cnvkit_df = pd.read_csv(normalised_coverage_path, sep="\t")
     for index, row in cnvkit_df.iterrows():
         if row["gene"] == "Antitarget":
             continue

From 4a4671e5e4cb2848e2c2ab1ca221ef815fe085eb Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 17 Jun 2024 16:45:33 +0200
Subject: [PATCH 15/39] lower padding

---
 .../snakemake_rules/variant_calling/gens_preprocessing.rule     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
index 654d7139f..f051200ac 100644
--- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
+++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
@@ -124,7 +124,7 @@ else:
       -r {input.ref} \
       -i {input.bam} \
       --interval {input.bed} \
-      --interval_padding 100 \
+      --interval_padding 20 \
       --algo DNAscope \
       --pcr_indel_mode {params.pcr_model} \
       --given {input.gnomad_af5} {output.gnomad_af5_vcf};

From 5bcf7687d7b92e365d1fa6c348d23139a161dd56 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Tue, 18 Jun 2024 09:17:38 +0200
Subject: [PATCH 16/39] add tumor purity adjustment to gens cov file

---
 .../assets/scripts/postprocess_gens_cnvkit.py | 22 +++++++++++++++++--
 .../variant_calling/gens_preprocessing.rule   | 11 +++++-----
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index d06bfc3c2..b2003cc24 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -17,23 +17,41 @@
     type=click.Path(exists=True),
     help="Input CNVkit cnr result.",
 )
-def create_gens_cov_file(output_file, normalised_coverage_path):
+@click.option(
+    "-p",
+    "--tumor-purity-path",
+    required=False,
+    type=click.Path(exists=True),
+    help="Tumor purity file from PureCN",
+)
+def create_gens_cov_file(output_file, normalised_coverage_path, tumor_purity_path):
     """
     Post-processes the CNVkit cnr output for upload to GENS.
     Removing Antitarget regions and outputting the coverages in multiple resolution-formats.
 
     :param output_file: Path to GENS output.cov file
     :param normalised_coverage_path: Path to input CNVkit cnr file.
+    :param tumor_purity_path: Path to PureCN purity estimate csv file
     """
     # Process CNVkit file
     log2_data = []
     cnvkit_df = pd.read_csv(normalised_coverage_path, sep="\t")
+
+    # Process PureCN purity file
+    purity = None
+    if tumor_purity_path:
+        purecn_df = pd.read_csv(tumor_purity_path, sep=",")
+        purity = float(purecn_df.iloc[0]["Purity"])
+
     for index, row in cnvkit_df.iterrows():
         if row["gene"] == "Antitarget":
             continue
         midpoint = row["start"] + int((row["end"] - row["start"]) / 2)
+        log2 = row['log2']
+        if purity:
+            log2 = round(float(log2) / purity, 4)
         log2_data.append(
-            f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{row['log2']}"
+            f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}"
         )
 
     # Write log2 data to output file
diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
index f051200ac..18164d985 100644
--- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
+++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
@@ -65,7 +65,7 @@ if config["analysis"]["sequencing_type"] == 'wgs':
     --count-panel-of-normals {input.gens_pon} \
     --tmp-dir {params.tmpdir} \
     --standardized-copy-ratios {output.standardized_cr} \
-    --denoised-copy-ratios {output.denoised_cr}
+    --denoised-copy-ratios {output.denoised_cr} ;
 
     rm -rf {params.tmpdir}
             """
@@ -89,7 +89,7 @@ if config["analysis"]["sequencing_type"] == 'wgs':
             "Formatting output for GENS for sample: {params.sample}."
         shell:
             """
-    python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf}
+    python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ;
     python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr}
             """
 else:
@@ -136,6 +136,7 @@ else:
         input:
             cnvkit_cnr = cnv_dir + "tumor.merged.cnr",
             gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz",
+            purecn_purity_csv = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv",
         output:
             gens_baf_bed = cnv_dir + "{sample}.baf.bed",
             gens_cov_bed = cnv_dir + "{sample}.cov.bed"
@@ -152,8 +153,8 @@ else:
             "Formatting output for GENS for sample: {params.sample}."
         shell:
             """
-    python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf}
-    python {params.gens_preprocess_cnvkit} -o {output.gens_cov_bed} --normalised-coverage-path {input.cnvkit_cnr}
+    python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ;
+    python {params.gens_preprocess_cnvkit} -o {output.gens_cov_bed} --normalised-coverage-path {input.cnvkit_cnr} --tumor-purity-path {input.purecn_purity_csv} 
             """
 
 rule finalize_gens_outputfiles:
@@ -175,7 +176,7 @@ rule finalize_gens_outputfiles:
         "Bgzip and index GENS output: {params.gens_input} for sample: {params.sample_id}."
     shell:
         """
-bgzip {input.gens_input}
+bgzip {input.gens_input} ;
 tabix {input.gens_input}.gz
         """
 

From acde38f688c4f1aab24e3c039eabc273eef703e2 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Wed, 19 Jun 2024 15:33:36 +0200
Subject: [PATCH 17/39] typehints

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index b2003cc24..e0542b85c 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -24,7 +24,7 @@
     type=click.Path(exists=True),
     help="Tumor purity file from PureCN",
 )
-def create_gens_cov_file(output_file, normalised_coverage_path, tumor_purity_path):
+def create_gens_cov_file(output_file: str, normalised_coverage_path: str, tumor_purity_path: str | None):
     """
     Post-processes the CNVkit cnr output for upload to GENS.
     Removing Antitarget regions and outputting the coverages in multiple resolution-formats.

From ed4c018a4873160106f3f389e7bc580f64bfdac5 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Wed, 19 Jun 2024 15:40:37 +0200
Subject: [PATCH 18/39] black

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index e0542b85c..4ad870c12 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -24,7 +24,9 @@
     type=click.Path(exists=True),
     help="Tumor purity file from PureCN",
 )
-def create_gens_cov_file(output_file: str, normalised_coverage_path: str, tumor_purity_path: str | None):
+def create_gens_cov_file(
+    output_file: str, normalised_coverage_path: str, tumor_purity_path: str | None
+):
     """
     Post-processes the CNVkit cnr output for upload to GENS.
     Removing Antitarget regions and outputting the coverages in multiple resolution-formats.
@@ -47,12 +49,10 @@ def create_gens_cov_file(output_file: str, normalised_coverage_path: str, tumor_
         if row["gene"] == "Antitarget":
             continue
         midpoint = row["start"] + int((row["end"] - row["start"]) / 2)
-        log2 = row['log2']
+        log2 = row["log2"]
         if purity:
             log2 = round(float(log2) / purity, 4)
-        log2_data.append(
-            f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}"
-        )
+        log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}")
 
     # Write log2 data to output file
     with open(output_file, "w") as log2_out:

From a108a0e2f406699f76418fb14a85fe8bf9cf60ea Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Wed, 19 Jun 2024 16:17:21 +0200
Subject: [PATCH 19/39] fix pytests

---
 tests/commands/config/test_config_sample.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py
index 9185e7afa..1cc920fc0 100644
--- a/tests/commands/config/test_config_sample.py
+++ b/tests/commands/config/test_config_sample.py
@@ -367,7 +367,7 @@ def test_missing_required_gens_arguments(
     # THEN the CLI should exit code 2 and display an informative error message
     assert result.exit_code == 2
     assert (
-        "All three arguments (genome_interval gens_coverage_pon, gnomad_min_af5) are required for GENS."
+        "All three arguments (genome_interval, gens_coverage_pon, gnomad_min_af5) are required for GENS in WGS."
         in result.output
     )
 
@@ -448,21 +448,17 @@ def test_config_with_gens_arguments_for_tga(
             balsamic_cache,
             "--tumor-sample-name",
             tumor_sample_name,
-            "--gens-coverage-pon",
-            gens_cov_pon_file,
             "--gnomad-min-af5",
             gens_min_5_af_gnomad_file,
-            "--genome-interval",
-            gens_hg19_interval_list,
             "-p",
             panel_bed_file,
         ],
     )
-    # THEN config should fail with error message
-    assert result.exit_code == 2
-    assert (
-        "GENS is currently not compatible with TGA analysis, only WGS." in result.output
-    )
+    # THEN a config should be created and exist
+    assert result.exit_code == 0
+    assert Path(
+        analysis_dir, case_id_tumor_only, f"{case_id_tumor_only}.{FileType.JSON}"
+    ).exists()
 
 
 def test_config_wgs_with_exome(

From 99d3d57514f79f545dcf27f909dd32a0bfb418d5 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 28 Jun 2024 12:15:43 +0200
Subject: [PATCH 20/39] update purity adjustment formula

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 4ad870c12..a0ce25d37 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -44,6 +44,7 @@ def create_gens_cov_file(
     if tumor_purity_path:
         purecn_df = pd.read_csv(tumor_purity_path, sep=",")
         purity = float(purecn_df.iloc[0]["Purity"])
+        ploidy = float(purecn_df.iloc[0]["Ploidy"])
 
     for index, row in cnvkit_df.iterrows():
         if row["gene"] == "Antitarget":
@@ -51,7 +52,7 @@ def create_gens_cov_file(
         midpoint = row["start"] + int((row["end"] - row["start"]) / 2)
         log2 = row["log2"]
         if purity:
-            log2 = round(float(log2) / purity, 4)
+            log2 = round(float(1 - purity) + purity * (log2 / ploidy), 4)
         log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}")
 
     # Write log2 data to output file

From 44f25e1c0fceeaa6d297ce2701152ce7c718d1a2 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 28 Jun 2024 12:39:09 +0200
Subject: [PATCH 21/39] m conflict

---
 BALSAMIC/workflows/balsamic.smk | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk
index b4b67c8c6..0ecce72c4 100644
--- a/BALSAMIC/workflows/balsamic.smk
+++ b/BALSAMIC/workflows/balsamic.smk
@@ -9,17 +9,12 @@ from pathlib import Path
 from typing import Dict, List
 
 from BALSAMIC.constants.constants import FileType
-<<<<<<< HEAD
 from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType, SequencingType
-from BALSAMIC.constants.paths import BALSAMIC_DIR, SENTIEON_DNASCOPE_DIR, SENTIEON_TNSCOPE_DIR
-=======
-from BALSAMIC.constants.analysis import FastqName, MutationType, SampleType
 from BALSAMIC.constants.paths import (
     BALSAMIC_DIR,
     SENTIEON_DNASCOPE_DIR,
     SENTIEON_TNSCOPE_DIR,
 )
->>>>>>> develop
 from BALSAMIC.constants.rules import SNAKEMAKE_RULES
 from BALSAMIC.constants.variant_filters import (
     COMMON_SETTINGS,

From 9335478a7d6917b63a937f772fc907e769d5d695 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 28 Jun 2024 12:44:41 +0200
Subject: [PATCH 22/39] m conflict

---
 BALSAMIC/workflows/balsamic.smk | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk
index 0ecce72c4..960b83432 100644
--- a/BALSAMIC/workflows/balsamic.smk
+++ b/BALSAMIC/workflows/balsamic.smk
@@ -698,14 +698,7 @@ if config["analysis"]["analysis_type"] == "single":
     )
 
 # GENS Outputs
-<<<<<<< HEAD
 if "gnomad_min_af5" in config["reference"]:
-=======
-if (
-    config["analysis"]["sequencing_type"] == "wgs"
-    and "gens_coverage_pon" in config["reference"]
-):
->>>>>>> develop
     analysis_specific_results.extend(
         expand(
             cnv_dir + "{sample}.{gens_input}.bed.gz",

From 576811644c121dcaa1f397d8a66c0ba5082975d4 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 15 Jul 2024 12:18:17 +0200
Subject: [PATCH 23/39] adjust formula

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index a0ce25d37..21e706c31 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -1,6 +1,9 @@
 import click
 import pandas as pd
+import numpy as np
 
+def adjust_log2_ratio(log2_ratio, purity):
+    return np.log2((2**log2_ratio - (1 - purity)) / purity)
 
 @click.command()
 @click.option(
@@ -52,7 +55,7 @@ def create_gens_cov_file(
         midpoint = row["start"] + int((row["end"] - row["start"]) / 2)
         log2 = row["log2"]
         if purity:
-            log2 = round(float(1 - purity) + purity * (log2 / ploidy), 4)
+            log2 = adjust_log2_ratio(log2, purity)
         log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}")
 
     # Write log2 data to output file

From 64338874c78cf2e15fb08bbbd05eb88e01768e9a Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 15 Jul 2024 13:09:45 +0200
Subject: [PATCH 24/39] add round

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 21e706c31..f05f3ef35 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -55,7 +55,7 @@ def create_gens_cov_file(
         midpoint = row["start"] + int((row["end"] - row["start"]) / 2)
         log2 = row["log2"]
         if purity:
-            log2 = adjust_log2_ratio(log2, purity)
+            log2 = round(adjust_log2_ratio(log2, purity), 4)
         log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}")
 
     # Write log2 data to output file

From f3a2c665ef3efcbf2696613f34426381a5b74fa5 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Tue, 23 Jul 2024 16:36:11 +0200
Subject: [PATCH 25/39] code review

---
 .../assets/scripts/postprocess_gens_cnvkit.py | 51 ++++++++++++-------
 BALSAMIC/utils/io.py                          |  7 +++
 2 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index f05f3ef35..1e56f4a77 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -1,9 +1,21 @@
 import click
-import pandas as pd
+from BALSAMIC.utils.io import read_csv
 import numpy as np
 
-def adjust_log2_ratio(log2_ratio, purity):
-    return np.log2((2**log2_ratio - (1 - purity)) / purity)
+def calculate_log2_ratio(purity, log2_ratio, ploidy):
+    # Ensure that the inputs are within valid ranges
+    if not (0 <= purity <= 1):
+        raise ValueError("Purity must be between 0 and 1")
+
+    if ploidy <= 0:
+        raise ValueError("Ploidy must be a positive number")
+
+    # Calculate the log2 ratio
+    numerator = (1 - purity) + (purity * log2_ratio)
+    log2_ratio = np.log2(numerator / ploidy)
+
+    return log2_ratio
+
 
 @click.command()
 @click.option(
@@ -18,7 +30,7 @@ def adjust_log2_ratio(log2_ratio, purity):
     "--normalised-coverage-path",
     required=True,
     type=click.Path(exists=True),
-    help="Input CNVkit cnr result.",
+    help="Input CNVkit cnr. result.",
 )
 @click.option(
     "-p",
@@ -30,33 +42,36 @@ def adjust_log2_ratio(log2_ratio, purity):
 def create_gens_cov_file(
     output_file: str, normalised_coverage_path: str, tumor_purity_path: str | None
 ):
-    """
-    Post-processes the CNVkit cnr output for upload to GENS.
-    Removing Antitarget regions and outputting the coverages in multiple resolution-formats.
+    """Post-processes the CNVkit .cnr output for upload to GENS.
 
-    :param output_file: Path to GENS output.cov file
-    :param normalised_coverage_path: Path to input CNVkit cnr file.
-    :param tumor_purity_path: Path to PureCN purity estimate csv file
+    Removes Antitarget regions, adjusts for purity and ploidy and outputs the coverages in multiple resolution-formats.
+
+    Args:
+        output_file: Path to GENS output.cov file
+        normalised_coverage_path: Path to input CNVkit cnr file.
+        tumor_purity_path: Path to PureCN purity estimate csv file
     """
-    # Process CNVkit file
     log2_data = []
-    cnvkit_df = pd.read_csv(normalised_coverage_path, sep="\t")
+
+    # Process CNVkit file
+    cnr_dict_list: list[dict] = read_csv(csv_path=normalised_coverage_path, delimeter="\t")
 
     # Process PureCN purity file
     purity = None
     if tumor_purity_path:
-        purecn_df = pd.read_csv(tumor_purity_path, sep=",")
-        purity = float(purecn_df.iloc[0]["Purity"])
-        ploidy = float(purecn_df.iloc[0]["Ploidy"])
+        purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",")
+        purity = float(purecn_dict_list[0]["Purity"])
+        ploidy = float(purecn_dict_list[0]["Ploidy"])
 
-    for index, row in cnvkit_df.iterrows():
+    for row in cnr_dict_list:
         if row["gene"] == "Antitarget":
             continue
         midpoint = row["start"] + int((row["end"] - row["start"]) / 2)
         log2 = row["log2"]
         if purity:
-            log2 = round(adjust_log2_ratio(log2, purity), 4)
-        log2_data.append(f"{row['chromosome']}\t{midpoint-1}\t{midpoint}\t{log2}")
+            log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4)
+        log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
+
 
     # Write log2 data to output file
     with open(output_file, "w") as log2_out:
diff --git a/BALSAMIC/utils/io.py b/BALSAMIC/utils/io.py
index ccc4d8a20..d5aba9944 100644
--- a/BALSAMIC/utils/io.py
+++ b/BALSAMIC/utils/io.py
@@ -1,5 +1,6 @@
 """Input/Output utility methods."""
 
+import csv
 import gzip
 import json
 import logging
@@ -66,6 +67,12 @@ def write_json(json_obj: dict, path: str) -> None:
         raise OSError(f"Error while writing JSON file: {path}, error: {error}")
 
 
+def read_csv(csv_path: str, delimeter: str) -> List[dict]:
+    """Read data from a csv file."""
+    with open(csv_path, mode='r') as csv_file:
+        csv_reader = csv.DictReader(csv_file, delimiter=delimeter)
+        return [row for row in csv_reader]
+
 def read_yaml(yaml_path: str) -> dict:
     """Read data from a yaml file."""
     if Path(yaml_path).exists():

From 2b91865619e578218fae5f9513996a2f69f65b41 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Wed, 24 Jul 2024 08:37:22 +0200
Subject: [PATCH 26/39] fix bug

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 1e56f4a77..83a5c3a7a 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -66,7 +66,9 @@ def create_gens_cov_file(
     for row in cnr_dict_list:
         if row["gene"] == "Antitarget":
             continue
-        midpoint = row["start"] + int((row["end"] - row["start"]) / 2)
+        start = int(row["start"])
+        end = int(row["end"])
+        midpoint = start + int((end - start / 2))
         log2 = row["log2"]
         if purity:
             log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4)

From 821bbd3f082181a74ab09e796f8b4aeb722b6f12 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Wed, 24 Jul 2024 08:53:40 +0200
Subject: [PATCH 27/39] float

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 83a5c3a7a..29f874b42 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -69,7 +69,7 @@ def create_gens_cov_file(
         start = int(row["start"])
         end = int(row["end"])
         midpoint = start + int((end - start / 2))
-        log2 = row["log2"]
+        log2 = float(row["log2"])
         if purity:
             log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4)
         log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")

From 6aa551065a16c59bb4c0ee899f3470eca1c05592 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Wed, 24 Jul 2024 10:51:45 +0200
Subject: [PATCH 28/39] comment out purity and ploidy adjustment

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 29f874b42..ef46ccfc3 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -70,8 +70,8 @@ def create_gens_cov_file(
         end = int(row["end"])
         midpoint = start + int((end - start / 2))
         log2 = float(row["log2"])
-        if purity:
-            log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4)
+        #if purity:
+        #    log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4)
         log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
 
 

From f70318973b43b5a93300652d71915651cf2426da Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Thu, 25 Jul 2024 13:58:24 +0200
Subject: [PATCH 29/39] adjust purity and ploidy

---
 .../assets/scripts/postprocess_gens_cnvkit.py   | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index ef46ccfc3..b395158d3 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -1,6 +1,7 @@
 import click
 from BALSAMIC.utils.io import read_csv
 import numpy as np
+import warnings
 
 def calculate_log2_ratio(purity, log2_ratio, ploidy):
     # Ensure that the inputs are within valid ranges
@@ -12,6 +13,10 @@ def calculate_log2_ratio(purity, log2_ratio, ploidy):
 
     # Calculate the log2 ratio
     numerator = (1 - purity) + (purity * log2_ratio)
+
+    if numerator <= 0:
+        return None
+
     log2_ratio = np.log2(numerator / ploidy)
 
     return log2_ratio
@@ -63,6 +68,7 @@ def create_gens_cov_file(
         purity = float(purecn_dict_list[0]["Purity"])
         ploidy = float(purecn_dict_list[0]["Ploidy"])
 
+    count_none_log2 = 0
     for row in cnr_dict_list:
         if row["gene"] == "Antitarget":
             continue
@@ -70,10 +76,15 @@ def create_gens_cov_file(
         end = int(row["end"])
         midpoint = start + int((end - start / 2))
         log2 = float(row["log2"])
-        #if purity:
-        #    log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4)
-        log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
+        if purity:
+            log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4)
+            if not log2:
+                count_none_log2 += 1
+                warnings.warn("Numerator is less than or equal to 0, returning None for region.")
+                continue
 
+        log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
+    warnings.warn(f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}")
 
     # Write log2 data to output file
     with open(output_file, "w") as log2_out:

From 8994fc85fbf83fea840507697aba089dc30df466 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Thu, 25 Jul 2024 15:12:09 +0200
Subject: [PATCH 30/39] bug fix

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index b395158d3..560f949b1 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -77,12 +77,12 @@ def create_gens_cov_file(
         midpoint = start + int((end - start / 2))
         log2 = float(row["log2"])
         if purity:
-            log2 = round(calculate_log2_ratio(purity, log2, ploidy), 4)
+            log2 = calculate_log2_ratio(purity, log2, ploidy)
             if not log2:
                 count_none_log2 += 1
                 warnings.warn("Numerator is less than or equal to 0, returning None for region.")
                 continue
-
+            log2 = round(log2, 4)
         log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
     warnings.warn(f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}")
 

From 11be7bfe547bbf2bd0fd1c574f34421dbdc1f0e6 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Thu, 25 Jul 2024 16:10:56 +0200
Subject: [PATCH 31/39] remove purity and ploidy adjustment

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 560f949b1..05e72426f 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -76,13 +76,13 @@ def create_gens_cov_file(
         end = int(row["end"])
         midpoint = start + int((end - start / 2))
         log2 = float(row["log2"])
-        if purity:
-            log2 = calculate_log2_ratio(purity, log2, ploidy)
-            if not log2:
-                count_none_log2 += 1
-                warnings.warn("Numerator is less than or equal to 0, returning None for region.")
-                continue
-            log2 = round(log2, 4)
+        #if purity:
+        #    log2 = calculate_log2_ratio(purity, log2, ploidy)
+        #    if not log2:
+        #        count_none_log2 += 1
+        #        warnings.warn("Numerator is less than or equal to 0, returning None for region.")
+        #        continue
+        #    log2 = round(log2, 4)
         log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
     warnings.warn(f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}")
 

From 3d44b03941eb50ef6c7be1a889535db9da914854 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Mon, 29 Jul 2024 08:17:01 +0200
Subject: [PATCH 32/39] new threads

---
 BALSAMIC/constants/cluster_analysis.json                      | 4 ++++
 .../snakemake_rules/variant_calling/gens_preprocessing.rule   | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json
index d63b5e136..10828dfcf 100644
--- a/BALSAMIC/constants/cluster_analysis.json
+++ b/BALSAMIC/constants/cluster_analysis.json
@@ -156,6 +156,10 @@
 		"time": "24:00:00",
 		"n": 36
 	},
+	"sentieon_DNAscope_gnomad_tga": {
+		"time": "24:00:00",
+		"n": 12
+	},
 	"sentieon_TNhaplotyper": {
 		"time": "24:00:00",
 		"n": 36
diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
index 0b5d35e78..466a8beb4 100644
--- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
+++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
@@ -112,7 +112,7 @@ else:
         benchmark:
             Path(benchmark_dir,"sentieon_DNAscope_gnomad_{sample}.tsv").as_posix()
         threads:
-            get_threads(cluster_config,"sentieon_DNAscope_gnomad")
+            get_threads(cluster_config,"sentieon_DNAscope_gnomad_tga")
         message:
             "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}"
         shell:

From b484670e9081cc4f4c1043b047d0ec43e39b63b3 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Wed, 31 Jul 2024 14:24:28 +0200
Subject: [PATCH 33/39] return purity adjustment

---
 .../assets/scripts/postprocess_gens_cnvkit.py | 25 ++++++++-----------
 .../containers/varcall_py3/varcall_py3.yaml   |  1 -
 BALSAMIC/utils/io.py                          |  2 +-
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 05e72426f..4cf0d0a5a 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -40,12 +40,12 @@ def calculate_log2_ratio(purity, log2_ratio, ploidy):
 @click.option(
     "-p",
     "--tumor-purity-path",
-    required=False,
+    required=True,
     type=click.Path(exists=True),
     help="Tumor purity file from PureCN",
 )
 def create_gens_cov_file(
-    output_file: str, normalised_coverage_path: str, tumor_purity_path: str | None
+    output_file: str, normalised_coverage_path: str, tumor_purity_path: str
 ):
     """Post-processes the CNVkit .cnr output for upload to GENS.
 
@@ -62,11 +62,9 @@ def create_gens_cov_file(
     cnr_dict_list: list[dict] = read_csv(csv_path=normalised_coverage_path, delimeter="\t")
 
     # Process PureCN purity file
-    purity = None
-    if tumor_purity_path:
-        purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",")
-        purity = float(purecn_dict_list[0]["Purity"])
-        ploidy = float(purecn_dict_list[0]["Ploidy"])
+    purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",")
+    purity = float(purecn_dict_list[0]["Purity"])
+    ploidy = float(purecn_dict_list[0]["Ploidy"])
 
     count_none_log2 = 0
     for row in cnr_dict_list:
@@ -76,13 +74,12 @@ def create_gens_cov_file(
         end = int(row["end"])
         midpoint = start + int((end - start / 2))
         log2 = float(row["log2"])
-        #if purity:
-        #    log2 = calculate_log2_ratio(purity, log2, ploidy)
-        #    if not log2:
-        #        count_none_log2 += 1
-        #        warnings.warn("Numerator is less than or equal to 0, returning None for region.")
-        #        continue
-        #    log2 = round(log2, 4)
+        log2 = calculate_log2_ratio(purity, log2, ploidy)
+        if not log2:
+            count_none_log2 += 1
+            warnings.warn("Numerator is less than or equal to 0, returning None for region.")
+            continue
+            log2 = round(log2, 4)
         log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
     warnings.warn(f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}")
 
diff --git a/BALSAMIC/containers/varcall_py3/varcall_py3.yaml b/BALSAMIC/containers/varcall_py3/varcall_py3.yaml
index 4f106103e..5c643a93a 100644
--- a/BALSAMIC/containers/varcall_py3/varcall_py3.yaml
+++ b/BALSAMIC/containers/varcall_py3/varcall_py3.yaml
@@ -163,7 +163,6 @@ dependencies:
   - svdb=2.8.1
   - sysroot_linux-64=2.12
   - tabix=1.11
-  - tiddit=3.3.2
   - tk=8.6.12
   - tktable=2.10
   - toolz=0.12.0
diff --git a/BALSAMIC/utils/io.py b/BALSAMIC/utils/io.py
index d5aba9944..555597a30 100644
--- a/BALSAMIC/utils/io.py
+++ b/BALSAMIC/utils/io.py
@@ -67,7 +67,7 @@ def write_json(json_obj: dict, path: str) -> None:
         raise OSError(f"Error while writing JSON file: {path}, error: {error}")
 
 
-def read_csv(csv_path: str, delimeter: str) -> List[dict]:
+def read_csv(csv_path: str, delimeter: str = ",") -> List[dict]:
     """Read data from a csv file."""
     with open(csv_path, mode='r') as csv_file:
         csv_reader = csv.DictReader(csv_file, delimiter=delimeter)

From 1fac87657d100f9fab0b81e69116d85c7e782f07 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Wed, 31 Jul 2024 15:08:07 +0200
Subject: [PATCH 34/39] code review

---
 .../assets/scripts/postprocess_gens_cnvkit.py | 25 +++++---
 BALSAMIC/commands/config/case.py              | 11 +++-
 BALSAMIC/constants/cluster_analysis.json      |  4 +-
 .../variant_calling/gens_preprocessing.rule   | 56 ++++++++---------
 BALSAMIC/utils/cli.py                         | 61 ++++++++-----------
 BALSAMIC/utils/io.py                          |  3 +-
 6 files changed, 81 insertions(+), 79 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 4cf0d0a5a..24e54c108 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -3,6 +3,7 @@
 import numpy as np
 import warnings
 
+
 def calculate_log2_ratio(purity, log2_ratio, ploidy):
     # Ensure that the inputs are within valid ranges
     if not (0 <= purity <= 1):
@@ -59,7 +60,9 @@ def create_gens_cov_file(
     log2_data = []
 
     # Process CNVkit file
-    cnr_dict_list: list[dict] = read_csv(csv_path=normalised_coverage_path, delimeter="\t")
+    cnr_dict_list: list[dict] = read_csv(
+        csv_path=normalised_coverage_path, delimeter="\t"
+    )
 
     # Process PureCN purity file
     purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",")
@@ -70,18 +73,22 @@ def create_gens_cov_file(
     for row in cnr_dict_list:
         if row["gene"] == "Antitarget":
             continue
-        start = int(row["start"])
-        end = int(row["end"])
-        midpoint = start + int((end - start / 2))
-        log2 = float(row["log2"])
-        log2 = calculate_log2_ratio(purity, log2, ploidy)
+        start: int = int(row["start"])
+        end: int = int(row["end"])
+        midpoint: int = start + int((end - start / 2))
+        log2: float = float(row["log2"])
+        log2: float | None = calculate_log2_ratio(purity, log2, ploidy)
         if not log2:
             count_none_log2 += 1
-            warnings.warn("Numerator is less than or equal to 0, returning None for region.")
+            warnings.warn(
+                "Numerator is less than or equal to 0, returning None for region."
+            )
             continue
-            log2 = round(log2, 4)
+            log2: float = round(log2, 4)
         log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
-    warnings.warn(f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}")
+    warnings.warn(
+        f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}"
+    )
 
     # Write log2 data to output file
     with open(output_file, "w") as log2_out:
diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py
index cf4277914..48c054ee5 100644
--- a/BALSAMIC/commands/config/case.py
+++ b/BALSAMIC/commands/config/case.py
@@ -132,12 +132,19 @@ def case_config(
     if cadd_annotations:
         references.update(cadd_annotations_path)
 
-    references = get_gens_references(
+    gens_references: dict[str, str] = get_gens_references(
         genome_interval=genome_interval,
         gens_coverage_pon=gens_coverage_pon,
         gnomad_min_af5=gnomad_min_af5,
         panel_bed=panel_bed,
-        references=references,
+    )
+    # Update references dictionary with GENS reference-files
+    references.update(
+        {
+            gens_file: path
+            for gens_file, path in gens_references.items()
+            if path is not None
+        }
     )
 
     variants_observations = {
diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json
index 10828dfcf..9a18e3381 100644
--- a/BALSAMIC/constants/cluster_analysis.json
+++ b/BALSAMIC/constants/cluster_analysis.json
@@ -11,7 +11,7 @@
 		"time": "00:15:00",
 		"n": 1
 	},
-	"gens_preprocessing": {
+	"gens_preprocess": {
 		"time": "01:00:00",
 		"n": 4
 	},
@@ -96,7 +96,7 @@
 		"time": "10:00:00",
 		"n": 5
 	},
-	"gatk_denoisereadcounts":{
+	"gatk_denoise_read_counts":{
 		"time": "10:00:00",
 		"n": 10
 	},
diff --git a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
index 466a8beb4..d5ee5dc80 100644
--- a/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
+++ b/BALSAMIC/snakemake_rules/variant_calling/gens_preprocessing.rule
@@ -1,7 +1,7 @@
 # vim: syntax=python tabstop=4 expandtab
 # coding: utf-8
 
-if config["analysis"]["sequencing_type"] == 'wgs':
+if config["analysis"]["sequencing_type"] == SequencingType.WGS:
     rule sentieon_DNAscope_gnomad:
         input:
             ref = config["reference"]["reference_genome"],
@@ -40,7 +40,7 @@ if config["analysis"]["sequencing_type"] == 'wgs':
     rm -rf {params.tmpdir};
           """
 
-    rule gatk_denoisereadcounts:
+    rule gatk_denoise_read_counts:
         input:
             gens_pon=config["reference"]["gens_coverage_pon"],
             readcounts_hdf5=cnv_dir + "{sample}.collectreadcounts.hdf5"
@@ -51,11 +51,11 @@ if config["analysis"]["sequencing_type"] == 'wgs':
             tmpdir=tempfile.mkdtemp(prefix=tmp_dir),
             sample="{sample}"
         benchmark:
-            Path(benchmark_dir,"gatk_denoisereadcounts_{sample}.tsv").as_posix()
+            Path(benchmark_dir,"gatk_denoise_read_counts_{sample}.tsv").as_posix()
         singularity:
             Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix()
         threads:
-            get_threads(cluster_config,"gatk_denoisereadcounts")
+            get_threads(cluster_config,"gatk_denoise_read_counts")
         message:
             "Running GATK DenoiseReadCounts on {params.sample} for GENS."
         shell:
@@ -72,7 +72,7 @@ if config["analysis"]["sequencing_type"] == 'wgs':
     rm -rf {params.tmpdir}
             """
 
-    rule gens_preprocessing_wgs:
+    rule gens_preprocess_wgs:
         input:
             denoised_cr = cnv_dir + "{sample}.denoisedCR.tsv",
             gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz",
@@ -80,19 +80,19 @@ if config["analysis"]["sequencing_type"] == 'wgs':
             gens_baf_bed = cnv_dir + "{sample}.baf.bed",
             gens_cov_bed = cnv_dir + "{sample}.cov.bed"
         params:
-            gens_preprocessing = get_script_path("preprocess_gens.py"),
+            gens_preprocess = get_script_path("preprocess_gens.py"),
             sequencing_type = sequencing_type,
             sample="{sample}"
         benchmark:
-            Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix()
+            Path(benchmark_dir, "gens_preprocess_wgs_{sample}.tsv").as_posix()
         threads:
-            get_threads(cluster_config, "gens_preprocessing")
+            get_threads(cluster_config, "gens_preprocess")
         message:
             "Formatting output for GENS for sample: {params.sample}."
         shell:
             """
-    python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ;
-    python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr}
+    python {params.gens_preprocess} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ;
+    python {params.gens_preprocess} -s {params.sequencing_type} -o {output.gens_cov_bed} create-coverage-regions --normalised-coverage-path {input.denoised_cr}
             """
 else:
     rule sentieon_DNAscope_gnomad_tga:
@@ -117,24 +117,24 @@ else:
             "Calling germline variants on positions in Gnomad AF > 0.05 using Sentieon DNAscope for {params.sample}"
         shell:
             """
-      export TMPDIR={params.tmpdir};
-      export SENTIEON_TMPDIR={params.tmpdir};
-      export SENTIEON_LICENSE={params.sentieon_lic};
+    export TMPDIR={params.tmpdir};
+    export SENTIEON_TMPDIR={params.tmpdir};
+    export SENTIEON_LICENSE={params.sentieon_lic};
   
-      {params.sentieon_exec} driver \
-      -t {threads} \
-      -r {input.ref} \
-      -i {input.bam} \
-      --interval {input.bed} \
-      --interval_padding 20 \
-      --algo DNAscope \
-      --pcr_indel_mode {params.pcr_model} \
-      --given {input.gnomad_af5} {output.gnomad_af5_vcf};
+     {params.sentieon_exec} driver \
+    -t {threads} \
+    -r {input.ref} \
+    -i {input.bam} \
+    --interval {input.bed} \
+    --interval_padding 20 \
+    --algo DNAscope \
+    --pcr_indel_mode {params.pcr_model} \
+    --given {input.gnomad_af5} {output.gnomad_af5_vcf};
   
-      rm -rf {params.tmpdir};
+    rm -rf {params.tmpdir};
             """
 
-    rule gens_preprocessing_tga:
+    rule gens_preprocess_tga:
         input:
             cnvkit_cnr = cnv_dir + "tumor.merged.cnr",
             gnomad_af5_vcf = cnv_dir + "SNV.germline.{sample}.dnascope_gnomad_af5.vcf.gz",
@@ -144,18 +144,18 @@ else:
             gens_cov_bed = cnv_dir + "{sample}.cov.bed"
         params:
             sequencing_type = sequencing_type,
-            gens_preprocessing = get_script_path("preprocess_gens.py"),
+            gens_preprocess = get_script_path("preprocess_gens.py"),
             gens_preprocess_cnvkit = get_script_path("postprocess_gens_cnvkit.py"),
             sample="{sample}"
         benchmark:
-            Path(benchmark_dir, "gens_preprocessing_{sample}.tsv").as_posix()
+            Path(benchmark_dir, "gens_preprocess_tga_{sample}.tsv").as_posix()
         threads:
-            get_threads(cluster_config, "gens_preprocessing")
+            get_threads(cluster_config, "gens_preprocess")
         message:
             "Formatting output for GENS for sample: {params.sample}."
         shell:
             """
-    python {params.gens_preprocessing} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ;
+    python {params.gens_preprocess} -s {params.sequencing_type} -o {output.gens_baf_bed} calculate-bafs --vcf-file-path {input.gnomad_af5_vcf} ;
     python {params.gens_preprocess_cnvkit} -o {output.gens_cov_bed} --normalised-coverage-path {input.cnvkit_cnr} --tumor-purity-path {input.purecn_purity_csv} 
             """
 
diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py
index e27573019..1e0873dfb 100644
--- a/BALSAMIC/utils/cli.py
+++ b/BALSAMIC/utils/cli.py
@@ -220,50 +220,37 @@ def get_gens_references(
     gens_coverage_pon: Optional[str],
     gnomad_min_af5: Optional[str],
     panel_bed: Optional[str],
-    references: Dict[str, str],
-) -> Dict:
+) -> Dict[str, str]:
     """
-    Assigns reference-files required for GENS if they have been supplied.
+    Assigns reference-files required for GENS if they have been supplied, else exists with error message.
 
-    :param genome_interval: Coverage-regions. (required for WGS GENS)
-    :param gens_coverage_pon: PON for GATK CollectReadCounts. (required for WGS GENS)
-    :param gnomad_min_af5: gnomad VCF filtered to keep variants above 5% VAF. (required for WGS and TGA GENS)
-    :param panel_bed: Bedfile supplied for TGA analyses.
-    :param references: Reference dictionary to be updated.
+    Args:
+        genome_interval: Optional[str]. Coverage-regions. (required for WGS GENS)
+        gens_coverage_pon: Optional[str]. PON for GATK CollectReadCounts. (required for WGS GENS)
+        gnomad_min_af5: Optional[str] gnomad VCF filtered to keep variants above 5% VAF (required for WGS and TGA GENS).
+        panel_bed: Optional[str] Bedfile supplied for TGA analyses.
 
-    :return: references: Updated reference dictionary.
+    Returns:
+         Dict[str, str] with paths to GENS reference files
     """
 
-    gens_arguments = [genome_interval, gens_coverage_pon, gnomad_min_af5]
-
-    # Check if any of the gens arguments are provided
-    if any(gens_arguments):
-        # Ensure all required arguments are provided or raise an error if panel_bed is not set
-        if not all(gens_arguments) and not panel_bed:
-            raise click.BadParameter(
-                "All three arguments (genome_interval, gens_coverage_pon, gnomad_min_af5) are required for GENS in WGS."
-            )
+    if panel_bed and gnomad_min_af5:
+        return {"gnomad_min_af5": gnomad_min_af5}
 
-        # Construct the gens_ref_files dictionary
-        gens_ref_files = (
-            {"gnomad_min_af5": gnomad_min_af5}
-            if panel_bed
-            else {
-                "genome_interval": genome_interval,
-                "gens_coverage_pon": gens_coverage_pon,
-                "gnomad_min_af5": gnomad_min_af5,
-            }
-        )
+    if gnomad_min_af5 and genome_interval and gens_coverage_pon:
+        return {
+            "genome_interval": genome_interval,
+            "gens_coverage_pon": gens_coverage_pon,
+            "gnomad_min_af5": gnomad_min_af5,
+        }
 
-        # Update references dictionary with gens values
-        references.update(
-            {
-                gens_file: path
-                for gens_file, path in gens_ref_files.items()
-                if path is not None
-            }
-        )
-    return references
+    error_message = (
+        "GENS reference file is always required to run BALSAMIC."
+        "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5"
+        "TGA requires argument: gnomad_min_af5"
+    )
+    LOG.error(error_message)
+    raise BalsamicError(error_message)
 
 
 def get_bioinfo_tools_version(
diff --git a/BALSAMIC/utils/io.py b/BALSAMIC/utils/io.py
index 555597a30..0b615265d 100644
--- a/BALSAMIC/utils/io.py
+++ b/BALSAMIC/utils/io.py
@@ -69,10 +69,11 @@ def write_json(json_obj: dict, path: str) -> None:
 
 def read_csv(csv_path: str, delimeter: str = ",") -> List[dict]:
     """Read data from a csv file."""
-    with open(csv_path, mode='r') as csv_file:
+    with open(csv_path, mode="r") as csv_file:
         csv_reader = csv.DictReader(csv_file, delimiter=delimeter)
         return [row for row in csv_reader]
 
+
 def read_yaml(yaml_path: str) -> dict:
     """Read data from a yaml file."""
     if Path(yaml_path).exists():

From 9688e3df245c2d4b9a9c63d36b0946c871e8a3f4 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Thu, 1 Aug 2024 14:13:25 +0200
Subject: [PATCH 35/39] new purity and ploidy formula from PureCN

---
 .../assets/scripts/postprocess_gens_cnvkit.py | 35 ++++++++-----------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 24e54c108..19b56331f 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -12,15 +12,10 @@ def calculate_log2_ratio(purity, log2_ratio, ploidy):
     if ploidy <= 0:
         raise ValueError("Ploidy must be a positive number")
 
-    # Calculate the log2 ratio
-    numerator = (1 - purity) + (purity * log2_ratio)
+    # Ploidy and purity adjustment formula reference to PureCN issue: https://github.com/lima1/PureCN/issues/40
+    log2_adjusted = (purity*log2_ratio*ploidy + 2*(1-purity)*log2_ratio - 2*(1-purity)) / (purity*ploidy)
 
-    if numerator <= 0:
-        return None
-
-    log2_ratio = np.log2(numerator / ploidy)
-
-    return log2_ratio
+    return log2_adjusted
 
 
 @click.command()
@@ -69,26 +64,24 @@ def create_gens_cov_file(
     purity = float(purecn_dict_list[0]["Purity"])
     ploidy = float(purecn_dict_list[0]["Ploidy"])
 
-    count_none_log2 = 0
     for row in cnr_dict_list:
         if row["gene"] == "Antitarget":
             continue
+
+        # find midpoint
         start: int = int(row["start"])
         end: int = int(row["end"])
-        midpoint: int = start + int((end - start / 2))
+        region_size: int = end - start
+        midpoint: int = start + int(region_size / 2)
+
+        # adjust log2 ratio
         log2: float = float(row["log2"])
-        log2: float | None = calculate_log2_ratio(purity, log2, ploidy)
-        if not log2:
-            count_none_log2 += 1
-            warnings.warn(
-                "Numerator is less than or equal to 0, returning None for region."
-            )
-            continue
-            log2: float = round(log2, 4)
+        log2: float = calculate_log2_ratio(purity, log2, ploidy)
+        log2: float = round(log2, 4)
+
+        # store values in list
         log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
-    warnings.warn(
-        f"Some regions could not be transformed due to invalid values after plodiy and purity adjustment: {count_none_log2}"
-    )
+
 
     # Write log2 data to output file
     with open(output_file, "w") as log2_out:

From 3abae1bf60a0f8ab69eae34f18d6172c82d6732d Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 2 Aug 2024 15:26:16 +0200
Subject: [PATCH 36/39] fix pytests

---
 BALSAMIC/commands/config/case.py            |  29 ++---
 BALSAMIC/utils/cli.py                       |   4 +-
 tests/commands/config/test_config_sample.py |  96 ++++++++++------
 tests/conftest.py                           | 117 ++++++++++++++------
 4 files changed, 159 insertions(+), 87 deletions(-)

diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py
index 48c054ee5..77839f68c 100644
--- a/BALSAMIC/commands/config/case.py
+++ b/BALSAMIC/commands/config/case.py
@@ -132,20 +132,21 @@ def case_config(
     if cadd_annotations:
         references.update(cadd_annotations_path)
 
-    gens_references: dict[str, str] = get_gens_references(
-        genome_interval=genome_interval,
-        gens_coverage_pon=gens_coverage_pon,
-        gnomad_min_af5=gnomad_min_af5,
-        panel_bed=panel_bed,
-    )
-    # Update references dictionary with GENS reference-files
-    references.update(
-        {
-            gens_file: path
-            for gens_file, path in gens_references.items()
-            if path is not None
-        }
-    )
+    if analysis_workflow is not AnalysisWorkflow.BALSAMIC_QC:
+        gens_references: dict[str, str] = get_gens_references(
+            genome_interval=genome_interval,
+            gens_coverage_pon=gens_coverage_pon,
+            gnomad_min_af5=gnomad_min_af5,
+            panel_bed=panel_bed,
+        )
+        # Update references dictionary with GENS reference-files
+        references.update(
+            {
+                gens_file: path
+                for gens_file, path in gens_references.items()
+                if path is not None
+            }
+        )
 
     variants_observations = {
         "clinical_snv_observations": clinical_snv_observations,
diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py
index 1e0873dfb..cdd02e680 100644
--- a/BALSAMIC/utils/cli.py
+++ b/BALSAMIC/utils/cli.py
@@ -245,8 +245,8 @@ def get_gens_references(
         }
 
     error_message = (
-        "GENS reference file is always required to run BALSAMIC."
-        "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5"
+        "GENS reference file is always required to run BALSAMIC.\n"
+        "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5\n"
         "TGA requires argument: gnomad_min_af5"
     )
     LOG.error(error_message)
diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py
index 6296e5939..8e5456251 100644
--- a/tests/commands/config/test_config_sample.py
+++ b/tests/commands/config/test_config_sample.py
@@ -17,8 +17,7 @@ def test_tumor_normal_config(
     fastq_dir_tumor_normal_parameterize: str,
     balsamic_cache: str,
     panel_bed_file: str,
-    sentieon_license: str,
-    sentieon_install_dir: str,
+    config_case_cli_tga: list[str],
 ):
     """Test tumor normal balsamic config case command."""
 
@@ -44,11 +43,8 @@ def test_tumor_normal_config(
             tumor_sample_name,
             "--normal-sample-name",
             normal_sample_name,
-            "--sentieon-install-dir",
-            sentieon_install_dir,
-            "--sentieon-license",
-            sentieon_license,
-        ],
+        ]
+        + config_case_cli_tga,
     )
 
     # THEN a config should be created and exist
@@ -66,8 +62,7 @@ def test_tumor_only_config(
     fastq_dir_tumor_only: str,
     balsamic_cache: str,
     panel_bed_file: str,
-    sentieon_license: str,
-    sentieon_install_dir: str,
+    config_case_cli_tga: list[str],
 ):
     """Test tumor only balsamic config case command."""
 
@@ -90,11 +85,8 @@ def test_tumor_only_config(
             balsamic_cache,
             "--tumor-sample-name",
             tumor_sample_name,
-            "--sentieon-install-dir",
-            sentieon_install_dir,
-            "--sentieon-license",
-            sentieon_license,
-        ],
+        ]
+        + config_case_cli_tga,
     )
 
     # THEN a config should be created and exist
@@ -199,8 +191,7 @@ def test_tumor_only_umi_config_background_file(
     balsamic_cache: str,
     panel_bed_file: str,
     background_variant_file: str,
-    sentieon_license: str,
-    sentieon_install_dir: str,
+    config_case_cli_tga: list[str],
 ):
     """Test balsamic umi config case providing a background variants file."""
 
@@ -227,11 +218,8 @@ def test_tumor_only_umi_config_background_file(
             balsamic_cache,
             "--tumor-sample-name",
             tumor_sample_name,
-            "--sentieon-install-dir",
-            sentieon_install_dir,
-            "--sentieon-license",
-            sentieon_license,
-        ],
+        ]
+        + config_case_cli_tga
     )
 
     # THEN program exits and checks for filepath
@@ -250,6 +238,7 @@ def test_pon_cnn_file(
     case_id_tumor_only: str,
     sentieon_license: str,
     sentieon_install_dir: str,
+    config_case_cli_tga: list[str],
 ):
     """Test balsamic config case with a PON reference."""
 
@@ -274,11 +263,8 @@ def test_pon_cnn_file(
             balsamic_cache,
             "--tumor-sample-name",
             tumor_sample_name,
-            "--sentieon-install-dir",
-            sentieon_install_dir,
-            "--sentieon-license",
-            sentieon_license,
-        ],
+        ]
+        + config_case_cli_tga,
     )
 
     # THEN program exits and checks for filepath
@@ -390,7 +376,7 @@ def test_config_graph_failed(
     assert case_result.exit_code == 1
 
 
-def test_missing_required_gens_arguments(
+def test_missing_required_gens_arguments_wgs(
     invoke_cli,
     tumor_sample_name: str,
     analysis_dir: str,
@@ -432,12 +418,54 @@ def test_missing_required_gens_arguments(
         ],
     )
     # THEN the CLI should exit code 2 and display an informative error message
-    assert result.exit_code == 2
+    assert result.exit_code == 1
     assert (
-        "All three arguments (genome_interval, gens_coverage_pon, gnomad_min_af5) are required for GENS in WGS."
+        "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5"
         in result.output
     )
 
+def test_missing_required_gens_arguments_tga(
+    invoke_cli,
+    tumor_sample_name: str,
+    analysis_dir: str,
+    balsamic_cache: str,
+    fastq_dir_tumor_only: str,
+    panel_bed_file: str,
+    case_id_tumor_only: str,
+    sentieon_license: str,
+    sentieon_install_dir: str,
+):
+    """Test balsamic config case without required GENS arguments."""
+
+    # GIVEN CLI arguments including optional GENS input-files
+
+    # WHEN invoking the config case command
+    result = invoke_cli(
+        [
+            "config",
+            "case",
+            "--case-id",
+            case_id_tumor_only,
+            "--analysis-dir",
+            analysis_dir,
+            "--fastq-path",
+            fastq_dir_tumor_only,
+            "--balsamic-cache",
+            balsamic_cache,
+            "--tumor-sample-name",
+            tumor_sample_name,
+            "--sentieon-install-dir",
+            sentieon_install_dir,
+            "--sentieon-license",
+            sentieon_license,
+        ],
+    )
+    # THEN the CLI should exit code 2 and display an informative error message
+    assert result.exit_code == 1
+    assert (
+        "TGA requires argument: gnomad_min_af5"
+        in result.output
+    )
 
 def test_config_with_gens_arguments(
     invoke_cli,
@@ -589,8 +617,7 @@ def test_config_tga_with_exome(
     fastq_dir_tumor_only: str,
     case_id_tumor_only: str,
     panel_bed_file: str,
-    sentieon_license: str,
-    sentieon_install_dir: str,
+    config_case_cli_tga: list[str],
 ):
     """Test balsamic config case with GENS arguments for TGA."""
 
@@ -614,11 +641,8 @@ def test_config_tga_with_exome(
             "-p",
             panel_bed_file,
             "--exome",
-            "--sentieon-install-dir",
-            sentieon_install_dir,
-            "--sentieon-license",
-            sentieon_license,
-        ],
+        ]
+        + config_case_cli_tga,
     )
     # THEN a config should be created and exist
     assert result.exit_code == 0
diff --git a/tests/conftest.py b/tests/conftest.py
index f3141ae8d..59be9c902 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1232,7 +1232,7 @@ def balsamic_pon_model(
 
 
 @pytest.fixture(scope="session")
-def config_case_cli(
+def config_case_cli_wgs(
     balsamic_cache: str,
     background_variant_file: str,
     cadd_annotations: str,
@@ -1245,6 +1245,9 @@ def config_case_cli(
     cancer_somatic_snv_observations_path: str,
     sentieon_license: str,
     sentieon_install_dir: str,
+    gens_hg19_interval_list: str,
+    gens_min_5_af_gnomad_file: str,
+    gens_cov_pon_file: str,
 ) -> List[str]:
     """Return common config case CLI."""
     return [
@@ -1272,8 +1275,59 @@ def config_case_cli(
         sentieon_install_dir,
         "--sentieon-license",
         sentieon_license,
+        "--genome-interval",
+        gens_hg19_interval_list,
+        "--gnomad-min-af5",
+        gens_min_5_af_gnomad_file,
+        "--gens-coverage-pon",
+        gens_cov_pon_file,
     ]
 
+@pytest.fixture(scope="session")
+def config_case_cli_tga(
+    balsamic_cache: str,
+    background_variant_file: str,
+    cadd_annotations: str,
+    swegen_snv_frequency_path: str,
+    swegen_sv_frequency_path: str,
+    clinical_snv_observations_path: str,
+    clinical_sv_observations_path: str,
+    somatic_sv_observations_path: str,
+    cancer_germline_snv_observations_path: str,
+    cancer_somatic_snv_observations_path: str,
+    sentieon_license: str,
+    sentieon_install_dir: str,
+    gens_min_5_af_gnomad_file: str,
+) -> List[str]:
+    """Return common config case CLI."""
+    return [
+        "--balsamic-cache",
+        balsamic_cache,
+        "--background-variants",
+        background_variant_file,
+        "--cadd-annotations",
+        cadd_annotations,
+        "--swegen-snv",
+        swegen_snv_frequency_path,
+        "--swegen-sv",
+        swegen_sv_frequency_path,
+        "--clinical-snv-observations",
+        clinical_snv_observations_path,
+        "--clinical-sv-observations",
+        clinical_sv_observations_path,
+        "--cancer-somatic-sv-observations",
+        somatic_sv_observations_path,
+        "--cancer-germline-snv-observations",
+        cancer_germline_snv_observations_path,
+        "--cancer-somatic-snv-observations",
+        cancer_somatic_snv_observations_path,
+        "--sentieon-install-dir",
+        sentieon_install_dir,
+        "--sentieon-license",
+        sentieon_license,
+        "--gnomad-min-af5",
+        gens_min_5_af_gnomad_file,
+    ]
 
 @pytest.fixture(scope="session")
 def tumor_only_config_qc(
@@ -1282,7 +1336,7 @@ def tumor_only_config_qc(
     fastq_dir_tumor_only_qc: str,
     tumor_sample_name: str,
     panel_bed_file: str,
-    config_case_cli: list[str],
+    config_case_cli_tga: list[str],
 ) -> str:
     """Invoke balsamic config sample to create sample configuration file for tumor-only TGA."""
 
@@ -1305,7 +1359,7 @@ def tumor_only_config_qc(
             "-p",
             panel_bed_file,
         ]
-        + config_case_cli,
+        + config_case_cli_tga,
     )
 
     return Path(
@@ -1319,8 +1373,9 @@ def tumor_normal_config_qc(
     tumor_sample_name: str,
     normal_sample_name: str,
     analysis_dir: str,
+    panel_bed_file: str,
     fastq_dir_tumor_normal_qc: str,
-    config_case_cli: list[str],
+    config_case_cli_tga: list[str],
 ) -> str:
     """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA QC workflow."""
 
@@ -1342,8 +1397,10 @@ def tumor_normal_config_qc(
             tumor_sample_name,
             "--normal-sample-name",
             normal_sample_name,
+            "-p",
+            panel_bed_file,
         ]
-        + config_case_cli,
+        + config_case_cli_tga,
     )
 
     return Path(
@@ -1360,7 +1417,7 @@ def tumor_normal_config_qc_wgs(
     fastq_dir_tumor_normal_qc_wgs: str,
     tumor_sample_name: str,
     normal_sample_name: str,
-    config_case_cli: List[str],
+    config_case_cli_wgs: List[str],
 ) -> str:
     """Invoke balsamic config sample to create sample configuration file for tumor-normal WGS QC workflow."""
 
@@ -1383,7 +1440,7 @@ def tumor_normal_config_qc_wgs(
             "--normal-sample-name",
             normal_sample_name,
         ]
-        + config_case_cli,
+        + config_case_cli_wgs,
     )
 
     return Path(
@@ -1400,7 +1457,7 @@ def tumor_only_config(
     analysis_dir: str,
     fastq_dir_tumor_only: str,
     panel_bed_file: str,
-    config_case_cli: list[str],
+    config_case_cli_tga: list[str],
 ) -> str:
     """Invoke balsamic config sample to create sample configuration file for tumor-only TGA."""
 
@@ -1421,7 +1478,7 @@ def tumor_only_config(
             "-p",
             panel_bed_file,
         ]
-        + config_case_cli,
+        + config_case_cli_tga,
     )
     return Path(
         analysis_dir,
@@ -1438,7 +1495,7 @@ def tumor_normal_config(
     analysis_dir: str,
     fastq_dir_tumor_normal: str,
     panel_bed_file: str,
-    config_case_cli: list[str],
+    config_case_cli_tga: list[str],
 ) -> str:
     """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA."""
 
@@ -1461,7 +1518,7 @@ def tumor_normal_config(
             "-p",
             panel_bed_file,
         ]
-        + config_case_cli,
+        + config_case_cli_tga,
     )
 
     return Path(
@@ -1478,7 +1535,7 @@ def tumor_only_umi_config(
     analysis_dir: str,
     fastq_dir_tumor_only: str,
     panel_bed_file: str,
-    config_case_cli: list[str],
+    config_case_cli_tga: list[str],
 ) -> str:
     """Invoke balsamic config sample to create sample configuration file for tumor-only TGA."""
 
@@ -1501,7 +1558,7 @@ def tumor_only_umi_config(
             "-p",
             panel_bed_file,
         ]
-        + config_case_cli,
+        + config_case_cli_tga,
     )
 
     return Path(
@@ -1519,7 +1576,7 @@ def tumor_normal_umi_config(
     analysis_dir: str,
     fastq_dir_tumor_normal: str,
     panel_bed_file: str,
-    config_case_cli: list[str],
+    config_case_cli_tga: list[str],
 ) -> str:
     """Invoke balsamic config sample to create sample configuration file for tumor-normal TGA."""
 
@@ -1546,7 +1603,7 @@ def tumor_normal_umi_config(
             "-p",
             panel_bed_file,
         ]
-        + config_case_cli,
+        + config_case_cli_tga,
     )
 
     return Path(
@@ -1562,7 +1619,7 @@ def tumor_only_wgs_config(
     analysis_dir: str,
     fastq_dir_tumor_only_wgs: str,
     tumor_sample_name: str,
-    config_case_cli: List[str],
+    config_case_cli_wgs: List[str],
 ) -> str:
     """Invoke balsamic config sample to create sample configuration file for tumor-only WGS."""
 
@@ -1581,7 +1638,7 @@ def tumor_only_wgs_config(
             "--tumor-sample-name",
             tumor_sample_name,
         ]
-        + config_case_cli,
+        + config_case_cli_wgs,
     )
 
     return Path(
@@ -1598,7 +1655,7 @@ def tumor_normal_wgs_config(
     fastq_dir_tumor_normal_wgs: str,
     tumor_sample_name: str,
     normal_sample_name: str,
-    config_case_cli: str,
+    config_case_cli_wgs: List[str],
 ) -> str:
     """Invoke balsamic config sample to create sample configuration file for tumor-normal WGS."""
 
@@ -1619,7 +1676,7 @@ def tumor_normal_wgs_config(
             "--normal-sample-name",
             normal_sample_name,
         ]
-        + config_case_cli,
+        + config_case_cli_wgs,
     )
 
     return Path(
@@ -1638,8 +1695,7 @@ def tumor_only_config_dummy_vep(
     fastq_dir_tumor_only_dummy_vep: str,
     panel_bed_file: str,
     background_variant_file: str,
-    sentieon_license: str,
-    sentieon_install_dir: str,
+    config_case_cli_tga: list[str],
 ) -> str:
     """Invoke balsamic config sample to create sample configuration file for tumor-only TGA with dummy VEP file."""
 
@@ -1663,11 +1719,8 @@ def tumor_only_config_dummy_vep(
             background_variant_file,
             "--tumor-sample-name",
             tumor_sample_name,
-            "--sentieon-install-dir",
-            sentieon_install_dir,
-            "--sentieon-license",
-            sentieon_license,
-        ],
+        ]
+        + config_case_cli_tga,
     )
     return Path(
         analysis_dir,
@@ -1685,8 +1738,7 @@ def tumor_only_pon_config(
     panel_bed_file: str,
     pon_cnn_path: str,
     balsamic_cache: str,
-    sentieon_license: str,
-    sentieon_install_dir: str,
+    config_case_cli_tga: List[str],
 ) -> str:
     """Invoke balsamic PON config sample to create sample configuration file for tumor-only TGA."""
 
@@ -1706,15 +1758,10 @@ def tumor_only_pon_config(
             panel_bed_file,
             "--pon-cnn",
             pon_cnn_path,
-            "--balsamic-cache",
-            balsamic_cache,
             "--tumor-sample-name",
             tumor_sample_name,
-            "--sentieon-install-dir",
-            sentieon_install_dir,
-            "--sentieon-license",
-            sentieon_license,
-        ],
+        ]
+        + config_case_cli_tga
     )
 
     return Path(

From ce9578685ad10f79eb340528700e8ffd5bbde059 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Fri, 2 Aug 2024 15:27:27 +0200
Subject: [PATCH 37/39] black

---
 BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py | 5 +++--
 tests/commands/config/test_config_sample.py        | 7 +++----
 tests/conftest.py                                  | 4 +++-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index 19b56331f..d266465c3 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -13,7 +13,9 @@ def calculate_log2_ratio(purity, log2_ratio, ploidy):
         raise ValueError("Ploidy must be a positive number")
 
     # Ploidy and purity adjustment formula reference to PureCN issue: https://github.com/lima1/PureCN/issues/40
-    log2_adjusted = (purity*log2_ratio*ploidy + 2*(1-purity)*log2_ratio - 2*(1-purity)) / (purity*ploidy)
+    log2_adjusted = (
+        purity * log2_ratio * ploidy + 2 * (1 - purity) * log2_ratio - 2 * (1 - purity)
+    ) / (purity * ploidy)
 
     return log2_adjusted
 
@@ -82,7 +84,6 @@ def create_gens_cov_file(
         # store values in list
         log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
 
-
     # Write log2 data to output file
     with open(output_file, "w") as log2_out:
         for resolution in ["o", "a", "b", "c", "d"]:
diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py
index 8e5456251..c4abfed93 100644
--- a/tests/commands/config/test_config_sample.py
+++ b/tests/commands/config/test_config_sample.py
@@ -424,6 +424,7 @@ def test_missing_required_gens_arguments_wgs(
         in result.output
     )
 
+
 def test_missing_required_gens_arguments_tga(
     invoke_cli,
     tumor_sample_name: str,
@@ -462,10 +463,8 @@ def test_missing_required_gens_arguments_tga(
     )
     # THEN the CLI should exit code 2 and display an informative error message
     assert result.exit_code == 1
-    assert (
-        "TGA requires argument: gnomad_min_af5"
-        in result.output
-    )
+    assert "TGA requires argument: gnomad_min_af5" in result.output
+
 
 def test_config_with_gens_arguments(
     invoke_cli,
diff --git a/tests/conftest.py b/tests/conftest.py
index 59be9c902..711a1c61b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1283,6 +1283,7 @@ def config_case_cli_wgs(
         gens_cov_pon_file,
     ]
 
+
 @pytest.fixture(scope="session")
 def config_case_cli_tga(
     balsamic_cache: str,
@@ -1329,6 +1330,7 @@ def config_case_cli_tga(
         gens_min_5_af_gnomad_file,
     ]
 
+
 @pytest.fixture(scope="session")
 def tumor_only_config_qc(
     case_id_tumor_only_qc: str,
@@ -1761,7 +1763,7 @@ def tumor_only_pon_config(
             "--tumor-sample-name",
             tumor_sample_name,
         ]
-        + config_case_cli_tga
+        + config_case_cli_tga,
     )
 
     return Path(

From 6030b877bf7de313e35ade56908b6f351c11be03 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Tue, 13 Aug 2024 13:36:19 +0200
Subject: [PATCH 38/39] add io test

---
 .../assets/scripts/postprocess_gens_cnvkit.py |  3 ---
 BALSAMIC/commands/config/case.py              | 17 ++++++++-------
 BALSAMIC/utils/cli.py                         | 21 ++++++++-----------
 tests/commands/config/test_config_sample.py   |  9 ++++----
 tests/utils/test_utils.py                     | 15 +++++++++++++
 5 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index d266465c3..a50e354d0 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -1,8 +1,5 @@
 import click
 from BALSAMIC.utils.io import read_csv
-import numpy as np
-import warnings
-
 
 def calculate_log2_ratio(purity, log2_ratio, ploidy):
     # Ensure that the inputs are within valid ranges
diff --git a/BALSAMIC/commands/config/case.py b/BALSAMIC/commands/config/case.py
index 77839f68c..0c001b1a7 100644
--- a/BALSAMIC/commands/config/case.py
+++ b/BALSAMIC/commands/config/case.py
@@ -139,14 +139,15 @@ def case_config(
             gnomad_min_af5=gnomad_min_af5,
             panel_bed=panel_bed,
         )
-        # Update references dictionary with GENS reference-files
-        references.update(
-            {
-                gens_file: path
-                for gens_file, path in gens_references.items()
-                if path is not None
-            }
-        )
+        if gens_references:
+            # Update references dictionary with GENS reference-files
+            references.update(
+                {
+                    gens_file: path
+                    for gens_file, path in gens_references.items()
+                    if path is not None
+                }
+            )
 
     variants_observations = {
         "clinical_snv_observations": clinical_snv_observations,
diff --git a/BALSAMIC/utils/cli.py b/BALSAMIC/utils/cli.py
index cdd02e680..a146fa4d7 100644
--- a/BALSAMIC/utils/cli.py
+++ b/BALSAMIC/utils/cli.py
@@ -220,18 +220,18 @@ def get_gens_references(
     gens_coverage_pon: Optional[str],
     gnomad_min_af5: Optional[str],
     panel_bed: Optional[str],
-) -> Dict[str, str]:
+) -> Dict[str, str] | None:
     """
     Assigns reference-files required for GENS if they have been supplied, else exists with error message.
 
     Args:
-        genome_interval: Optional[str]. Coverage-regions. (required for WGS GENS)
-        gens_coverage_pon: Optional[str]. PON for GATK CollectReadCounts. (required for WGS GENS)
+        genome_interval: Optional[str] Coverage-regions. (required for WGS GENS)
+        gens_coverage_pon: Optional[str] PON for GATK CollectReadCounts. (required for WGS GENS)
         gnomad_min_af5: Optional[str] gnomad VCF filtered to keep variants above 5% VAF (required for WGS and TGA GENS).
         panel_bed: Optional[str] Bedfile supplied for TGA analyses.
 
     Returns:
-         Dict[str, str] with paths to GENS reference files
+         Dict[str, str] with paths to GENS reference files or None
     """
 
     if panel_bed and gnomad_min_af5:
@@ -243,14 +243,11 @@ def get_gens_references(
             "gens_coverage_pon": gens_coverage_pon,
             "gnomad_min_af5": gnomad_min_af5,
         }
-
-    error_message = (
-        "GENS reference file is always required to run BALSAMIC.\n"
-        "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5\n"
-        "TGA requires argument: gnomad_min_af5"
-    )
-    LOG.error(error_message)
-    raise BalsamicError(error_message)
+    if any([gnomad_min_af5, genome_interval, gens_coverage_pon]):
+        error_message = "GENS for WGS requires all arguments: genome_interval, gens_coverage_pon, gnomad_min_af5"
+        LOG.error(error_message)
+        raise BalsamicError(error_message)
+    return None
 
 
 def get_bioinfo_tools_version(
diff --git a/tests/commands/config/test_config_sample.py b/tests/commands/config/test_config_sample.py
index c4abfed93..aa76ca6a2 100644
--- a/tests/commands/config/test_config_sample.py
+++ b/tests/commands/config/test_config_sample.py
@@ -420,12 +420,12 @@ def test_missing_required_gens_arguments_wgs(
     # THEN the CLI should exit code 2 and display an informative error message
     assert result.exit_code == 1
     assert (
-        "WGS requires arguments: genome_interval, gens_coverage_pon, gnomad_min_af5"
+        "GENS for WGS requires all arguments: genome_interval, gens_coverage_pon, gnomad_min_af5"
         in result.output
     )
 
 
-def test_missing_required_gens_arguments_tga(
+def test_missing_gens_arguments_tga(
     invoke_cli,
     tumor_sample_name: str,
     analysis_dir: str,
@@ -461,9 +461,8 @@ def test_missing_required_gens_arguments_tga(
             sentieon_license,
         ],
     )
-    # THEN the CLI should exit code 2 and display an informative error message
-    assert result.exit_code == 1
-    assert "TGA requires argument: gnomad_min_af5" in result.output
+    # THEN the CLI should exit code 0
+    assert result.exit_code == 0
 
 
 def test_config_with_gens_arguments(
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
index eb0d7fbe6..0cd721f78 100644
--- a/tests/utils/test_utils.py
+++ b/tests/utils/test_utils.py
@@ -46,6 +46,7 @@
 from BALSAMIC.utils.exc import BalsamicError, WorkflowRunError
 from BALSAMIC.utils.io import (
     read_json,
+    read_csv,
     read_vcf_file,
     read_yaml,
     write_finish_file,
@@ -479,6 +480,20 @@ def test_read_json(config_path: str):
     assert type(config_dict) is dict
 
 
+def test_read_csv(purity_csv_path: str):
+    """Test data extraction from a CSV file."""
+
+    # GIVEN a CSV path
+
+    # WHEN calling the function
+    csv_list: list[Dict] = read_csv(purity_csv_path)
+
+    # THEN the config.json file should be correctly parsed
+    assert len(csv_list) == 1
+    assert csv_list[0]["Purity"] == "0.64"
+    assert csv_list[0]["Sampleid"] == "tumor.initial"
+
+
 def test_read_json_error():
     """Test data extraction from a BALSAMIC config JSON file for an invalid path."""
 

From 604f0596ad111811595505349b3d8090454f7087 Mon Sep 17 00:00:00 2001
From: Mathias Johansson <math.joh.bio@gmail.com>
Date: Tue, 13 Aug 2024 14:50:24 +0200
Subject: [PATCH 39/39] add new test

---
 .../assets/scripts/postprocess_gens_cnvkit.py |  9 ++++-----
 BALSAMIC/utils/io.py                          |  5 +++++
 tests/utils/test_utils.py                     | 20 +++++++++++++++++++
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
index a50e354d0..2899091ff 100644
--- a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
+++ b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -1,5 +1,5 @@
 import click
-from BALSAMIC.utils.io import read_csv
+from BALSAMIC.utils.io import read_csv, write_list_of_strings
 
 def calculate_log2_ratio(purity, log2_ratio, ploidy):
     # Ensure that the inputs are within valid ranges
@@ -82,10 +82,9 @@ def create_gens_cov_file(
         log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
 
     # Write log2 data to output file
-    with open(output_file, "w") as log2_out:
-        for resolution in ["o", "a", "b", "c", "d"]:
-            for line in log2_data:
-                log2_out.write(f"{resolution}_{line}\n")
+    resolutions = ["o", "a", "b", "c", "d"]
+    resolution_log2_lines = [f"{resolution}_{line}" for resolution in resolutions for line in log2_data]
+    write_list_of_strings(resolution_log2_lines, output_file)
 
 
 if __name__ == "__main__":
diff --git a/BALSAMIC/utils/io.py b/BALSAMIC/utils/io.py
index 0b615265d..315899f93 100644
--- a/BALSAMIC/utils/io.py
+++ b/BALSAMIC/utils/io.py
@@ -73,6 +73,11 @@ def read_csv(csv_path: str, delimeter: str = ",") -> List[dict]:
         csv_reader = csv.DictReader(csv_file, delimiter=delimeter)
         return [row for row in csv_reader]
 
+def write_list_of_strings(list_of_strings: list[str], output_file: str):
+    """Writes a list of strings to a file, each on a new line."""
+    with open(output_file, 'w') as file:
+        for string in list_of_strings:
+            file.write(f"{string}\n")
 
 def read_yaml(yaml_path: str) -> dict:
     """Read data from a yaml file."""
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
index 0cd721f78..503134c2e 100644
--- a/tests/utils/test_utils.py
+++ b/tests/utils/test_utils.py
@@ -49,6 +49,7 @@
     read_csv,
     read_vcf_file,
     read_yaml,
+    write_list_of_strings,
     write_finish_file,
     write_json,
     write_sacct_to_yaml,
@@ -455,6 +456,25 @@ def test_write_json(tmp_path, reference):
 
     assert len(list(tmp.iterdir())) == 1
 
+def test_write_list_of_strings(tmp_path):
+    """Test writing list of strings to file"""
+
+    # GIVEN a list of strings and an output file path
+    list_of_strings = [f"header1\theader2\theader3", f"row1_col1\trow1_col2\trow1_col3"]
+    tmp = tmp_path / "tmp"
+    tmp.mkdir()
+    output_file: Path = Path(tmp / "output.csv")
+
+    # WHEN writing list of strings
+    write_list_of_strings(list_of_strings, output_file.as_posix())
+
+    # THEN file should have been written
+    assert output_file.exists()
+
+    # AND contain the same information
+    read_written_file = read_csv(csv_path=output_file, delimeter="\t")
+    assert read_written_file == [{"header1": "row1_col1", "header2": "row1_col2", "header3": "row1_col3"}]
+    assert len(read_written_file) == 1
 
 def test_write_json_error(tmp_path: Path):
     """Test JSON write error."""