From fc034e5cc0a462396a1824f2f379707ec5d5e852 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Mon, 6 May 2024 14:01:32 +0000
Subject: [PATCH 01/20] docs: update citation

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5852dd0a..093b43b0 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
 At present, `BGCFlow` is only tested and confirmed to work on **Linux** systems with `conda` / `mamba` package manager.
 
 ## Publication
-> Matin Nuhamunada, Omkar S. Mohite, Patrick V. Phaneuf, Bernhard O. Palsson, and Tilmann Weber. (2023). BGCFlow: Systematic pangenome workflow for the analysis of biosynthetic gene clusters across large genomic datasets. bioRxiv 2023.06.14.545018; doi: [https://doi.org/10.1101/2023.06.14.545018](https://doi.org/10.1101/2023.06.14.545018)
+> Matin Nuhamunada, Omkar S Mohite, Patrick V Phaneuf, Bernhard O Palsson, Tilmann Weber, BGCFlow: systematic pangenome workflow for the analysis of biosynthetic gene clusters across large genomic datasets, Nucleic Acids Research, 2024, gkae314, [https://doi.org/10.1093/nar/gkae314](https://doi.org/10.1093/nar/gkae314)
 
 ## Pre-requisites
 `BGCFlow` requires `gcc` and the `conda`/`mamba` package manager. See [installation instruction](https://github.com/NBChub/bgcflow/wiki/00-Installation-Guide) for details.
@@ -27,7 +27,7 @@ A quick and easy way to use `BGCFlow` using the command line interface wrapper:
 
 ```bash
 # create and activate a new conda environment
-conda create -n bgcflow -c conda-forge python=3.11 pip openjdk -y # also install java for metabase
+mamba create -n bgcflow -c conda-forge python=3.11 pip openjdk -y # also install java for metabase
 conda activate bgcflow
 
 # install `BGCFlow` wrapper

From 5deef98e1541eda4bb8e53e4c80896088da6b49f Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Wed, 8 May 2024 09:50:07 +0200
Subject: [PATCH 02/20] fix: upgrade bgc genome preparation script to handle
 weird input

---
 workflow/rules/bgc.smk | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/workflow/rules/bgc.smk b/workflow/rules/bgc.smk
index 353b879e..b0ce9af7 100644
--- a/workflow/rules/bgc.smk
+++ b/workflow/rules/bgc.smk
@@ -5,6 +5,7 @@ rule downstream_bgc_prep:
         ),
         table="data/processed/{name}/tables/df_gtdb_meta.csv",
     output:
+        input_list=temp("data/interim/bgcs/{name}/{version}/input_list.txt"),
         taxonomy="data/interim/bgcs/taxonomy/taxonomy_{name}_antismash_{version}.tsv",
         outdir=directory("data/interim/bgcs/{name}/{version}"),
         bgc_mapping="data/interim/bgcs/{name}/{name}_antismash_{version}.csv",
@@ -12,33 +13,29 @@ rule downstream_bgc_prep:
         "../envs/bgc_analytics.yaml"
     params:
         dataset="data/interim/bgcs/datasets.tsv",
-    log:
-        general="logs/bgcs/downstream_bgc_prep/{name}/downstream_bgc_prep-{version}.log",
-        symlink="logs/bgcs/downstream_bgc_prep/{name}/bgc_downstream_bgc_prep-{version}.log",
-        taxonomy="logs/bgcs/downstream_bgc_prep/{name}/tax_downstream_bgc_prep-{version}.log",
+    log: "logs/bgcs/downstream_bgc_prep/{name}/downstream_bgc_prep-{version}.log",
     shell:
         """
-        echo "Preparing BGCs for {wildcards.name} downstream analysis..." > {log.general}
-        #mkdir -p {output.outdir} 2>> {log.general}
-        # Generate symlink for each regions in genomes in dataset
-        for i in $(dirname {input.gbk})
-        do
-            echo Processing $i >> {log.symlink}
-            python workflow/bgcflow/bgcflow/data/bgc_downstream_prep.py $i {output.outdir} 2>> {log.symlink}
-        done
-        # generate taxonomic information for dataset
-        python workflow/bgcflow/bgcflow/data/bigslice_prep.py {input.table} {output.taxonomy} 2>> {log.taxonomy}
+        echo "Preparing BGCs for {wildcards.name} downstream analysis..." >> {log}
+
+        echo "Step 1. Generate symlink for each regions in genomes in dataset" >> {log}
+        echo {input.gbk} | tr ' ' '\n' > {output.input_list} 2>> {log}
+        head -n 5 {output.input_list} >> {log}
+        python workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py {output.input_list} {output.outdir} 2>> {log}
+
+        echo "Step 2. Generate taxonomic information for dataset" >> {log}
+        python workflow/bgcflow/bgcflow/data/bigslice_prep.py {input.table} {output.taxonomy} 2>> {log}
         # append new dataset information
         ## check if previous dataset exists
         if [[ -s {params.dataset} ]]
         then
-            echo "Previous dataset detected, appending dataset information for {wildcards.name}..." >> {log.symlink}
-            sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log.general}
+            echo "Previous dataset detected, appending dataset information for {wildcards.name}..."
+            sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log}
         else
-            echo "No previous dataset detected, generating dataset information for {wildcards.name}..." >> {log.symlink}
-            echo -e '# Dataset name\tPath to folder\tPath to taxonomy\tDescription' > {params.dataset} >> {log.general}
-            sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log.general}
+            echo "No previous dataset detected, generating dataset information for {wildcards.name}..." 2>> {log}
+            echo -e '# Dataset name\tPath to folder\tPath to taxonomy\tDescription' > {params.dataset} 2>> {log}
+            sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log}
         fi
-        # generate mapping for visualization
-        python workflow/bgcflow/bgcflow/data/get_bigscape_mapping.py {output.outdir} {output.bgc_mapping} 2>> {log.general}
+        echo "Step 3. Generate mapping for visualization" >> {log}
+        python workflow/bgcflow/bgcflow/data/get_bigscape_mapping.py {output.outdir} {output.bgc_mapping} 2>> {log}
         """

From d7998aace8ea5d7dd854bb51f96d8c14fcd814ae Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Wed, 8 May 2024 10:13:10 +0200
Subject: [PATCH 03/20] fix: update get_antismash_inputs function to retrieve
 region gbks

---
 workflow/rules/common.smk | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index e276b493..fcdd7609 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -520,21 +520,29 @@ def get_prokka_refdb(genome_id, params, df_samples, mapping_file, config=config)
     return output
 
 
-# bigscape.smk, bigslice.smk, and bgc_analytics.smk #
 def get_antismash_inputs(name, version, df_samples):
     """
-    Given a project name, find the corresponding sample file to use
+    This function retrieves the list of antismash GenBank (.gbk) files for a given project.
 
-    Arguments:
-        name {str} -- project name
-        version {str} -- antismash version
-        df_samples {pd.DataFrame} -- sample table
+    It iterates over the sample table (DataFrame), selects the rows where the project name matches the provided name,
+    and for each matching sample, it constructs a path to the directory where the antismash files for that sample are stored.
+    It then collects all GenBank files in these directories that have 'region' in their name.
+
+    Parameters:
+        name (str): The name of the project for which to retrieve the antismash files.
+        version (str): The version of antismash used to generate the files.
+        df_samples (pd.DataFrame): A DataFrame containing the sample table. It is expected to have a 'name' column.
 
     Returns:
-        output {list} -- list of antismash gbk files
+        output (list): A list of strings, where each string is the path to an antismash GenBank file for the given project.
     """
     selection = [i for i in df_samples.index if name in df_samples.loc[i, "name"]]
-    output = [f"data/interim/antismash/{version}/{s}/{s}.gbk" for s in selection]
+    output = []
+    for genome_id in selection:
+        genome_path = Path(f"data/interim/antismash/{version}/{genome_id}/")
+        region_genbanks = list(genome_path.glob("*.region*.gbk"))
+        for r in region_genbanks:
+            output.append(str(r))
     return output
 
 

From 6355679a5af110efc89af4e212683d96fd6d63d6 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Fri, 17 May 2024 12:40:48 +0200
Subject: [PATCH 04/20] fix: change input requirement for bgc downstream
 preparation

---
 .../data/bgc_downstream_prep_selection.py       |  5 +++--
 workflow/rules/bgc.smk                          | 17 ++++++++++-------
 workflow/rules/common.smk                       |  2 +-
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
index 88099515..8c4ce1f5 100644
--- a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
+++ b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
@@ -19,7 +19,7 @@ def generate_symlink(path, genome_id, output_dir, selected_bgcs=False):
     logging.debug(f"Deducting genome id as {genome_id}")
     ctr = 0
     matches = selected_bgcs.stem
-    for gbk in path.glob("*.gbk"):
+    for gbk in path.glob("*region*.gbk"):
         if gbk.stem in matches:
             logging.debug(f"Found match: {gbk.stem}")
             filename = gbk.name
@@ -125,6 +125,7 @@ def bgc_downstream_prep(input_file, output_dir):
             "genome_id": genome_id,
             "value": region_change_log,
         }
+    logging.info("Writing change logs...")
     change_logs = {}
     genome_ids = set(v["genome_id"] for v in change_log_containers.values())
     for genome_id in genome_ids:
@@ -134,7 +135,7 @@ def bgc_downstream_prep(input_file, output_dir):
                 entry_name = list(v["value"].keys())[0]
                 change_log[entry_name] = v["value"][entry_name]
         change_logs[genome_id] = change_log
-    logging.debug(change_logs)
+        logging.debug(f"Change log for {genome_id}: {change_log}")
 
     for genome_id in change_logs.keys():
         outpath = Path(output_dir) / genome_id
diff --git a/workflow/rules/bgc.smk b/workflow/rules/bgc.smk
index b0ce9af7..48c782c8 100644
--- a/workflow/rules/bgc.smk
+++ b/workflow/rules/bgc.smk
@@ -1,11 +1,12 @@
 rule downstream_bgc_prep:
     input:
-        gbk=lambda wildcards: get_antismash_inputs(
-            wildcards.name, wildcards.version, DF_SAMPLES
+        gbk=lambda wildcards: expand("data/interim/antismash/{version}/{strains}/{strains}.gbk",
+            version=wildcards.version,
+            strains=[s for s in PEP_PROJECTS[wildcards.name].sample_table.genome_id.unique()],
         ),
         table="data/processed/{name}/tables/df_gtdb_meta.csv",
     output:
-        input_list=temp("data/interim/bgcs/{name}/{version}/input_list.txt"),
+        input_list="data/interim/bgcs/{name}/{version}/input_list.txt",
         taxonomy="data/interim/bgcs/taxonomy/taxonomy_{name}_antismash_{version}.tsv",
         outdir=directory("data/interim/bgcs/{name}/{version}"),
         bgc_mapping="data/interim/bgcs/{name}/{name}_antismash_{version}.csv",
@@ -13,14 +14,16 @@ rule downstream_bgc_prep:
         "../envs/bgc_analytics.yaml"
     params:
         dataset="data/interim/bgcs/datasets.tsv",
+        regions=lambda wildcards: get_antismash_regions(
+            wildcards.name, wildcards.version, DF_SAMPLES
+        ),
     log: "logs/bgcs/downstream_bgc_prep/{name}/downstream_bgc_prep-{version}.log",
     shell:
         """
         echo "Preparing BGCs for {wildcards.name} downstream analysis..." >> {log}
 
         echo "Step 1. Generate symlink for each regions in genomes in dataset" >> {log}
-        echo {input.gbk} | tr ' ' '\n' > {output.input_list} 2>> {log}
-        head -n 5 {output.input_list} >> {log}
+        echo {params.regions} | tr ' ' '\n' >> {output.input_list} 2>> {log}
         python workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py {output.input_list} {output.outdir} 2>> {log}
 
         echo "Step 2. Generate taxonomic information for dataset" >> {log}
@@ -29,10 +32,10 @@ rule downstream_bgc_prep:
         ## check if previous dataset exists
         if [[ -s {params.dataset} ]]
         then
-            echo "Previous dataset detected, appending dataset information for {wildcards.name}..."
+            echo "Previous dataset detected, appending dataset information for {wildcards.name}..." >> {log}
             sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log}
         else
-            echo "No previous dataset detected, generating dataset information for {wildcards.name}..." 2>> {log}
+            echo "No previous dataset detected, generating dataset information for {wildcards.name}..." >> {log}
             echo -e '# Dataset name\tPath to folder\tPath to taxonomy\tDescription' > {params.dataset} 2>> {log}
             sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log}
         fi
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index fcdd7609..84e81718 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -520,7 +520,7 @@ def get_prokka_refdb(genome_id, params, df_samples, mapping_file, config=config)
     return output
 
 
-def get_antismash_inputs(name, version, df_samples):
+def get_antismash_regions(name, version, df_samples):
     """
     This function retrieves the list of antismash GenBank (.gbk) files for a given project.
 

From 862900415b28a53a90d1ba106f95d3894519622b Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Wed, 29 May 2024 15:24:25 +0200
Subject: [PATCH 05/20] chore: correct typos

---
 workflow/rules/antismash.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/antismash.smk b/workflow/rules/antismash.smk
index 1ecad295..4a7f4261 100644
--- a/workflow/rules/antismash.smk
+++ b/workflow/rules/antismash.smk
@@ -47,7 +47,7 @@ if antismash_major_version <= 6:
             """
             antismash \
                 --genefinding-tool {params.genefinding} \
-                --database {params.antismash_db_path,} \
+                --database {params.antismash_db_path} \
                 --output-dir {params.folder} \
                 --cb-general \
                 --cb-subclusters \

From ac20d03546da67aae924e23bc113fa5ff119dfe3 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Thu, 30 May 2024 13:30:33 +0000
Subject: [PATCH 06/20] fix: reinclude full antiSMASH gbks for downstream
 process

---
 workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py | 3 ++-
 workflow/rules/bgc.smk                                         | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
index 8c4ce1f5..bff37fa5 100644
--- a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
+++ b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
@@ -18,8 +18,9 @@ def generate_symlink(path, genome_id, output_dir, selected_bgcs=False):
     outpath.mkdir(parents=True, exist_ok=True)
     logging.debug(f"Deducting genome id as {genome_id}")
     ctr = 0
+    change_log = None
     matches = selected_bgcs.stem
-    for gbk in path.glob("*region*.gbk"):
+    for gbk in path.glob("*.gbk"):
         if gbk.stem in matches:
             logging.debug(f"Found match: {gbk.stem}")
             filename = gbk.name
diff --git a/workflow/rules/bgc.smk b/workflow/rules/bgc.smk
index 48c782c8..538d829d 100644
--- a/workflow/rules/bgc.smk
+++ b/workflow/rules/bgc.smk
@@ -23,6 +23,7 @@ rule downstream_bgc_prep:
         echo "Preparing BGCs for {wildcards.name} downstream analysis..." >> {log}
 
         echo "Step 1. Generate symlink for each regions in genomes in dataset" >> {log}
+        echo {input.gbk} | tr ' ' '\n' >> {output.input_list} 2>> {log}
         echo {params.regions} | tr ' ' '\n' >> {output.input_list} 2>> {log}
         python workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py {output.input_list} {output.outdir} 2>> {log}
 

From 0b65b42b40ba1a64500d2d95db3545d0218255c4 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Thu, 30 May 2024 13:36:32 +0000
Subject: [PATCH 07/20] feat: use database schema 0.3.1

---
 workflow/rules/build-database.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/build-database.smk b/workflow/rules/build-database.smk
index 5318c948..6932a688 100644
--- a/workflow/rules/build-database.smk
+++ b/workflow/rules/build-database.smk
@@ -105,7 +105,7 @@ rule get_dbt_template:
     params:
         dbt = "data/processed/{name}/dbt/antiSMASH_{version}",
         dbt_repo = "https://github.com/NBChub/bgcflow_dbt-duckdb",
-        release = "0.2.1",
+        release = "0.3.1",
         cutoff = "0.30",
         as_version = "{version}"
     shell:

From 0ac724262aa77e2ccea5b06af99357d6f68e5788 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Mon, 3 Jun 2024 18:43:10 +0000
Subject: [PATCH 08/20] feat: enable parameter to change taxon in antismash

---
 workflow/rules/antismash.smk | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/antismash.smk b/workflow/rules/antismash.smk
index 4a7f4261..2746391d 100644
--- a/workflow/rules/antismash.smk
+++ b/workflow/rules/antismash.smk
@@ -112,6 +112,7 @@ elif antismash_major_version >= 7:
             folder=directory("data/interim/antismash/{version}/{strains}/"),
             antismash_db_path=antismash_db_path,
             genefinding="none",
+            taxon="bacteria",
         shell:
             """
             set +e
@@ -132,7 +133,7 @@ elif antismash_major_version >= 7:
 
             # Run AntiSMASH
             antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \
-                --database {params.antismash_db_path} \
+                --database {params.antismash_db_path} --taxon {params.taxon} \
                 --cb-general --cb-subclusters --cb-knownclusters -c {threads} $antismash_input --logfile {log} 2>> {log}
 
             # Check if the run failed due to changed detection results or changed protocluster types
@@ -142,7 +143,7 @@ elif antismash_major_version >= 7:
                 # Use genbank input instead
                 echo "Previous JSON result is invalid, starting AntiSMASH from scratch..." >> {log}
                 antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \
-                    --database {params.antismash_db_path} \
+                    --database {params.antismash_db_path} --taxon {params.taxon} \
                     --cb-general --cb-subclusters --cb-knownclusters -c {threads} {input.gbk} --logfile {log} 2>> {log}
             fi
             """

From 340c9478bb42236adbced2401450a1571b64f92d Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Mon, 3 Jun 2024 18:43:36 +0000
Subject: [PATCH 09/20] chore: include .gbff as recognized format

---
 workflow/rules/convert_genbank.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/convert_genbank.smk b/workflow/rules/convert_genbank.smk
index 865d8311..cf69c5b0 100644
--- a/workflow/rules/convert_genbank.smk
+++ b/workflow/rules/convert_genbank.smk
@@ -9,7 +9,7 @@ if len(CUSTOM_GENBANK) > 0:
         log: "logs/prokka/copy_custom_fasta/copy_custom_fasta-{custom_genbank}.log"
         shell:
             """
-            if [[ {input} == *.gb || {input} == *.gbk || {input} == *.genbank ]]
+            if [[ {input} == *.gb || {input} == *.gbk || {input} == *.genbank || {input} == *.gbff ]]
             then
                 cp {input} {output} 2>> {log}
             else

From db9d42cad1e1068c5a0ff0aceb5cf03cee19c725 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Tue, 4 Jun 2024 15:39:31 +0000
Subject: [PATCH 10/20] fix: update lsabgc environment

---
 workflow/envs/lsabgc.post-deploy.sh |   2 +-
 workflow/envs/lsabgc.yaml           | 393 ++++++++++++++++++----------
 2 files changed, 263 insertions(+), 132 deletions(-)

diff --git a/workflow/envs/lsabgc.post-deploy.sh b/workflow/envs/lsabgc.post-deploy.sh
index b0838581..024956bc 100644
--- a/workflow/envs/lsabgc.post-deploy.sh
+++ b/workflow/envs/lsabgc.post-deploy.sh
@@ -3,7 +3,7 @@
 resource_dir="resources"
 output_lsabgc="$resource_dir/lsaBGC"
 repository="https://github.com/Kalan-Lab/lsaBGC"
-version="1.40.0"
+version="1.52"
 release="$repository/archive/refs/tags/v$version.tar.gz"
 
 log="logs/lsabgc/install.log"
diff --git a/workflow/envs/lsabgc.yaml b/workflow/envs/lsabgc.yaml
index 89b980a2..6dd8d1bf 100644
--- a/workflow/envs/lsabgc.yaml
+++ b/workflow/envs/lsabgc.yaml
@@ -1,257 +1,383 @@
-name: lsabgc_env
+name: lsaBGC
 channels:
-  - conda-forge
-  - bioconda
   - defaults
+  - bioconda
+  - conda-forge
 dependencies:
-  - cython==3.0.0
   - _libgcc_mutex=0.1=conda_forge
-  - _openmp_mutex=4.5=2_gnu
+  - _openmp_mutex=4.5=2_kmp_llvm
   - _r-mutex=1.0.1=anacondar_1
   - alsa-lib=1.2.3.2=h166bdaf_0
   - appdirs=1.4.4=pyh9f0ad1d_0
-  - archspec=0.2.1=pyhd8ed1ab_0
+  - aragorn=1.2.41=h031d066_2
+  - archspec=0.2.3=pyhd8ed1ab_0
+  - argcomplete=3.3.0=pyhd8ed1ab_0
+  - argh=0.31.2=pyhd8ed1ab_0
+  - argtable2=2.13=hd590300_1004
   - aria2=1.36.0=h8b6cd97_3
+  - arpack=3.7.0=hdefa2d7_2
+  - attr=2.5.1=h166bdaf_1
+  - barrnap=0.9=hdfd78af_4
   - bc=1.07.1=h7f98852_0
+  - bedtools=2.31.1=hf5e1c6e_1
   - binutils=2.39=hdd6e379_1
   - binutils_impl_linux-64=2.39=he00db2b_1
   - binutils_linux-64=2.39=h5fc0e48_13
+  - biocode=0.11.0=pyhdfd78af_0
   - bioconductor-ggtree=3.2.0=r41hdfd78af_0
   - bioconductor-treeio=1.18.0=r41hdfd78af_0
   - biopython=1.79=py39hb9d737c_3
-  - blast=2.14.1=pl5321h6f7f691_0
+  - blast=2.15.0=pl5321h6f7f691_1
+  - blast-legacy=2.2.26=h9ee0642_3
   - boost-cpp=1.74.0=h6cacc03_7
   - bowtie2=2.4.5=py39h3321a2d_4
-  - brotli-python=1.1.0=py39h3d6467e_0
+  - brotli=1.1.0=hd590300_1
+  - brotli-bin=1.1.0=hd590300_1
+  - brotli-python=1.1.0=py39h3d6467e_1
   - bwidget=1.9.14=ha770c72_1
-  - bzip2=1.0.8=h7f98852_4
-  - c-ares=1.19.1=hd590300_0
+  - bzip2=1.0.8=hd590300_5
+  - c-ares=1.28.1=hd590300_0
   - c-compiler=1.3.0=h7f98852_0
-  - ca-certificates=2023.7.22=hbcca054_0
+  - ca-certificates=2024.6.2=hbcca054_0
   - cairo=1.16.0=ha12eb4b_1010
   - capnproto=0.10.2=h6239696_0
-  - certifi=2023.7.22=pyhd8ed1ab_0
-  - charset-normalizer=3.2.0=pyhd8ed1ab_0
-  - click=8.1.7=unix_pyh707e725_0
+  - cd-hit=4.8.1=h43eeafb_10
+  - certifi=2024.2.2=pyhd8ed1ab_0
+  - charset-normalizer=3.3.2=pyhd8ed1ab_0
+  - clustalo=1.2.4=hdbdd923_8
+  - clustalw=2.1=h4ac6f70_10
   - colorama=0.4.6=pyhd8ed1ab_0
+  - contourpy=1.2.1=py39h7633fee_0
   - coreutils=9.1=h166bdaf_0
-  - curl=7.86.0=h7bff187_1
+  - curl=7.87.0=h5eee18b_0
   - cxx-compiler=1.3.0=h4bd325d_0
-  - dataclasses=0.8=pyhc8e2a94_3
+  - cycler=0.12.1=pyhd8ed1ab_0
   - dbus=1.13.6=h5008d03_3
-  - decorator=5.1.1=pyhd8ed1ab_0
+  - dendropy=5.0.1=pyhdfd78af_0
   - diamond=2.0.15=hb97b32f_1
   - docopt=0.6.2=py_1
-  - dos2unix=7.4.1=0
-  - entrez-direct=16.2=he881be0_1
+  - dos2unix=7.5.2=ha770c72_3
+  - entrez-direct=21.6=he881be0_0
   - ete3=3.1.2=pyh9f0ad1d_0
-  - expat=2.5.0=hcb278e6_1
+  - expat=2.6.2=h59595ed_0
   - fastme=2.1.6.1=h031d066_3
-  - fasttree=2.1.11=h031d066_2
+  - fasttree=2.1.11=h031d066_3
+  - fftw=3.3.10=nompi_hc118613_108
   - file=5.39=h753d276_1
   - filetype=1.2.0=pyhd8ed1ab_0
-  - fisher=0.1.14=py39h44dd56e_0
+  - fisher=0.1.14=py39h44dd56e_1
   - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
   - font-ttf-inconsolata=3.000=h77eed37_0
   - font-ttf-source-code-pro=2.038=h77eed37_0
-  - font-ttf-ubuntu=0.83=hab24e00_0
+  - font-ttf-ubuntu=0.83=h77eed37_2
   - fontconfig=2.14.2=h14ed4e7_0
   - fonts-conda-ecosystem=1=0
   - fonts-conda-forge=1=0
+  - fonttools=4.53.0=py39hd3abc70_0
   - freetype=2.12.1=h267a509_2
   - fribidi=1.0.10=h36c2ea0_0
-  - future=0.18.3=pyhd8ed1ab_0
+  - future=1.0.0=pyhd8ed1ab_0
   - gawk=5.1.0=h7f98852_0
   - gcc=9.5.0=h1fea6ba_13
   - gcc_impl_linux-64=9.5.0=h99780fb_19
   - gcc_linux-64=9.5.0=h4258300_13
   - gdbm=1.18=h0a1914f_2
   - gecco=0.9.8=pyhdfd78af_0
-  - gensim=4.3.2=py39hddac248_0
-  - gettext=0.21.1=h27087fc_0
+  - gensim=4.3.2=py39hddac248_1
+  - gettext=0.22.5=h59595ed_2
+  - gettext-tools=0.22.5=h59595ed_2
+  - gffutils=0.13=pyh7cba7a3_0
   - gfortran_impl_linux-64=9.5.0=hf1096a2_19
   - gfortran_linux-64=9.5.0=hdb51d14_13
+  - glib=2.74.1=h6239696_0
+  - glib-tools=2.74.1=h6239696_0
   - glpk=5.0=h445213a_0
-  - gmp=6.2.1=h58526e2_0
-  - graphite2=1.3.13=h58526e2_1001
+  - gmp=6.3.0=h59595ed_1
+  - graphite2=1.3.13=h59595ed_1003
   - gsl=2.7=he838d99_0
   - gst-plugins-base=1.18.5=hf529b03_3
-  - gstreamer=1.18.5=h9f60fe5_3
-  - gtotree=1.8.2=h9ee0642_0
+  - gstreamer=1.20.3=hd4edc92_2
+  - gtotree=1.8.6=h9ee0642_0
   - gxx=9.5.0=h1fea6ba_13
   - gxx_impl_linux-64=9.5.0=h99780fb_19
   - gxx_linux-64=9.5.0=h43f449f_13
-  - gzip=1.12=h166bdaf_0
+  - gzip=1.13=hd590300_0
   - harfbuzz=4.2.0=h40b6f09_0
   - hmmer=3.3.2=hdbdd923_4
-  - htslib=1.14=h9093b5e_0
+  - htslib=1.17=h6bc39ce_1
   - icu=69.1=h9c3ff4c_0
-  - idna=3.4=pyhd8ed1ab_0
-  - importlib-metadata=6.8.0=pyha770c72_0
-  - importlib_metadata=6.8.0=hd8ed1ab_0
-  - importlib_resources=6.1.0=pyhd8ed1ab_0
-  - iqtree=2.2.5=h21ec9f0_0
-  - jbig=2.1=h7f98852_2003
-  - joblib=1.3.2=pyhd8ed1ab_0
+  - idna=3.7=pyhd8ed1ab_0
+  - importlib-metadata=7.1.0=pyha770c72_0
+  - importlib-resources=6.4.0=pyhd8ed1ab_0
+  - importlib_metadata=7.1.0=hd8ed1ab_0
+  - importlib_resources=6.4.0=pyhd8ed1ab_0
+  - infernal=1.1.5=pl5321h031d066_1
+  - intbitset=3.0.2=py39hd1e30aa_1
+  - iqtree=2.3.4=h21ec9f0_0
+  - jack=1.9.18=hfd4fe87_1001
+  - jinja2=3.1.4=pyhd8ed1ab_0
+  - joblib=1.4.2=pyhd8ed1ab_0
   - jpeg=9e=h0b41bf4_3
-  - kernel-headers_linux-64=2.6.32=he073ed8_16
+  - kernel-headers_linux-64=2.6.32=he073ed8_17
   - keyutils=1.6.1=h166bdaf_0
+  - kiwisolver=1.4.5=py39h7633fee_1
   - kofamscan=1.3.0=hdfd78af_2
   - krb5=1.19.3=h3790be6_0
+  - lcms2=2.14=h6ed2654_0
   - ld_impl_linux-64=2.39=hcc3a1bd_1
-  - lerc=2.2.1=h9c3ff4c_0
-  - libblas=3.9.0=18_linux64_openblas
-  - libcblas=3.9.0=18_linux64_openblas
-  - libclang=13.0.1=default_h7634d5b_3
-  - libcurl=7.86.0=h7bff187_1
-  - libdeflate=1.7=h7f98852_5
+  - lerc=4.0.0=h27087fc_0
+  - libasprintf=0.22.5=h661eb56_2
+  - libasprintf-devel=0.22.5=h661eb56_2
+  - libblas=3.9.0=22_linux64_openblas
+  - libbrotlicommon=1.1.0=hd590300_1
+  - libbrotlidec=1.1.0=hd590300_1
+  - libbrotlienc=1.1.0=hd590300_1
+  - libcap=2.64=ha37c62d_0
+  - libcblas=3.9.0=22_linux64_openblas
+  - libclang=13.0.1=default_h7634d5b_6
+  - libcups=2.3.3=h3e49a29_2
+  - libcurl=7.87.0=h91b91d3_0
+  - libdb=6.2.32=h9c3ff4c_0
+  - libdeflate=1.14=h166bdaf_0
   - libedit=3.1.20191231=he28a2e2_2
-  - libev=4.33=h516909a_1
+  - libev=4.33=hd590300_2
   - libevent=2.1.10=h9b69904_4
-  - libexpat=2.5.0=hcb278e6_1
+  - libexpat=2.6.2=h59595ed_0
   - libffi=3.4.2=h7f98852_5
+  - libflac=1.3.4=h27087fc_0
   - libgcc-devel_linux-64=9.5.0=h0a57e50_19
-  - libgcc-ng=13.2.0=h807b86a_2
-  - libgfortran-ng=13.2.0=h69a702a_2
-  - libgfortran5=13.2.0=ha4646dd_2
+  - libgcc-ng=13.2.0=h77fa898_7
+  - libgettextpo=0.22.5=h59595ed_2
+  - libgettextpo-devel=0.22.5=h59595ed_2
+  - libgfortran=3.0.0=1
+  - libgfortran-ng=13.2.0=h69a702a_7
+  - libgfortran5=13.2.0=hca663fb_7
   - libglib=2.74.1=h7a41b64_0
-  - libgomp=13.2.0=h807b86a_2
+  - libgomp=13.2.0=h77fa898_7
   - libhwloc=2.8.0=h32351e8_1
-  - libiconv=1.17=h166bdaf_0
-  - libidn2=2.3.4=h166bdaf_0
-  - liblapack=3.9.0=18_linux64_openblas
+  - libiconv=1.17=hd590300_2
+  - libidn11=1.34=h1cef754_0
+  - libidn2=2.3.7=hd590300_0
+  - liblapack=3.9.0=22_linux64_openblas
   - libllvm13=13.0.1=hf817b99_2
+  - libllvm14=14.0.6=hcd5def8_4
   - libmagic=5.39=h753d276_1
   - libnghttp2=1.51.0=hdcd2b5c_0
-  - libnsl=2.0.0=h7f98852_0
+  - libnsl=2.0.1=hd590300_0
   - libogg=1.3.4=h7f98852_1
-  - libopenblas=0.3.24=pthreads_h413a1c8_0
+  - libopenblas=0.3.27=pthreads_h413a1c8_0
   - libopus=1.3.1=h7f98852_1
-  - libpng=1.6.39=h753d276_0
+  - libpng=1.6.43=h2797004_0
   - libpq=14.5=h72a31a5_3
   - libsanitizer=9.5.0=h2f262e1_19
-  - libsqlite=3.43.0=h2797004_0
+  - libsndfile=1.0.31=h9c3ff4c_1
+  - libsqlite=3.45.3=h2797004_0
   - libssh2=1.10.0=haa6b8db_3
   - libstdcxx-devel_linux-64=9.5.0=h0a57e50_19
-  - libstdcxx-ng=13.2.0=h7e041cc_2
-  - libtiff=4.3.0=hf544144_1
+  - libstdcxx-ng=13.2.0=hc0a3c3a_7
+  - libtiff=4.4.0=h82bc61c_5
+  - libtool=2.4.7=h27087fc_0
+  - libudev1=253=h0b41bf4_0
   - libunistring=0.9.10=h7f98852_0
   - libuuid=2.38.1=h0b41bf4_0
   - libvorbis=1.3.7=h9c3ff4c_0
-  - libwebp-base=1.3.2=hd590300_0
+  - libwebp-base=1.4.0=hd590300_0
   - libxcb=1.13=h7f98852_1004
+  - libxcrypt=4.4.36=hd590300_1
   - libxkbcommon=1.0.3=he3ba5ed_0
   - libxml2=2.9.14=haae042b_4
   - libxslt=1.1.33=h0ef7038_3
-  - libzlib=1.2.13=hd590300_5
+  - libzlib=1.2.13=h4ab18f5_6
+  - llvm-openmp=18.1.6=ha31de31_0
+  - llvmlite=0.42.0=py39h174d805_1
   - lxml=4.8.0=py39hb9d737c_3
   - mafft=7.505=hec16e2b_0
   - make=4.3=hd18ef5c_1
   - markdown-it-py=3.0.0=pyhd8ed1ab_0
+  - markupsafe=2.1.5=py39hd1e30aa_0
   - mash=2.3=hd3113c8_6
+  - matplotlib-base=3.8.4=py39h10d1fc8_2
   - mcl=14.137=pl5321h031d066_9
-  - mdurl=0.1.0=pyhd8ed1ab_0
-  - mmseqs2=14.7e284=pl5321h6a68c12_2
+  - mdurl=0.1.2=pyhd8ed1ab_0
+  - metis=5.1.0=h59595ed_1007
+  - minced=0.4.2=hdfd78af_1
+  - mkl=2024.1.0=ha957f24_693
+  - mmseqs2=15.6f452=pl5321h6a68c12_2
+  - mpfr=4.2.1=h9458935_1
   - mpi=1.0=openmpi
+  - munkres=1.1.4=pyh9f0ad1d_0
   - muscle=5.1=h4ac6f70_3
-  - mypy=1.5.1=py39hd1e30aa_1
+  - mypy=1.10.0=py39hd3abc70_0
   - mypy_extensions=1.0.0=pyha770c72_0
   - mysql-common=8.0.32=h14678bc_0
   - mysql-libs=8.0.32=h54cf53e_0
   - n50=1.5.8=pl5321hdfd78af_0
   - ncbi-genome-download=0.3.3=pyh7cba7a3_0
-  - ncbi-vdb=3.0.8=hdbdd923_0
+  - ncbi-vdb=3.1.1=h4ac6f70_0
   - ncurses=6.2=h58526e2_4
-  - networkx=3.1=pyhd8ed1ab_0
+  - networkx=3.2.1=pyhd8ed1ab_0
   - nspr=4.35=h27087fc_0
-  - nss=3.92=h1d7d5a4_0
-  - numpy=1.26.0=py39h474f0d3_0
-  - openmpi=4.1.5=h414af15_101
+  - nss=3.100=hca3bf56_0
+  - numba=0.59.1=py39h615d6bd_0
+  - numpy=1.26.4=py39h474f0d3_0
+  - openjdk=11.0.1=h516909a_1016
+  - openjpeg=2.5.0=h7d73246_1
+  - openmpi=4.1.6=hc5af2df_101
   - openssl=1.1.1w=hd590300_0
   - orthofinder=2.5.4=hdfd78af_0
-  - ossuuid=1.6.2=hf484d3e_1000
-  - packaging=23.1=pyhd8ed1ab_0
+  - packaging=24.0=pyhd8ed1ab_0
   - pal2nal=14.1=pl5321hdfd78af_3
+  - paml=4.10.7=h031d066_1
+  - panaroo=1.5.0=pyhdfd78af_0
   - pandas=1.4.2=py39h1832856_2
   - pango=1.50.7=hbd2fdc8_0
-  - parallel=20230722=ha770c72_0
-  - patsy=0.5.3=pyhd8ed1ab_0
+  - parallel=20240522=ha770c72_0
+  - patsy=0.5.6=pyhd8ed1ab_0
   - pcre=8.45=h9c3ff4c_0
   - pcre2=10.37=hc3806b6_1
-  - perl=5.32.1=4_hd590300_perl5
-  - perl-alien-build=2.48=pl5321hec16e2b_0
-  - perl-alien-libxml2=0.17=pl5321hec16e2b_0
+  - peewee=3.17.3=py39h21eaaa1_0
+  - perl=5.32.1=7_hd590300_perl5
+  - perl-algorithm-diff=1.201=pl5321hd8ed1ab_0
   - perl-archive-tar=2.40=pl5321hdfd78af_0
+  - perl-base=2.23=pl5321hd8ed1ab_0
+  - perl-bio-asn1-entrezgene=1.73=pl5321hdfd78af_3
+  - perl-bio-coordinate=1.007001=pl5321hdfd78af_3
+  - perl-bio-featureio=1.6.905=pl5321hdfd78af_4
+  - perl-bio-samtools=1.43=pl5321he4a0461_4
+  - perl-bio-searchio-hmmer=1.7.3=pl5321hdfd78af_0
+  - perl-bio-tools-phylo-paml=1.7.3=pl5321hdfd78af_3
+  - perl-bio-tools-run-alignment-clustalw=1.7.4=pl5321hdfd78af_3
+  - perl-bio-tools-run-alignment-tcoffee=1.7.4=pl5321hdfd78af_5
+  - perl-bioperl=1.7.8=hdfd78af_1
+  - perl-bioperl-core=1.7.8=pl5321hdfd78af_1
+  - perl-bioperl-run=1.007003=pl5321hdfd78af_0
   - perl-business-isbn=3.007=pl5321hd8ed1ab_0
   - perl-business-isbn-data=20210112.006=pl5321hd8ed1ab_0
   - perl-capture-tiny=0.48=pl5321ha770c72_1
   - perl-carp=1.50=pl5321hd8ed1ab_0
+  - perl-class-data-inheritable=0.09=pl5321ha770c72_0
   - perl-common-sense=3.75=pl5321hd8ed1ab_0
   - perl-compress-raw-bzip2=2.201=pl5321h166bdaf_0
   - perl-compress-raw-zlib=2.202=pl5321h166bdaf_0
   - perl-constant=1.33=pl5321hd8ed1ab_0
-  - perl-encode=3.19=pl5321h166bdaf_0
+  - perl-data-dumper=2.183=pl5321hd590300_0
+  - perl-db_file=1.858=pl5321h166bdaf_0
+  - perl-devel-stacktrace=2.04=pl5321ha770c72_0
+  - perl-digest-hmac=1.04=pl5321hdfd78af_0
+  - perl-digest-md5=2.58=pl5321h166bdaf_0
+  - perl-encode=3.21=pl5321hd590300_0
+  - perl-encode-locale=1.05=pl5321hdfd78af_7
+  - perl-exception-class=1.45=pl5321ha770c72_0
   - perl-exporter=5.74=pl5321hd8ed1ab_0
   - perl-exporter-tiny=1.002002=pl5321hd8ed1ab_0
   - perl-extutils-makemaker=7.70=pl5321hd8ed1ab_0
-  - perl-fastx-reader=1.11.0=pl5321hdfd78af_0
-  - perl-ffi-checklib=0.28=pl5321hdfd78af_0
-  - perl-file-chdir=0.1011=pl5321hd8ed1ab_0
+  - perl-fastx-reader=1.12.0=pl5321hdfd78af_0
+  - perl-file-listing=6.16=pl5321hdfd78af_0
   - perl-file-path=2.18=pl5321hd8ed1ab_0
+  - perl-file-slurp-tiny=0.004=pl5321hdfd78af_2
+  - perl-file-sort=1.01=pl5321hdfd78af_3
+  - perl-file-spec=3.48_01=pl5321hdfd78af_2
   - perl-file-temp=0.2304=pl5321hd8ed1ab_0
   - perl-file-which=1.24=pl5321hd8ed1ab_0
   - perl-getopt-long=2.54=pl5321hdfd78af_0
-  - perl-importer=0.026=pl5321hd8ed1ab_0
+  - perl-html-parser=3.81=pl5321h4ac6f70_1
+  - perl-html-tagset=3.20=pl5321hdfd78af_4
+  - perl-http-cookies=6.10=pl5321hdfd78af_0
+  - perl-http-daemon=6.16=pl5321hdfd78af_0
+  - perl-http-date=6.06=pl5321hdfd78af_0
+  - perl-http-message=6.36=pl5321hdfd78af_0
+  - perl-http-negotiate=6.01=pl5321hdfd78af_4
+  - perl-inc-latest=0.500=pl5321ha770c72_0
   - perl-io-compress=2.201=pl5321hdbdd923_2
+  - perl-io-html=1.004=pl5321hdfd78af_0
+  - perl-io-socket-ssl=2.075=pl5321hd8ed1ab_0
+  - perl-io-string=1.08=pl5321hdfd78af_4
+  - perl-io-tty=1.16=pl5321h166bdaf_0
   - perl-io-zlib=1.14=pl5321hdfd78af_0
+  - perl-ipc-run=20200505.0=pl5321hdfd78af_0
   - perl-json=4.10=pl5321hdfd78af_0
   - perl-json-pp=4.11=pl5321hd8ed1ab_0
   - perl-json-xs=2.34=pl5321h4ac6f70_6
+  - perl-libwww-perl=6.67=pl5321hdfd78af_0
+  - perl-libxml-perl=0.08=pl5321hdfd78af_3
   - perl-list-moreutils=0.430=pl5321hdfd78af_0
   - perl-list-moreutils-xs=0.430=pl5321h031d066_2
+  - perl-lwp-mediatypes=6.04=pl5321hdfd78af_1
+  - perl-mime-base64=3.16=pl5321h166bdaf_0
+  - perl-module-build=0.4234=pl5321ha770c72_0
+  - perl-net-http=6.22=pl5321hdfd78af_0
+  - perl-net-ssleay=1.92=pl5321haa6b8db_1
+  - perl-ntlm=1.09=pl5321hdfd78af_5
   - perl-parent=0.241=pl5321hd8ed1ab_0
-  - perl-path-tiny=0.124=pl5321hd8ed1ab_0
   - perl-pathtools=3.75=pl5321h166bdaf_0
   - perl-perlio-encoding=0.18=pl5321hdfd78af_2
   - perl-pod-escapes=1.07=pl5321hdfd78af_2
   - perl-pod-usage=2.03=pl5321hdfd78af_0
   - perl-scalar-list-utils=1.63=pl5321h166bdaf_0
-  - perl-scope-guard=0.21=pl5321hd8ed1ab_0
+  - perl-socket=2.027=pl5321h031d066_4
   - perl-storable=3.15=pl5321h166bdaf_0
-  - perl-sub-info=0.002=pl5321hd8ed1ab_0
-  - perl-term-table=0.016=pl5321hdfd78af_0
+  - perl-sub-uplevel=0.2800=pl5321h166bdaf_0
   - perl-test=1.26=pl5321hd8ed1ab_0
+  - perl-test-deep=1.130=pl5321hd8ed1ab_0
+  - perl-test-differences=0.71=pl5321ha770c72_0
+  - perl-test-exception=0.43=pl5321hd8ed1ab_0
   - perl-test-fatal=0.016=pl5321ha770c72_0
   - perl-test-harness=3.44=pl5321hd8ed1ab_0
+  - perl-test-most=0.38=pl5321hdfd78af_0
+  - perl-test-warn=0.37=pl5321hd8ed1ab_0
   - perl-test-warnings=0.031=pl5321ha770c72_0
-  - perl-test2-suite=0.000145=pl5321hdfd78af_0
   - perl-text-asciitable=0.22=pl5321hdfd78af_3
+  - perl-text-diff=1.45=pl5321hd8ed1ab_0
+  - perl-time-local=1.35=pl5321hdfd78af_0
+  - perl-timedate=2.33=pl5321hdfd78af_2
+  - perl-tree-dag_node=1.32=pl5321hdfd78af_0
   - perl-try-tiny=0.31=pl5321ha770c72_0
   - perl-types-serialiser=1.01=pl5321hdfd78af_0
   - perl-uri=5.17=pl5321ha770c72_0
-  - perl-xml-libxml=2.0207=pl5321h661654b_0
+  - perl-url-encode=0.03=pl5321h9ee0642_0
+  - perl-www-robotrules=6.02=pl5321hdfd78af_4
+  - perl-xml-dom=1.46=pl5321hdfd78af_1
+  - perl-xml-dom-xpath=0.14=pl5321hdfd78af_2
   - perl-xml-namespacesupport=1.12=pl5321hd8ed1ab_0
+  - perl-xml-parser=2.44_01=pl5321hc3e0081_1003
+  - perl-xml-regexp=0.04=pl5321hdfd78af_3
   - perl-xml-sax=1.02=pl5321hd8ed1ab_0
   - perl-xml-sax-base=1.09=pl5321hd8ed1ab_0
-  - pixman=0.40.0=h36c2ea0_0
-  - polars=0.19.3=py39h903e532_0
+  - perl-xml-sax-expat=0.51=pl5321hd8ed1ab_0
+  - perl-xml-simple=2.25=pl5321hdfd78af_2
+  - perl-xml-xpathengine=0.14=pl5321hdfd78af_3
+  - pillow=9.2.0=py39hf3a2cdf_3
+  - pip=22.1.2=pyhd8ed1ab_0
+  - pixman=0.43.2=h59595ed_0
+  - plotly=5.22.0=pyhd8ed1ab_0
+  - poa=2.0=h031d066_5
+  - polars=0.20.31=py39ha963410_0
   - pomegranate=0.13.3=py39h1a9c180_3
-  - prodigal=2.6.3=h031d066_6
-  - psutil=5.9.5=py39hd1e30aa_1
+  - prank=170427=h4ac6f70_0
+  - prodigal=2.6.3=h031d066_8
+  - prokka=1.14.6=pl5321hdfd78af_5
+  - psutil=5.9.8=py39hd1e30aa_0
   - pthread-stubs=0.4=h36c2ea0_1001
-  - pygments=2.16.1=pyhd8ed1ab_0
-  - pyhmmer=0.10.2=py39hf95cd2a_0
-  - pyqt=5.12.3=py39h03dd644_4
-  - pyrodigal=3.0.0=py39hf95cd2a_0
+  - pulseaudio=14.0=hbc9ff1d_7
+  - pyfaidx=0.8.1.1=pyhdfd78af_0
+  - pygments=2.18.0=pyhd8ed1ab_0
+  - pyhmmer=0.10.12=py39hf95cd2a_0
+  - pyparsing=3.1.2=pyhd8ed1ab_0
+  - pyqt=5.15.4=py39h5a03fae_0
+  - pyqt5-sip=12.9.0=py39h5a03fae_0
+  - pyrodigal=2.3.0=py39hf95cd2a_1
   - pysam=0.16.0.1=py39h051187c_3
   - pysocks=1.7.1=pyha2e5f31_6
   - python=3.9.9=h62f1059_0_cpython
-  - python-crfsuite=0.9.9=py39h7633fee_1
-  - python-dateutil=2.8.2=pyhd8ed1ab_0
+  - python-crfsuite=0.9.9=py39h7633fee_2
+  - python-dateutil=2.9.0=pyhd8ed1ab_0
+  - python-edlib=1.3.9=py39h1f90b4d_6
+  - python-igraph=0.10.2=py39h000617a_0
   - python_abi=3.9=4_cp39
-  - pytz=2023.3.post1=pyhd8ed1ab_0
+  - pytz=2024.1=pyhd8ed1ab_0
+  - pyvcf3=1.0.3=pyhdfd78af_0
   - pyyaml=6.0.1=py39hd1e30aa_1
-  - qt=5.12.9=ha98a1a1_5
+  - qt-main=5.15.2=hdf1cb14_3
   - r-ape=5.7_1=r41h358215d_0
   - r-aplot=0.1.10=r41hc72bb7e_0
   - r-base=4.1.2=h2553ce4_1
@@ -263,7 +389,7 @@ dependencies:
   - r-combinat=0.0_8=r41hc72bb7e_1004
   - r-commonmark=1.9.0=r41h133d619_0
   - r-cowplot=1.1.1=r41hc72bb7e_1
-  - r-cpp11=0.4.3=r41hc72bb7e_0
+  - r-cpp11=0.4.7=r41hc72bb7e_0
   - r-crayon=1.5.2=r41hc72bb7e_1
   - r-curl=4.3.3=r41h06615bd_1
   - r-data.table=1.14.8=r41h133d619_0
@@ -346,39 +472,51 @@ dependencies:
   - r-xfun=0.39=r41ha503ecb_0
   - r-xml2=1.3.3=r41h044e5c7_2
   - r-yulab.utils=0.0.6=r41hc72bb7e_0
-  - raxml=8.2.12=h031d066_6
-  - raxml-ng=1.2.0=h6d1f11b_1
+  - raxml=8.2.13=h031d066_1
+  - raxml-ng=1.2.2=h6d1f11b_0
   - readline=8.1=h46c0cb4_0
-  - requests=2.31.0=pyhd8ed1ab_0
-  - rich=13.5.3=pyhd8ed1ab_0
+  - requests=2.32.3=pyhd8ed1ab_0
+  - rich=13.7.1=pyhd8ed1ab_0
   - ruby=3.1.0=h86e321c_1
   - samtools=1.12=h9aed4be_1
   - scikit-learn=1.2.2=py39hc236052_2
-  - scipy=1.11.2=py39h474f0d3_1
+  - scipy=1.13.1=py39haf93ffa_0
   - sed=4.8=he412f7d_0
+  - setuptools=58.2.0=py39hf3d152e_0
+  - simplejson=3.19.2=py39hd1e30aa_0
+  - sip=6.5.1=py39he80948d_2
   - six=1.16.0=pyh6c4a22f_0
   - sklearn-crfsuite=0.3.6=pyh9f0ad1d_0
-  - smart-open=6.4.0=pyhd8ed1ab_0
-  - smart_open=6.4.0=pyhd8ed1ab_0
+  - smart-open=7.0.4=hd8ed1ab_0
+  - smart_open=7.0.4=pyhd8ed1ab_0
   - sqlite=3.37.0=h9cd32fc_0
-  - statsmodels=0.14.0=py39h0f8d45d_1
-  - sysroot_linux-64=2.12=he073ed8_16
+  - statsmodels=0.14.2=py39hd92a3bb_0
+  - suitesparse=5.10.1=h9e50725_1
+  - sysroot_linux-64=2.12=he073ed8_17
+  - t-coffee=12.00.7fb08c2=h26a2512_0
   - tabulate=0.9.0=pyhd8ed1ab_1
   - tar=1.34=hb2e2bae_1
-  - taxonkit=0.15.0=h9ee0642_0
+  - taxadb=0.12.1=pyh5e36f6f_0
+  - taxonkit=0.16.0=h9ee0642_1
   - tbb=2021.7.0=h924138e_1
-  - threadpoolctl=3.2.0=pyha21a80b_0
-  - tk=8.6.13=h2797004_0
-  - tktable=2.10=h0c5db8f_4
+  - tbl2asn-forever=25.7.2f=h031d066_4
+  - tenacity=8.3.0=pyhd8ed1ab_0
+  - texttable=1.7.0=pyhd8ed1ab_0
+  - threadpoolctl=3.5.0=pyhc1e730c_0
+  - tk=8.6.13=noxft_h4845f30_101
+  - tktable=2.10=h0c5db8f_5
+  - toml=0.10.2=pyhd8ed1ab_0
   - tomli=2.0.1=pyhd8ed1ab_0
-  - tqdm=4.66.1=pyhd8ed1ab_0
-  - trimal=1.4.1=h4ac6f70_8
-  - typing_extensions=4.8.0=pyha770c72_0
-  - tzdata=2023c=h71feb2d_0
-  - unzip=6.0=h7f98852_3
-  - urllib3=2.0.5=pyhd8ed1ab_0
+  - tqdm=4.66.4=pyhd8ed1ab_0
+  - trimal=1.4.1=h4ac6f70_9
+  - typing_extensions=4.12.1=pyha770c72_0
+  - tzdata=2024a=h0c530f3_0
+  - unicodedata2=15.1.0=py39hd1e30aa_0
+  - urllib3=2.2.1=pyhd8ed1ab_0
+  - viennarna=2.6.4=py39pl5321h4e691d4_1
   - wget=1.20.3=ha56f1ee_1
-  - wheel=0.41.2=pyhd8ed1ab_0
+  - wheel=0.43.0=pyhd8ed1ab_1
+  - wrapt=1.16.0=py39hd1e30aa_0
   - xlsxwriter=3.0.3=pyhd8ed1ab_0
   - xorg-kbproto=1.0.7=h7f98852_1002
   - xorg-libice=1.0.10=h7f98852_0
@@ -395,12 +533,5 @@ dependencies:
   - xz=5.2.6=h166bdaf_0
   - yaml=0.2.5=h7f98852_2
   - zipp=3.17.0=pyhd8ed1ab_0
-  - zlib=1.2.13=hd590300_5
-  - zstd=1.5.5=hfc55251_0
-  - pip:
-    - pip==23.2.1
-    - pyqt5-sip==4.19.18
-    - pyqtchart==5.12
-    - pyqtwebengine==5.12.1
-    - setuptools==68.2.2
-    - sonicparanoid==2.0.2
+  - zlib=1.2.13=h4ab18f5_6
+  - zstd=1.5.6=ha6fb4c9_0

From b43da51697d7dfd3cc710bcceda8bc6b478905d8 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Tue, 4 Jun 2024 15:40:42 +0000
Subject: [PATCH 11/20] chore: add taxon message for antismash run

---
 workflow/rules/antismash.smk | 1 +
 1 file changed, 1 insertion(+)

diff --git a/workflow/rules/antismash.smk b/workflow/rules/antismash.smk
index 2746391d..94a9725b 100644
--- a/workflow/rules/antismash.smk
+++ b/workflow/rules/antismash.smk
@@ -132,6 +132,7 @@ elif antismash_major_version >= 7:
             fi
 
             # Run AntiSMASH
+            echo "Running AntiSMASH {params.taxon} mode..." >> {log}
             antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \
                 --database {params.antismash_db_path} --taxon {params.taxon} \
                 --cb-general --cb-subclusters --cb-knownclusters -c {threads} $antismash_input --logfile {log} 2>> {log}

From 66ce0475ae031f8cb900c170fbfbb5fc20ae3dc4 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Tue, 4 Jun 2024 18:12:36 +0000
Subject: [PATCH 12/20] fix: pin setuptools < 70.0.0

---
 .github/workflows/build.yml | 2 +-
 workflow/envs/lsabgc.yaml   | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0690b0d0..027259fa 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -45,7 +45,7 @@ jobs:
     - name: Set up Micromamba
       uses: mamba-org/setup-micromamba@v1
       with:
-        micromamba-version: '1.5.0-1'
+        micromamba-version: '1.5.8-1'
         environment-file: ${{ matrix.environment }}
         init-shell: bash
         cache-environment: true
diff --git a/workflow/envs/lsabgc.yaml b/workflow/envs/lsabgc.yaml
index 6dd8d1bf..efe90db5 100644
--- a/workflow/envs/lsabgc.yaml
+++ b/workflow/envs/lsabgc.yaml
@@ -535,3 +535,6 @@ dependencies:
   - zipp=3.17.0=pyhd8ed1ab_0
   - zlib=1.2.13=h4ab18f5_6
   - zstd=1.5.6=ha6fb4c9_0
+  - pip
+  - pip:
+    - setuptools==69.5.1

From 016d0701898ccdb7be5ab5dda7e5b9cf22f6485a Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Tue, 4 Jun 2024 18:31:07 +0000
Subject: [PATCH 13/20] test: correct action

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 027259fa..23e0084d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches:
       - main
-  pull_request_target:
+  pull_request:
     branches:
       - main
 

From 38b7193c5a6b39a977489272b8492fa78d5c1803 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Thu, 13 Jun 2024 20:57:51 +0000
Subject: [PATCH 14/20] feat: add antismash parameters based on antismash
 database

---
 workflow/rules/antismash.smk | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/workflow/rules/antismash.smk b/workflow/rules/antismash.smk
index 94a9725b..79dbf8c9 100644
--- a/workflow/rules/antismash.smk
+++ b/workflow/rules/antismash.smk
@@ -118,8 +118,7 @@ elif antismash_major_version >= 7:
             set +e
 
             # Find the latest existing JSON output for this strain
-            latest_version=$(ls -d data/interim/antismash/*/{wildcards.strains}/{wildcards.strains}.json | grep {wildcards.strains} | sort -r | head -n 1 | cut -d '/' -f 4) 2>> {log}
-
+            latest_version=$(find data/interim/antismash/*/{wildcards.strains} -name "{wildcards.strains}.json" | sort -r | head -n 1 | cut -d '/' -f 4) 2>> {log}
             if [ -n "$latest_version" ]; then
                 # Use existing JSON result as starting point
                 old_json="data/interim/antismash/$latest_version/{wildcards.strains}/{wildcards.strains}.json"
@@ -131,11 +130,26 @@ elif antismash_major_version >= 7:
                 antismash_input="{input.gbk}"
             fi
 
+            # Store common parameters in a variable
+            antismash_params="--genefinding-tool {params.genefinding} \
+                --output-dir {params.folder} \
+                --database {params.antismash_db_path} \
+                --taxon {params.taxon} \
+                --cb-knownclusters \
+                --cb-subclusters \
+                --cc-mibig \
+                --clusterhmmer \
+                --tigrfam \
+                --pfam2go \
+                --rre \
+                --asf \
+                --tfbs \
+                -c {threads} \
+                --logfile {log}"
+
             # Run AntiSMASH
             echo "Running AntiSMASH {params.taxon} mode..." >> {log}
-            antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \
-                --database {params.antismash_db_path} --taxon {params.taxon} \
-                --cb-general --cb-subclusters --cb-knownclusters -c {threads} $antismash_input --logfile {log} 2>> {log}
+            antismash $antismash_params $antismash_input 2>> {log}
 
             # Check if the run failed due to changed detection results or changed protocluster types
             if grep -q -e "ValueError: Detection results have changed. No results can be reused" \
@@ -143,9 +157,7 @@ elif antismash_major_version >= 7:
             then
                 # Use genbank input instead
                 echo "Previous JSON result is invalid, starting AntiSMASH from scratch..." >> {log}
-                antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \
-                    --database {params.antismash_db_path} --taxon {params.taxon} \
-                    --cb-general --cb-subclusters --cb-knownclusters -c {threads} {input.gbk} --logfile {log} 2>> {log}
+                antismash $antismash_params {input.gbk} 2>> {log}
             fi
             """
 

From cb90fa3ed46407c83a0170ea9f042978f8871dae Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Thu, 20 Jun 2024 12:56:51 +0200
Subject: [PATCH 15/20] fix: correct bgc downstream preparation and make sure
 all changes registered

---
 .../data/bgc_downstream_prep_selection.py     | 275 ++++++++++--------
 .../data/get_antismash_overview_gather.py     |  14 +-
 workflow/rules/antismash.smk                  |   9 +-
 workflow/rules/bgc_analytics.smk              |   2 +-
 4 files changed, 169 insertions(+), 131 deletions(-)

diff --git a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
index bff37fa5..44963ae9 100644
--- a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
+++ b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
@@ -3,6 +3,7 @@
 import sys
 from pathlib import Path
 
+from alive_progress import alive_bar
 from Bio import SeqIO
 
 log_format = "%(levelname)-8s %(asctime)s   %(message)s"
@@ -10,141 +11,173 @@
 logging.basicConfig(format=log_format, datefmt=date_format, level=logging.DEBUG)
 
 
-def generate_symlink(path, genome_id, output_dir, selected_bgcs=False):
+def generate_symlink(selected_bgcs, genome_id, output_dir):
     """
-    Given an antiSMASH directory, check for changed name
+    Given an antiSMASH directory, check for changed name and generate a symlink.
+
+    Parameters:
+    selected_bgcs (str): Path to the selected BGCs.
+    genome_id (str): ID of the genome.
+    output_dir (str): Path to the output directory.
+
+    Returns:
+    dict: A dictionary containing the change log.
     """
     outpath = Path(output_dir) / genome_id
     outpath.mkdir(parents=True, exist_ok=True)
-    logging.debug(f"Deducting genome id as {genome_id}")
-    ctr = 0
     change_log = None
-    matches = selected_bgcs.stem
-    for gbk in path.glob("*.gbk"):
-        if gbk.stem in matches:
-            logging.debug(f"Found match: {gbk.stem}")
-            filename = gbk.name
-            ctr = ctr + 1
-            logging.info(f"Parsing file: {gbk.name}")
-            region = SeqIO.parse(str(gbk), "genbank")
-            for record in region:
-                logging.debug(f"Processing: {gbk.name}: {record.id}")
-                record_log = {}
-                if "structured_comment" in record.annotations:
-                    try:
-                        original_id = record.annotations["structured_comment"][
-                            "antiSMASH-Data"
-                        ]["Original ID"].split()[0]
-                    except KeyError:
-                        original_id = record.id
-                        logging.warning(
-                            f"Found shortened record.id: {record.id} <- {original_id}."
-                        )
-                else:
-                    raise ValueError(f"No Structured Comments in record: {gbk.name}")
-
-                if (":" in str(record.description)) or (":" in original_id):
-                    logging.warning(
-                        f"Illegal character ':' found in genbank description, removing: {record.description}"
-                    )
-                    # Remove colon from description
-                    record.description = record.description.replace(":", "")
-                    original_id = original_id.replace(":", "")
-
-                    # Rename antiSMASH comment
-                    if "structured_comment" in record.annotations:
-                        if (
-                            "Original ID"
-                            in record.annotations["structured_comment"][
-                                "antiSMASH-Data"
-                            ]
-                        ):
-                            record.annotations["structured_comment"]["antiSMASH-Data"][
-                                "Original ID"
-                            ] = original_id
-
-                    # Write new GenBank file
-                    new_filename = filename.replace(record.id, original_id)
-                    with open(outpath / new_filename, "w") as output_handle:
-                        SeqIO.write(record, output_handle, "genbank")
-                    link = outpath / new_filename
-                else:
-                    # generate symlink
-                    new_filename = filename.replace(record.id, original_id)
-                    target_path = Path.cwd() / gbk  # target for symlink
-
-                    link = outpath / new_filename
-
-                    logging.info(f"Generating symlink: {link}")
-                    try:
-                        link.symlink_to(target_path)
-                    except FileExistsError:
-                        logging.warning(
-                            f"Previous symlink exist, updating target: {link} -> {target_path}"
-                        )
-                        link.unlink()
-                        link.symlink_to(target_path)
-
-                    # Assert that the symlink was correctly generated
-                    assert link.is_symlink(), f"Failed to create symlink: {link}"
-                    assert (
-                        link.resolve() == target_path.resolve()
-                    ), f"Symlink {link} does not point to the correct target: {target_path}"
+    gbk = Path(selected_bgcs)
+    filename = gbk.name
+    logging.info(f"{genome_id} - Parsing file: {gbk.name}")
+    region = SeqIO.parse(str(gbk), "genbank")
+    for record in region:
+        record_log = {}
+        if "structured_comment" in record.annotations:
+            try:
+                original_id = record.annotations["structured_comment"][
+                    "antiSMASH-Data"
+                ]["Original ID"].split()[0]
+            except KeyError:
+                original_id = record.id
+                logging.warning(
+                    f" - Found shortened record.id: {record.id} <- {original_id}."
+                )
+        else:
+            raise ValueError(f"No Structured Comments in record: {gbk.name}")
+
+        if (":" in str(record.description)) or (":" in original_id):
+            logging.warning(
+                f" - Illegal character ':' found in genbank description, removing: {record.description}"
+            )
+            # Remove colon from description
+            record.description = record.description.replace(":", "")
+            original_id = original_id.replace(":", "")
+
+            # Rename antiSMASH comment
+            if "structured_comment" in record.annotations:
+                if (
+                    "Original ID"
+                    in record.annotations["structured_comment"]["antiSMASH-Data"]
+                ):
+                    record.annotations["structured_comment"]["antiSMASH-Data"][
+                        "Original ID"
+                    ] = original_id
+
+            # Write new GenBank file
+            new_filename = filename.replace(record.id, original_id)
+            with open(outpath / new_filename, "w") as output_handle:
+                SeqIO.write(record, output_handle, "genbank")
+            link = outpath / new_filename
+        else:
+            # generate symlink
+            new_filename = filename.replace(record.id, original_id)
+            target_path = Path.cwd() / gbk  # target for symlink
+
+            link = outpath / new_filename
+
+            logging.info(f" - Generating symlink: {link}")
+            try:
+                link.symlink_to(target_path)
+            except FileExistsError:
+                logging.warning(
+                    f" - Previous symlink exist, updating target: {link} -> {target_path}"
+                )
+                link.unlink()
+                link.symlink_to(target_path)
+
+            # Assert that the symlink was correctly generated
+            assert link.is_symlink(), f" - Failed to create symlink: {link}"
+            assert (
+                link.resolve() == target_path.resolve()
+            ), f" - Symlink {link} does not point to the correct target: {target_path}"
+
+        record_log["record_id"] = record.id
+        record_log["original_id"] = original_id
+        record_log["target_path"] = str(gbk)
+        record_log["symlink_path"] = str(link)
+
+        change_log = {filename: record_log}
+    return change_log
 
-                record_log["record_id"] = record.id
-                record_log["original_id"] = original_id
-                record_log["target_path"] = str(gbk)
-                record_log["symlink_path"] = str(link)
 
-                change_log = {filename: record_log}
-    return change_log
+def bgc_downstream_prep(input_file, output_dir, input_dir="."):
+    """
+    Prepare the downstream BGCs.
 
+    Parameters:
+    input_file (str): Path to the input file.
+    output_dir (str): Path to the output directory.
+    input_dir (str, optional): Path to the input directory. Defaults to current directory.
 
-def bgc_downstream_prep(input_file, output_dir):
-    logging.info(f"Reading input file: {input_file}")
+    Returns:
+    None
+    """
+    original_input_dir = Path(input_dir)
+    logging.info(f"Reading input file: {input_file} from {original_input_dir}")
     with open(input_file, "r") as file:
-        file_paths = [Path(f) for f in file.read().splitlines()]
+        file_paths = [original_input_dir / f for f in file.read().splitlines()]
     change_log_containers = {}
-    for num, selected_bgcs in enumerate(file_paths):
-        input_dir = selected_bgcs.parent
-        logging.info(f"Reading input directory: {input_dir}")
-        path = Path(input_dir)
-        if not path.is_dir():
-            raise FileNotFoundError(f"No such file or directory: {path}")
-
-        # check if it has complete antiSMASH results
-        if (path / f"{path.name}.json").is_file():
-            logging.info("Found full antiSMASH record")
-            genome_id = path.name
-        else:
-            logging.warning("No full antiSMASH record found, unknown genome id")
-            genome_id = "unknown_genome_id"
-
-        assert selected_bgcs.exists(), f"File does not exist: {selected_bgcs}"
-        region_change_log = generate_symlink(path, genome_id, output_dir, selected_bgcs)
-        change_log_containers[num] = {
-            "genome_id": genome_id,
-            "value": region_change_log,
-        }
+    input_dirs = set([file.parent for file in file_paths])
+    change_log_ctr = 0
+    with alive_bar(len(input_dirs), title="Downstream prepping genomes:") as bar:
+        for num, input_dir in enumerate(input_dirs):
+            logging.info(
+                f"{num} - Processing {input_dir.name}: Reading input directory: {input_dir}"
+            )
+            path = Path(input_dir)
+            if not path.is_dir():
+                raise FileNotFoundError(f"No such file or directory: {path}")
+            # check if it has complete antiSMASH results
+            if (path / f"{path.name}.json").is_file():
+                logging.info("Found full antiSMASH record")
+                genome_id = path.name
+            else:
+                logging.warning("No full antiSMASH record found, unknown genome id")
+                genome_id = "unknown_genome_id"
+            # Select by parent directory, not by genome_id substring: substring
+            # matching drops every file when genome_id is "unknown_genome_id"
+            # and mixes genomes whose ids are contained in another file's path.
+            genbanks_list = [g for g in file_paths if g.parent == input_dir]
+            gbk_ctr = 0
+            for selected_bgcs in genbanks_list:
+                assert selected_bgcs.exists(), f"File does not exist: {selected_bgcs}"
+                region_change_log = generate_symlink(
+                    selected_bgcs, genome_id, output_dir
+                )
+                change_log_containers[change_log_ctr] = {
+                    "genome_id": genome_id,
+                    "value": region_change_log,
+                }
+                gbk_ctr += 1
+                change_log_ctr += 1
+            logging.debug(
+                f"Finished creating {gbk_ctr}/{len(genbanks_list)} symlinks for {genome_id}\n"
+            )
+            bar()
+
     logging.info("Writing change logs...")
     change_logs = {}
     genome_ids = set(v["genome_id"] for v in change_log_containers.values())
-    for genome_id in genome_ids:
-        change_log = {}
-        for v in change_log_containers.values():
-            if v["genome_id"] == genome_id:
-                entry_name = list(v["value"].keys())[0]
-                change_log[entry_name] = v["value"][entry_name]
-        change_logs[genome_id] = change_log
-        logging.debug(f"Change log for {genome_id}: {change_log}")
-
-    for genome_id in change_logs.keys():
-        outpath = Path(output_dir) / genome_id
-        with open(
-            outpath / f"{genome_id}-change_log.json", "w", encoding="utf8"
-        ) as json_file:
-            json.dump({genome_id: change_logs[genome_id]}, json_file, indent=4)
-        logging.info(f"{genome_id}: Job done!\n")
+    with alive_bar(len(genome_ids), title="Merging changelogs:") as bar:
+        for genome_id in genome_ids:
+            change_log = {}
+            for v in change_log_containers.values():
+                if v["genome_id"] == genome_id:
+                    entry_name = list(v["value"].keys())[0]
+                    change_log[entry_name] = v["value"][entry_name]
+            change_logs[genome_id] = change_log
+            logging.debug(f"Change log for {genome_id}: {len(change_log)}")
+            bar()
+
+    with alive_bar(len(change_logs.keys()), title="Writing changelogs:") as bar:
+        for genome_id in change_logs.keys():
+            outpath = Path(output_dir) / genome_id
+            with open(
+                outpath / f"{genome_id}-change_log.json", "w", encoding="utf8"
+            ) as json_file:
+                json.dump({genome_id: change_logs[genome_id]}, json_file, indent=4)
+            logging.info(f"{genome_id}: Job done!\n")
+            bar()
 
 
 if __name__ == "__main__":
diff --git a/workflow/bgcflow/bgcflow/data/get_antismash_overview_gather.py b/workflow/bgcflow/bgcflow/data/get_antismash_overview_gather.py
index f9719c9c..a8a8e51b 100644
--- a/workflow/bgcflow/bgcflow/data/get_antismash_overview_gather.py
+++ b/workflow/bgcflow/bgcflow/data/get_antismash_overview_gather.py
@@ -30,7 +30,9 @@ def correct_bgc_id_overview(overview_file, mapping_file, genome_id=False):
     new_dict : dict
         Corrected BGC overview dictionary with updated BGC IDs
     """
-    logging.info(f"Correcting shortened bgc ids for {genome_id}...")
+    logging.info(
+        f"Correcting shortened bgc ids for {genome_id} using mapping from {mapping_file}..."
+    )
     overview_path = Path(overview_file)
     mapping_path = Path(mapping_file)
 
@@ -86,7 +88,7 @@ def gather_bgc_overview(input_json, mapping_dir, table):
     None
     """
     input_json = Path(input_json)
-    logging.info(input_json)
+
     if input_json.is_file() and input_json.suffix == ".json":
         logging.info(f"Getting BGC overview from a single file: {input_json}")
         input_json_files = input_json
@@ -104,6 +106,7 @@ def gather_bgc_overview(input_json, mapping_dir, table):
             input_json_files = [
                 Path(path) for path in paths if Path(path).suffix == ".json"
             ]
+            logging.info(f"Found entries of {len(input_json_files)} region files...")
     else:
         input_json_files = [
             Path(file)
@@ -120,6 +123,9 @@ def gather_bgc_overview(input_json, mapping_dir, table):
         genome_id = mapping_file.name.replace("_bgc_overview.json", "")
         mapping_path = Path(mapping_dir) / f"{genome_id}/{genome_id}-change_log.json"
         corrected = correct_bgc_id_overview(mapping_file, mapping_path, genome_id)
+        logging.debug(
+            f"Adding {len(corrected)} entries from {genome_id} to the merged table..."
+        )
         merged_dict.update(corrected)
 
     df = pd.DataFrame.from_dict(merged_dict).T
@@ -130,13 +136,13 @@ def gather_bgc_overview(input_json, mapping_dir, table):
         lambda x: 1 if x is not None and x > 1 else x
     )
 
-    logging.debug(f"Writing file to: {table}")
+    logging.debug(f"Writing file containing {len(df)} entries to: {table}")
 
     # Save dataframes to csv tables
     df_table = Path(table)
     df_table.parent.mkdir(parents=True, exist_ok=True)
     df.to_csv(table)
-    logging.info("Job done")
+
     return None
 
 
diff --git a/workflow/rules/antismash.smk b/workflow/rules/antismash.smk
index 79dbf8c9..93e62990 100644
--- a/workflow/rules/antismash.smk
+++ b/workflow/rules/antismash.smk
@@ -118,12 +118,11 @@ elif antismash_major_version >= 7:
             set +e
 
             # Find the latest existing JSON output for this strain
-            latest_version=$(find data/interim/antismash/*/{wildcards.strains} -name "{wildcards.strains}.json" | sort -r | head -n 1 | cut -d '/' -f 4) 2>> {log}
-            if [ -n "$latest_version" ]; then
+            latest_json=$(find data/interim/antismash/*/* -name "{wildcards.strains}.json" | sort -V | tail -n 1) 2>> {log}
+            if [ -n "$latest_json" ]; then
                 # Use existing JSON result as starting point
-                old_json="data/interim/antismash/$latest_version/{wildcards.strains}/{wildcards.strains}.json"
-                echo "Using existing JSON from $old_json as starting point..." >> {log}
-                antismash_input="--reuse-result $old_json"
+                echo "Using existing JSON from $latest_json as starting point..." >> {log}
+                antismash_input="--reuse-result $latest_json"
             else
                 # No existing JSON result found, use genbank input
                 echo "No existing JSON result found, starting AntiSMASH from scratch..." >> {log}
diff --git a/workflow/rules/bgc_analytics.smk b/workflow/rules/bgc_analytics.smk
index af3f4651..a80d311f 100644
--- a/workflow/rules/bgc_analytics.smk
+++ b/workflow/rules/bgc_analytics.smk
@@ -42,7 +42,7 @@ rule antismash_overview_gather:
         "logs/bgc_analytics/antismash_overview_gather-{version}-{name}.log",
     shell:
         """
-        TMPDIR="data/interim/tmp/{wildcards.name}/{wildcards.version}"
+        TMPDIR="data/interim/bgcs/{wildcards.name}/tmp/{wildcards.version}"
         mkdir -p $TMPDIR
         INPUT_JSON="$TMPDIR/df_regions_antismash.txt"
         echo '{input.bgc_overview}' > $INPUT_JSON

From 241b8ca6251f2c5be09ac0e3442d11353c0a3fe1 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Thu, 20 Jun 2024 14:30:11 +0200
Subject: [PATCH 16/20] test: fix micromamba version

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 23e0084d..543b41c8 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -45,7 +45,7 @@ jobs:
     - name: Set up Micromamba
       uses: mamba-org/setup-micromamba@v1
       with:
-        micromamba-version: '1.5.8-1'
+        micromamba-version: '1.5.8-0'
         environment-file: ${{ matrix.environment }}
         init-shell: bash
         cache-environment: true

From 591f1b4431efcb697dd9c21990f3b40eaef55761 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Thu, 20 Jun 2024 16:09:29 +0000
Subject: [PATCH 17/20] test: use snakemake version from wrapper

---
 .github/workflows/push.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 4e26ffdb..b1dce9c3 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -148,7 +148,6 @@ jobs:
       - run: pip install bgcflow_wrapper
       - run: pip install pytest-cov
       - run: pip install alive-progress
-      - run: pip install snakemake==8.5.2
       - name: Test coverage
         run: pytest --cov=.tests/unit .tests/unit/
       - name: Build coverage file

From 8de0983d5aac9f89d1b1af1455f4f5f53d20ca4b Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Thu, 20 Jun 2024 17:48:22 +0000
Subject: [PATCH 18/20] fix: pin numpy to version 1.26.4 for checkm

---
 workflow/envs/checkm.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/workflow/envs/checkm.yaml b/workflow/envs/checkm.yaml
index 520530c6..6b24663c 100644
--- a/workflow/envs/checkm.yaml
+++ b/workflow/envs/checkm.yaml
@@ -4,6 +4,8 @@ channels:
   - bioconda
   - defaults
 dependencies:
+  - python==3.11
+  - numpy==1.26.4
   - checkm-genome==1.2.2
   - wget
   - tar

From 3010397459e661c086a1de9cac57695b9ef9925c Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Thu, 20 Jun 2024 17:49:49 +0000
Subject: [PATCH 19/20] test: update expected ncbi metadata

---
 .../Lactobacillus_delbrueckii/tables/df_ncbi_meta.csv  | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.tests/unit/extract_ncbi_information/expected/data/processed/Lactobacillus_delbrueckii/tables/df_ncbi_meta.csv b/.tests/unit/extract_ncbi_information/expected/data/processed/Lactobacillus_delbrueckii/tables/df_ncbi_meta.csv
index 5130be45..aac6d46d 100644
--- a/.tests/unit/extract_ncbi_information/expected/data/processed/Lactobacillus_delbrueckii/tables/df_ncbi_meta.csv
+++ b/.tests/unit/extract_ncbi_information/expected/data/processed/Lactobacillus_delbrueckii/tables/df_ncbi_meta.csv
@@ -1,5 +1,5 @@
-genome_id,BioProject,assembly,assembly_level,assembly_type,biosample,date,genbank,genome_representation,genus,organism,refseq,refseq_category,refseq_genbank_identity,release_type,species,strain,submitter,tax_id
-GCA_000056065.1,PRJNA16871,ASM5606v1,Complete Genome,na,SAMEA3138258,2006-05-26,GCA_000056065.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002 (firmicutes),GCF_000056065.1,,yes,major,delbrueckii,ATCC 11842,Genoscope,390333
-GCA_000182835.1,PRJNA49147,ASM18283v1,Complete Genome,na,SAMN02603937,2010-11-19,GCA_000182835.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus ND02 (firmicutes),GCF_000182835.1,,yes,major,delbrueckii,ND02,"The Key Laboratory of Dairy Biotechnology and Bioengineering, Education Ministry of P. R. China, Department of Food Science and Engineering, Inner Mongolia Agricultural University, China",767455
-GCA_000191165.1,PRJNA16120,ASM19116v1,Complete Genome,na,SAMN02603124,2011-03-03,GCA_000191165.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus 2038 (firmicutes),GCF_000191165.1,,yes,major,delbrueckii,2038,Chinese National HGC,353496
-GCA_000014405.1,PRJNA403,ASM1440v1,Complete Genome,na,SAMN02598530,2006-10-13,GCA_000014405.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus ATCC BAA-365 (firmicutes),GCF_000014405.1,,yes,major,delbrueckii,ATCC BAA-365,"US DOE Joint Genome Institute (JGI), The Lactic Acid Bacteria Genome Consortium and Fidelity Systems Inc.",321956
+genome_id,assembly,organism,genus,species,strain,tax_id,refseq_category,refseq,genbank,assembly_type,release_type,assembly_level,genome_representation,refseq_genbank_identity,biosample,submitter,date,BioProject
+GCA_000056065.1,ASM5606v1,Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002 (firmicutes),Lactobacillus,delbrueckii,ATCC 11842,390333,,GCF_000056065.1,GCA_000056065.1,na,major,Complete Genome,full,yes,SAMEA3138258,Genoscope,2006-05-26,PRJNA16871
+GCA_000182835.1,ASM18283v1,Lactobacillus delbrueckii subsp. bulgaricus ND02 (firmicutes),Lactobacillus,delbrueckii,ND02,767455,,GCF_000182835.1,GCA_000182835.1,na,major,Complete Genome,full,yes,SAMN02603937,"The Key Laboratory of Dairy Biotechnology and Bioengineering, Education Ministry of P. R. China, Department of Food Science and Engineering, Inner Mongolia Agricultural University, China",2010-11-19,PRJNA49147
+GCA_000191165.1,ASM19116v1,Lactobacillus delbrueckii subsp. bulgaricus 2038 (firmicutes),Lactobacillus,delbrueckii,2038,353496,,GCF_000191165.1,GCA_000191165.1,na,major,Complete Genome,full,yes,SAMN02603124,Chinese National HGC,2011-03-03,PRJNA16120
+GCA_000014405.1,ASM1440v1,Lactobacillus delbrueckii subsp. bulgaricus ATCC BAA-365 (firmicutes),Lactobacillus,delbrueckii,ATCC BAA-365,321956,,GCF_000014405.1,GCA_000014405.1,na,major,Complete Genome,full,yes,SAMN02598530,"US DOE Joint Genome Institute (JGI), The Lactic Acid Bacteria Genome Consortium and Fidelity Systems Inc.",2006-10-13,PRJNA403

From db70072925f36ad794f39df7b147a9e7e60129d8 Mon Sep 17 00:00:00 2001
From: Matin Nuhamunada <matinnu@biosustain.dtu.dk>
Date: Thu, 20 Jun 2024 18:23:07 +0000
Subject: [PATCH 20/20] test: drop build test for antiSMASH 6 and lsabgc

---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 543b41c8..3d7cff49 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -15,7 +15,7 @@ jobs:
       matrix:
         environment:
             - workflow/envs/antismash.yaml
-            - workflow/envs/antismash_v6.yaml
+            #- workflow/envs/antismash_v6.yaml
             - workflow/envs/arts.yaml
             - workflow/envs/automlst_wrapper.yaml
             - workflow/envs/bgc_analytics.yaml
@@ -36,7 +36,7 @@ jobs:
             - workflow/envs/roary.yaml
             - workflow/envs/seqfu.yaml
             - workflow/envs/utilities.yaml
-            - workflow/envs/lsabgc.yaml
+            #- workflow/envs/lsabgc.yaml
     steps:
     - name: Checkout repository and submodules
       uses: actions/checkout@v4