
Upgrade antiSMASH 7.1.0 #339

Merged
merged 20 commits into from
Apr 22, 2024

20 commits
0485bff
chore: handle patch versioning in antiSMASH
matinnuhamunada Mar 13, 2024
096a925
fix: improve symlink generation for downstream BGC prep and sanitize …
matinnuhamunada Apr 3, 2024
d50f6a9
fix: handle incompatible json schema from older antiSMASH version
matinnuhamunada Apr 3, 2024
42533e9
feat: upgrade antiSMASH version 7.1.0
matinnuhamunada Apr 3, 2024
5cf8be8
feat: upgrade BiG-SCAPE version 1.1.9
matinnuhamunada Apr 3, 2024
90b6d33
fix: update example BGC project to antismash 7.1.0
matinnuhamunada Apr 17, 2024
fe13583
chore: apply string formatting for project columns
matinnuhamunada Apr 17, 2024
a38fb1f
fix: handle input when both input_file and gbk_path in config sample
matinnuhamunada Apr 17, 2024
6bb8d07
feat: upgrade duckdb and metabase
matinnuhamunada Apr 17, 2024
9750419
fix: handle empty values
matinnuhamunada Apr 17, 2024
5b02478
chore: add getphylo example in config
matinnuhamunada Apr 17, 2024
3a78df6
chore: turn off getphylo as it is still in experimental phase
matinnuhamunada Apr 17, 2024
138cc5f
chore: pin version 0.2.1 for getphylo
matinnuhamunada Apr 17, 2024
dbe5c6c
feat: upgrade checkm
matinnuhamunada Apr 18, 2024
bf77b2b
feat: upgrade seqfu
matinnuhamunada Apr 18, 2024
d911b29
feat: upgrade clinker
matinnuhamunada Apr 18, 2024
a1f6da0
feat: upgrade cblaster
matinnuhamunada Apr 18, 2024
ef70724
fix: correct log path for patric download
matinnuhamunada Apr 18, 2024
29290e6
feat: add script to upload database to motherduck
matinnuhamunada Apr 18, 2024
c587e0c
chore: bump version to 0.9.0
matinnuhamunada Apr 18, 2024
4 changes: 2 additions & 2 deletions .examples/_config_example.yaml
@@ -76,5 +76,5 @@ rule_parameters:
 utility_parameters:
   METABASE_MIN_MEMORY: "2g"
   METABASE_MAX_MEMORY: "8g"
-  METABASE_VERSION: "v0.47.0"
-  METABASE_DUCKDB_PLUGIN_VERSION: "0.2.2"
+  METABASE_VERSION: "v0.49.6"
+  METABASE_DUCKDB_PLUGIN_VERSION: "0.2.6"

This file was deleted.

@@ -0,0 +1,5 @@
+bgc_id,genome_id,region,accession,start_pos,end_pos,contig_edge,product,region_length,most_similar_known_cluster_id,most_similar_known_cluster_description,most_similar_known_cluster_type,similarity,source,gbk_path
+CR954253.1.region001,GCA_000056065.1,1.1,CR954253.1,17407,39909,False,['lanthipeptide-class-iii'],22502,,,,,bgcflow,data/interim/antismash/7.1.0/GCA_000056065.1/CR954253.1.region001.gbk
+CR954253.1.region003,GCA_000056065.1,1.3,CR954253.1,1745672,1767868,False,['lanthipeptide-class-iv'],22196,,,,,bgcflow,data/interim/antismash/7.1.0/GCA_000056065.1/CR954253.1.region003.gbk
+CP000156.1.region002,GCA_000191165.1,1.2,CP000156.1,1767251,1789447,False,['lanthipeptide-class-iv'],22196,,,,,bgcflow,data/interim/antismash/7.1.0/GCA_000191165.1/CP000156.1.region002.gbk
+CP000412.1.region001,GCA_000014405.1,1.1,CP000412.1,17283,39785,False,['lanthipeptide-class-iii'],22502,,,,,bgcflow,data/interim/antismash/7.1.0/GCA_000014405.1/CP000412.1.region001.gbk
7 changes: 4 additions & 3 deletions .examples/lanthipeptide_lactobacillus/project_config.yaml
@@ -1,12 +1,13 @@
 name: lanthipeptide_lactobacillus
 pep_version: 2.1.0
 description: 'A selection of lanthipeptides from Lactobacillus delbrueckii'
-sample_table: df_regions_antismash_7.0.0.csv
+sample_table: df_regions_antismash_7.1.0.csv

 rules:
   bigslice: TRUE
   bigscape: TRUE
-  query-bigslice: TRUE
+  query-bigslice: FALSE
   clinker: TRUE
-  interproscan: TRUE
+  interproscan: FALSE
   mmseqs2: TRUE
+  getphylo: FALSE
@@ -10,3 +10,4 @@ rules:
   clinker: TRUE
   interproscan: TRUE
   mmseqs2: TRUE
+  getphylo: TRUE
9 changes: 8 additions & 1 deletion workflow/BGC
@@ -61,12 +61,19 @@ def extract_bgc_project_information(config, project_variable="projects", sample_
print(f" - Processing project {pep_file}", file=sys.stderr)
p = peppy.Project(pep_file, sample_table_index=sample_table_index)


# make sure each project has unique names
assert (
not p.name in df_projects["name"].unique()
), f"Project name [{p.name}] in [{pep_file}] has been used. Please use different name for each project."

+    # assign column types as string
+    for col in ["name", "samples", "rules"]:
+        if not col in df_projects.columns:
+            df_projects[col] = pd.Series(dtype=str)
+        else:
+            df_projects[col] = df_projects[col].astype(str)

# add values
df_projects.loc[p.name, "name"] = p.name
df_projects.loc[p.name, "samples"] = p.config_file
df_projects.loc[p.name, "rules"] = p.config_file
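The column-typing loop added to `workflow/BGC` above can be exercised on its own. A minimal sketch (the `proj_a` row and its paths are hypothetical; only the column names come from the diff), showing that pre-creating string-dtype columns keeps later `.loc` assignments as plain strings:

```python
import pandas as pd

df_projects = pd.DataFrame()

# Pre-create the columns as string dtype so later row-wise .loc
# assignments do not trigger dtype inference on an empty frame.
for col in ["name", "samples", "rules"]:
    if col not in df_projects.columns:
        df_projects[col] = pd.Series(dtype=str)
    else:
        df_projects[col] = df_projects[col].astype(str)

# Row-wise assignment, as done once per PEP project in the workflow.
df_projects.loc["proj_a", "name"] = "proj_a"
df_projects.loc["proj_a", "samples"] = "config/proj_a/project_config.yaml"

print(df_projects.loc["proj_a", "name"])  # → proj_a
```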
4 changes: 2 additions & 2 deletions workflow/Metabase
@@ -156,8 +156,8 @@ def setup_metabase(token, api_url, metabase_config):
     metabase_config = {
         "METABASE_MIN_MEMORY": "2g",
         "METABASE_MAX_MEMORY": "8g",
-        "METABASE_VERSION": "v0.47.0",
-        "METABASE_DUCKDB_PLUGIN_VERSION": "0.2.2",
+        "METABASE_VERSION": "v0.49.6",
+        "METABASE_DUCKDB_PLUGIN_VERSION": "0.2.6",
         "DMB_SETUP_TOKEN": "ad0fb086-351b-4fa5-a17e-76282d2c9753",
         "METABASE_HTTP": "http://localhost:3000",
         "MB_IS_METABOT_ENABLED" : "true"
119 changes: 91 additions & 28 deletions workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
@@ -10,34 +10,26 @@
logging.basicConfig(format=log_format, datefmt=date_format, level=logging.DEBUG)


-def bgc_downstream_prep(input_dir, output_dir, selected_bgcs=False):
+def generate_symlink(path, genome_id, output_dir, selected_bgcs=False):
"""
Given an antiSMASH directory, check for changed name
"""
logging.info(f"Reading input directory: {input_dir}")
path = Path(input_dir)
if not path.is_dir():
raise FileNotFoundError(f"No such file or directory: {path}")

genome_id = path.name
outpath = Path(output_dir) / genome_id
outpath.mkdir(parents=True, exist_ok=True)
logging.debug(f"Deducting genome id as {genome_id}")

change_log = {genome_id: {}}
ctr = 0
-    matches = [Path(i).stem for i in selected_bgcs.split()]
+    matches = selected_bgcs.stem
for gbk in path.glob("*.gbk"):
if gbk.stem in matches:
-            logging.debug(f"MATCH: {gbk.stem}")
+            logging.debug(f"Found match: {gbk.stem}")
filename = gbk.name
ctr = ctr + 1
logging.info(f"Parsing file: {gbk.name}")
region = SeqIO.parse(str(gbk), "genbank")
for record in region:
-                logging.info(f"{gbk} {record.id}")
+                logging.debug(f"Processing: {gbk.name}: {record.id}")
record_log = {}
-                if "comment" in record.annotations:
+                if "structured_comment" in record.annotations:
try:
original_id = record.annotations["structured_comment"][
"antiSMASH-Data"
@@ -47,7 +39,35 @@ def bgc_downstream_prep(input_dir, output_dir, selected_bgcs=False):
logging.warning(
f"Found shortened record.id: {record.id} <- {original_id}."
)
else:
raise ValueError(f"No Structured Comments in record: {gbk.name}")

if (":" in str(record.description)) or (":" in original_id):
logging.warning(
f"Illegal character ':' found in genbank description, removing: {record.description}"
)
# Remove colon from description
record.description = record.description.replace(":", "")
original_id = original_id.replace(":", "")

# Rename antiSMASH comment
if "structured_comment" in record.annotations:
if (
"Original ID"
in record.annotations["structured_comment"][
"antiSMASH-Data"
]
):
record.annotations["structured_comment"]["antiSMASH-Data"][
"Original ID"
] = original_id

# Write new GenBank file
new_filename = filename.replace(record.id, original_id)
with open(outpath / new_filename, "w") as output_handle:
SeqIO.write(record, output_handle, "genbank")
link = outpath / new_filename
else:
# generate symlink
new_filename = filename.replace(record.id, original_id)
target_path = Path.cwd() / gbk # target for symlink
@@ -64,23 +84,66 @@
link.unlink()
link.symlink_to(target_path)

record_log["record_id"] = record.id
record_log["original_id"] = original_id
record_log["target_path"] = str(gbk)
record_log["symlink_path"] = str(link)
else:
logging.warning(f"No Comments in record: {gbk.name}")
# Assert that the symlink was correctly generated
assert link.is_symlink(), f"Failed to create symlink: {link}"
assert (
link.resolve() == target_path
), f"Symlink {link} does not point to the correct target: {target_path}"

record_log["record_id"] = record.id
record_log["original_id"] = original_id
record_log["target_path"] = str(gbk)
record_log["symlink_path"] = str(link)

change_log = {filename: record_log}
return change_log


def bgc_downstream_prep(input_file, output_dir):
logging.info(f"Reading input file: {input_file}")
with open(input_file, "r") as file:
file_paths = [Path(f) for f in file.read().splitlines()]
change_log_containers = {}
for num, selected_bgcs in enumerate(file_paths):
input_dir = selected_bgcs.parent
logging.info(f"Reading input directory: {input_dir}")
path = Path(input_dir)
if not path.is_dir():
raise FileNotFoundError(f"No such file or directory: {path}")

# check if it has complete antiSMASH results
if (path / f"{path.name}.json").is_file():
logging.info("Found full antiSMASH record")
genome_id = path.name
else:
logging.warning("No full antiSMASH record found, unknown genome id")
genome_id = "unknown_genome_id"

change_log[genome_id][filename] = record_log
# assert 1+1==3
with open(
outpath / f"{genome_id}-change_log.json", "w", encoding="utf8"
) as json_file:
json.dump(change_log, json_file, indent=4)
assert selected_bgcs.exists(), f"File does not exist: {selected_bgcs}"
region_change_log = generate_symlink(path, genome_id, output_dir, selected_bgcs)
change_log_containers[num] = {
"genome_id": genome_id,
"value": region_change_log,
}
change_logs = {}
genome_ids = set(v["genome_id"] for v in change_log_containers.values())
for genome_id in genome_ids:
change_log = {}
for v in change_log_containers.values():
if v["genome_id"] == genome_id:
entry_name = list(v["value"].keys())[0]
change_log[entry_name] = v["value"][entry_name]
change_logs[genome_id] = change_log
logging.debug(change_logs)

logging.info(f"{genome_id}: Job done!\n")
return
for genome_id in change_logs.keys():
outpath = Path(output_dir) / genome_id
with open(
outpath / f"{genome_id}-change_log.json", "w", encoding="utf8"
) as json_file:
json.dump({genome_id: change_logs[genome_id]}, json_file, indent=4)
logging.info(f"{genome_id}: Job done!\n")


if __name__ == "__main__":
-    bgc_downstream_prep(sys.argv[1], sys.argv[2], sys.argv[3])
+    bgc_downstream_prep(sys.argv[1], sys.argv[2])
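The symlink handling rewritten in `generate_symlink` above unlinks any stale link, recreates it, and then asserts the result. A self-contained sketch of that pattern in a scratch directory (the GenBank file names are invented; note that `symlink_to` raises `OSError` on filesystems without symlink support):

```python
import tempfile
from pathlib import Path

workdir = Path(tempfile.mkdtemp())
target_path = workdir / "NC_000001.1.region001.gbk"  # hypothetical region file
target_path.write_text("LOCUS       NC_000001\n")

# Replace a stale link instead of failing with FileExistsError.
link = workdir / "NC_000001.1.region001.renamed.gbk"
if link.is_symlink():
    link.unlink()
link.symlink_to(target_path)

# Same sanity checks as the diff: the path is a symlink and it
# resolves to the intended target.
assert link.is_symlink(), f"Failed to create symlink: {link}"
assert link.resolve() == target_path.resolve(), f"Symlink {link} points elsewhere"
```

Asserting `link.resolve()` rather than comparing raw strings is what catches a symlink that was created relative to the wrong working directory.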
10 changes: 9 additions & 1 deletion workflow/bgcflow/bgcflow/data/get_dependencies.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import re
 import sys

 import yaml
@@ -45,9 +46,16 @@ def get_dependency_version(dep, dep_key, antismash_version="7"):
if dep_key in p:
if p.startswith("git+"):
                 result = p.split("@")[-1]
-                result = result.replace("-", ".")
+                if dep_key == "antismash" and "-" in result:
+                    result = re.sub(r"\-", ".", result, count=2).split("-")[0]
+                else:
+                    result = result.replace("-", ".")
else:
result = p.split("=")[-1]

logging.debug(f"Version of {dep_key} is: {result}")
return str(result)


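The antiSMASH branch added in `get_dependencies.py` above converts only the first two hyphens of a git tag into dots, so a packaging suffix like the trailing `-1` in `7-1-0-1` is dropped. A standalone sketch of that logic (the `tag_to_version` helper is illustrative, not part of the codebase):

```python
import re

def tag_to_version(tag, dep_key):
    # antiSMASH tags may carry an extra package revision (e.g. "7-1-0-1"):
    # turn only the first two hyphens into dots and drop the remainder.
    if dep_key == "antismash" and "-" in tag:
        return re.sub(r"\-", ".", tag, count=2).split("-")[0]
    # Other pinned tags are plain versions with hyphens for dots.
    return tag.replace("-", ".")

print(tag_to_version("7-1-0-1", "antismash"))  # → 7.1.0
print(tag_to_version("7-0-0", "antismash"))    # → 7.0.0
```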
2 changes: 1 addition & 1 deletion workflow/envs/antismash.yaml
@@ -40,4 +40,4 @@ dependencies:
   - yaml
   - pip
   - pip:
-    - git+https://github.com/antismash/antismash.git@7-0-0
+    - git+https://github.com/antismash/antismash.git@7-1-0-1
2 changes: 1 addition & 1 deletion workflow/envs/bigscape.yaml
@@ -6,7 +6,7 @@ dependencies:
   - python=3.6
   - dataclasses
   - hmmer
-  - biopython
+  - biopython=1.70
   - fasttree
   - numpy
   - scipy
5 changes: 3 additions & 2 deletions workflow/envs/cblaster.yaml
@@ -4,7 +4,8 @@ channels:
   - default
   - bioconda
 dependencies:
-  - diamond==2.0.15
+  - diamond==2.1.9
+  - python=3.8
   - pip
   - pip:
-    - cblaster==1.3.12
+    - cblaster==1.3.18
2 changes: 1 addition & 1 deletion workflow/envs/checkm.yaml
@@ -4,6 +4,6 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - checkm-genome=1.1.3
+  - checkm-genome==1.2.2
   - wget
   - tar
2 changes: 1 addition & 1 deletion workflow/envs/clinker.yaml
@@ -6,4 +6,4 @@ channels:
 dependencies:
   - pip
   - pip:
-    - clinker
+    - clinker==0.0.28
6 changes: 3 additions & 3 deletions workflow/envs/dbt-duckdb.yaml
@@ -5,9 +5,9 @@ channels:
   - defaults
 dependencies:
   - python==3.11
-  - python-duckdb==0.8.1
+  - python-duckdb==0.9.2
   - unzip
   - pip
   - pip:
-    - dbt-duckdb==1.6.0
-    - dbt-metabase==0.9.15
+    - dbt-duckdb==1.7.4
+    - dbt-metabase==1.3.0
2 changes: 1 addition & 1 deletion workflow/envs/getphylo.yaml
@@ -9,4 +9,4 @@ dependencies:
   - fasttree
   - pip
   - pip:
-    - getphylo
+    - getphylo==0.2.1
2 changes: 1 addition & 1 deletion workflow/envs/seqfu.yaml
@@ -4,4 +4,4 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - seqfu=1.15.3
+  - seqfu=1.20.3
2 changes: 1 addition & 1 deletion workflow/notebook/automlst-wrapper.rpy.ipynb
@@ -565,7 +565,7 @@
 " df_antismash[\"complete_bgcs\"] = df_antismash[\"bgcs_count\"] - df_antismash[\"bgcs_on_contig_edge\"]\n",
 " \n",
 " # Select the 'complete_bgcs' and 'bgcs_on_contig_edge' columns and convert them to integers\n",
-" df_antismash_completeness = df_antismash.loc[:, [\"complete_bgcs\", \"bgcs_on_contig_edge\"]].astype(int)\n",
+" df_antismash_completeness = df_antismash.loc[:, [\"complete_bgcs\", \"bgcs_on_contig_edge\"]].fillna(0).astype(int)\n",
 " \n",
 " # Define the output file path\n",
 " outfile = Path(f\"assets/iTOL_annotation/iTOL_antismash_{antismash_version}_completeness.txt\")\n",
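The `.fillna(0)` inserted before `.astype(int)` in the notebook diff guards against genomes with missing counts. A small sketch of the failure mode it fixes (column names come from the notebook; the values are made up):

```python
import numpy as np
import pandas as pd

# A NaN forces the column to float dtype, and casting non-finite values
# to int raises a ValueError (IntCastingNaNError in recent pandas).
df = pd.DataFrame({"complete_bgcs": [3.0, np.nan], "bgcs_on_contig_edge": [1, 0]})
try:
    df.loc[:, ["complete_bgcs", "bgcs_on_contig_edge"]].astype(int)
except ValueError as err:
    print(f"direct cast fails: {err}")

# Filling missing counts with 0 first makes the cast safe.
safe = df.loc[:, ["complete_bgcs", "bgcs_on_contig_edge"]].fillna(0).astype(int)
print(safe["complete_bgcs"].tolist())  # → [3, 0]
```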
6 changes: 4 additions & 2 deletions workflow/rules/antismash.smk
@@ -135,8 +135,10 @@ elif antismash_major_version >= 7:
                 --database {params.antismash_db_path} \
                 --cb-general --cb-subclusters --cb-knownclusters -c {threads} $antismash_input --logfile {log} 2>> {log}

-            # Check if the run failed due to changed detection results
-            if grep -q "ValueError: Detection results have changed. No results can be reused" {log}; then
+            # Check if the run failed due to changed detection results or changed protocluster types
+            if grep -q -e "ValueError: Detection results have changed. No results can be reused" \
+                -e "RuntimeError: Protocluster types supported by HMM detection have changed, all results invalid" {log}
+            then
                 # Use genbank input instead
                 echo "Previous JSON result is invalid, starting AntiSMASH from scratch..." >> {log}
                 antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \
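The widened error check in `antismash.smk` above can be exercised in isolation. This sketch writes one of the two fatal messages into a scratch file standing in for the Snakemake `{log}` and shows `grep -q -e … -e …` matching either message:

```shell
#!/usr/bin/env bash
set -euo pipefail

log="$(mktemp)"  # stand-in for the Snakemake {log} file
printf '%s\n' "RuntimeError: Protocluster types supported by HMM detection have changed, all results invalid" > "$log"

# grep -q exits 0 if either -e pattern matches, triggering the
# from-scratch rerun branch of the rule.
if grep -q \
    -e "ValueError: Detection results have changed. No results can be reused" \
    -e "RuntimeError: Protocluster types supported by HMM detection have changed, all results invalid" \
    "$log"
then
    echo "Previous JSON result is invalid, starting antiSMASH from scratch..."
fi

rm -f "$log"
```

Using two `-e` patterns keeps the check a single `grep` invocation instead of chaining two greps with `||`.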