From fc034e5cc0a462396a1824f2f379707ec5d5e852 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Mon, 6 May 2024 14:01:32 +0000 Subject: [PATCH 01/20] docs: update citation --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5852dd0a..093b43b0 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ At present, `BGCFlow` is only tested and confirmed to work on **Linux** systems with `conda` / `mamba` package manager. ## Publication -> Matin Nuhamunada, Omkar S. Mohite, Patrick V. Phaneuf, Bernhard O. Palsson, and Tilmann Weber. (2023). BGCFlow: Systematic pangenome workflow for the analysis of biosynthetic gene clusters across large genomic datasets. bioRxiv 2023.06.14.545018; doi: [https://doi.org/10.1101/2023.06.14.545018](https://doi.org/10.1101/2023.06.14.545018) +> Matin Nuhamunada, Omkar S Mohite, Patrick V Phaneuf, Bernhard O Palsson, Tilmann Weber, BGCFlow: systematic pangenome workflow for the analysis of biosynthetic gene clusters across large genomic datasets, Nucleic Acids Research, 2024;, gkae314, [https://doi.org/10.1093/nar/gkae314](https://doi.org/10.1093/nar/gkae314) ## Pre-requisites `BGCFlow` requires `gcc` and the `conda`/`mamba` package manager. See [installation instruction](https://github.com/NBChub/bgcflow/wiki/00-Installation-Guide) for details. @@ -27,7 +27,7 @@ A quick and easy way to use `BGCFlow` using the command line interface wrapper: ```bash # create and activate a new conda environment -conda create -n bgcflow -c conda-forge python=3.11 pip openjdk -y # also install java for metabase +mamba create -n bgcflow -c conda-forge python=3.11 pip openjdk -y # also install java for metabase conda activate bgcflow # install `BGCFlow` wrapper From 5deef98e1541eda4bb8e53e4c80896088da6b49f Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Wed, 8 May 2024 09:50:07 +0200 Subject: [PATCH 02/20] fix: upgrade bgc genome preparation script to handle weird input --- workflow/rules/bgc.smk | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/workflow/rules/bgc.smk b/workflow/rules/bgc.smk index 353b879e..b0ce9af7 100644 --- a/workflow/rules/bgc.smk +++ b/workflow/rules/bgc.smk @@ -5,6 +5,7 @@ rule downstream_bgc_prep: ), table="data/processed/{name}/tables/df_gtdb_meta.csv", output: + input_list=temp("data/interim/bgcs/{name}/{version}/input_list.txt"), taxonomy="data/interim/bgcs/taxonomy/taxonomy_{name}_antismash_{version}.tsv", outdir=directory("data/interim/bgcs/{name}/{version}"), bgc_mapping="data/interim/bgcs/{name}/{name}_antismash_{version}.csv", @@ -12,33 +13,29 @@ rule downstream_bgc_prep: "../envs/bgc_analytics.yaml" params: dataset="data/interim/bgcs/datasets.tsv", - log: - general="logs/bgcs/downstream_bgc_prep/{name}/downstream_bgc_prep-{version}.log", - symlink="logs/bgcs/downstream_bgc_prep/{name}/bgc_downstream_bgc_prep-{version}.log", - taxonomy="logs/bgcs/downstream_bgc_prep/{name}/tax_downstream_bgc_prep-{version}.log", + log: "logs/bgcs/downstream_bgc_prep/{name}/downstream_bgc_prep-{version}.log", shell: """ - echo "Preparing BGCs for {wildcards.name} downstream analysis..." > {log.general} - #mkdir -p {output.outdir} 2>> {log.general} - # Generate symlink for each regions in genomes in dataset - for i in $(dirname {input.gbk}) - do - echo Processing $i >> {log.symlink} - python workflow/bgcflow/bgcflow/data/bgc_downstream_prep.py $i {output.outdir} 2>> {log.symlink} - done - # generate taxonomic information for dataset - python workflow/bgcflow/bgcflow/data/bigslice_prep.py {input.table} {output.taxonomy} 2>> {log.taxonomy} + echo "Preparing BGCs for {wildcards.name} downstream analysis..." >> {log} + + echo "Step 1. Generate symlink for each regions in genomes in dataset" >> {log} + echo {input.gbk} | tr ' ' '\n' > {output.input_list} 2>> {log} + head -n 5 {output.input_list} >> {log} + python workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py {output.input_list} {output.outdir} 2>> {log} + + echo "Step 2. Generate taxonomic information for dataset" >> {log} + python workflow/bgcflow/bgcflow/data/bigslice_prep.py {input.table} {output.taxonomy} 2>> {log} # append new dataset information ## check if previous dataset exists if [[ -s {params.dataset} ]] then - echo "Previous dataset detected, appending dataset information for {wildcards.name}..." >> {log.symlink} - sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log.general} + echo "Previous dataset detected, appending dataset information for {wildcards.name}..." + sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log} else - echo "No previous dataset detected, generating dataset information for {wildcards.name}..." >> {log.symlink} - echo -e '# Dataset name\tPath to folder\tPath to taxonomy\tDescription' > {params.dataset} >> {log.general} - sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log.general} + echo "No previous dataset detected, generating dataset information for {wildcards.name}..." 2>> {log} + echo -e '# Dataset name\tPath to folder\tPath to taxonomy\tDescription' > {params.dataset} 2>> {log} + sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log} fi - # generate mapping for visualization - python workflow/bgcflow/bgcflow/data/get_bigscape_mapping.py {output.outdir} {output.bgc_mapping} 2>> {log.general} + echo "Step 3. Generate mapping for visualization" >> {log} + python workflow/bgcflow/bgcflow/data/get_bigscape_mapping.py {output.outdir} {output.bgc_mapping} 2>> {log} """ From d7998aace8ea5d7dd854bb51f96d8c14fcd814ae Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Wed, 8 May 2024 10:13:10 +0200 Subject: [PATCH 03/20] fix: update get_antismash_inputs function to retrieve region gbks --- workflow/rules/common.smk | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e276b493..fcdd7609 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -520,21 +520,29 @@ def get_prokka_refdb(genome_id, params, df_samples, mapping_file, config=config) return output -# bigscape.smk, bigslice.smk, and bgc_analytics.smk # def get_antismash_inputs(name, version, df_samples): """ - Given a project name, find the corresponding sample file to use + This function retrieves the list of antismash GenBank (.gbk) files for a given project. - Arguments: - name {str} -- project name - version {str} -- antismash version - df_samples {pd.DataFrame} -- sample table + It iterates over the sample table (DataFrame), selects the rows where the project name matches the provided name, + and for each matching sample, it constructs a path to the directory where the antismash files for that sample are stored. + It then collects all GenBank files in these directories that have 'region' in their name. + + Parameters: + name (str): The name of the project for which to retrieve the antismash files. + version (str): The version of antismash used to generate the files. + df_samples (pd.DataFrame): A DataFrame containing the sample table. It is expected to have a 'name' column. Returns: - output {list} -- list of antismash gbk files + output (list): A list of strings, where each string is the path to an antismash GenBank file for the given project. """ selection = [i for i in df_samples.index if name in df_samples.loc[i, "name"]] - output = [f"data/interim/antismash/{version}/{s}/{s}.gbk" for s in selection] + output = [] + for genome_id in selection: + genome_path = Path(f"data/interim/antismash/{version}/{genome_id}/") + region_genbanks = list(genome_path.glob("*.region*.gbk")) + for r in region_genbanks: + output.append(str(r)) return output From 6355679a5af110efc89af4e212683d96fd6d63d6 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Fri, 17 May 2024 12:40:48 +0200 Subject: [PATCH 04/20] fix: change input requirement for bgc downstream preparation --- .../data/bgc_downstream_prep_selection.py | 5 +++-- workflow/rules/bgc.smk | 17 ++++++++++------- workflow/rules/common.smk | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py index 88099515..8c4ce1f5 100644 --- a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py +++ b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py @@ -19,7 +19,7 @@ def generate_symlink(path, genome_id, output_dir, selected_bgcs=False): logging.debug(f"Deducting genome id as {genome_id}") ctr = 0 matches = selected_bgcs.stem - for gbk in path.glob("*.gbk"): + for gbk in path.glob("*region*.gbk"): if gbk.stem in matches: logging.debug(f"Found match: {gbk.stem}") filename = gbk.name @@ -125,6 +125,7 @@ def bgc_downstream_prep(input_file, output_dir): "genome_id": genome_id, "value": region_change_log, } + logging.info("Writing change logs...") change_logs = {} genome_ids = set(v["genome_id"] for v in change_log_containers.values()) for genome_id in genome_ids: @@ -134,7 +135,7 @@ def bgc_downstream_prep(input_file, output_dir): entry_name = list(v["value"].keys())[0] change_log[entry_name] = v["value"][entry_name] change_logs[genome_id] = change_log - logging.debug(change_logs) + logging.debug(f"Change log for {genome_id}: {change_log}") for genome_id in change_logs.keys(): outpath = Path(output_dir) / genome_id diff --git a/workflow/rules/bgc.smk b/workflow/rules/bgc.smk index b0ce9af7..48c782c8 100644 --- a/workflow/rules/bgc.smk +++ b/workflow/rules/bgc.smk @@ -1,11 +1,12 @@ rule downstream_bgc_prep: input: - gbk=lambda wildcards: get_antismash_inputs( - wildcards.name, wildcards.version, DF_SAMPLES + gbk=lambda wildcards: expand("data/interim/antismash/{version}/{strains}/{strains}.gbk", + version=wildcards.version, + strains=[s for s in PEP_PROJECTS[wildcards.name].sample_table.genome_id.unique()], ), table="data/processed/{name}/tables/df_gtdb_meta.csv", output: - input_list=temp("data/interim/bgcs/{name}/{version}/input_list.txt"), + input_list="data/interim/bgcs/{name}/{version}/input_list.txt", taxonomy="data/interim/bgcs/taxonomy/taxonomy_{name}_antismash_{version}.tsv", outdir=directory("data/interim/bgcs/{name}/{version}"), bgc_mapping="data/interim/bgcs/{name}/{name}_antismash_{version}.csv", @@ -13,14 +14,16 @@ rule downstream_bgc_prep: "../envs/bgc_analytics.yaml" params: dataset="data/interim/bgcs/datasets.tsv", + regions=lambda wildcards: get_antismash_regions( + wildcards.name, wildcards.version, DF_SAMPLES + ), log: "logs/bgcs/downstream_bgc_prep/{name}/downstream_bgc_prep-{version}.log", shell: """ echo "Preparing BGCs for {wildcards.name} downstream analysis..." >> {log} echo "Step 1. Generate symlink for each regions in genomes in dataset" >> {log} - echo {input.gbk} | tr ' ' '\n' > {output.input_list} 2>> {log} - head -n 5 {output.input_list} >> {log} + echo {params.regions} | tr ' ' '\n' >> {output.input_list} 2>> {log} python workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py {output.input_list} {output.outdir} 2>> {log} echo "Step 2. Generate taxonomic information for dataset" >> {log} @@ -29,10 +32,10 @@ rule downstream_bgc_prep: ## check if previous dataset exists if [[ -s {params.dataset} ]] then - echo "Previous dataset detected, appending dataset information for {wildcards.name}..." + echo "Previous dataset detected, appending dataset information for {wildcards.name}..." >> {log} sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log} else - echo "No previous dataset detected, generating dataset information for {wildcards.name}..." 2>> {log} + echo "No previous dataset detected, generating dataset information for {wildcards.name}..." >> {log} echo -e '# Dataset name\tPath to folder\tPath to taxonomy\tDescription' > {params.dataset} 2>> {log} sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log} fi diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index fcdd7609..84e81718 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -520,7 +520,7 @@ def get_prokka_refdb(genome_id, params, df_samples, mapping_file, config=config) return output -def get_antismash_inputs(name, version, df_samples): +def get_antismash_regions(name, version, df_samples): """ This function retrieves the list of antismash GenBank (.gbk) files for a given project. From 862900415b28a53a90d1ba106f95d3894519622b Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Wed, 29 May 2024 15:24:25 +0200 Subject: [PATCH 05/20] chore: correct typos --- workflow/rules/antismash.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/antismash.smk b/workflow/rules/antismash.smk index 1ecad295..4a7f4261 100644 --- a/workflow/rules/antismash.smk +++ b/workflow/rules/antismash.smk @@ -47,7 +47,7 @@ if antismash_major_version <= 6: """ antismash \ --genefinding-tool {params.genefinding} \ - --database {params.antismash_db_path,} \ + --database {params.antismash_db_path} \ --output-dir {params.folder} \ --cb-general \ --cb-subclusters \ From ac20d03546da67aae924e23bc113fa5ff119dfe3 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Thu, 30 May 2024 13:30:33 +0000 Subject: [PATCH 06/20] fix: reinclude full antiSMASH gbks for downstream process --- workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py | 3 ++- workflow/rules/bgc.smk | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py index 8c4ce1f5..bff37fa5 100644 --- a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py +++ b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py @@ -18,8 +18,9 @@ def generate_symlink(path, genome_id, output_dir, selected_bgcs=False): outpath.mkdir(parents=True, exist_ok=True) logging.debug(f"Deducting genome id as {genome_id}") ctr = 0 + change_log = None matches = selected_bgcs.stem - for gbk in path.glob("*region*.gbk"): + for gbk in path.glob("*.gbk"): if gbk.stem in matches: logging.debug(f"Found match: {gbk.stem}") filename = gbk.name diff --git a/workflow/rules/bgc.smk b/workflow/rules/bgc.smk index 48c782c8..538d829d 100644 --- a/workflow/rules/bgc.smk +++ b/workflow/rules/bgc.smk @@ -23,6 +23,7 @@ rule downstream_bgc_prep: echo "Preparing BGCs for {wildcards.name} downstream analysis..." >> {log} echo "Step 1. Generate symlink for each regions in genomes in dataset" >> {log} + echo {input.gbk} | tr ' ' '\n' >> {output.input_list} 2>> {log} echo {params.regions} | tr ' ' '\n' >> {output.input_list} 2>> {log} python workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py {output.input_list} {output.outdir} 2>> {log} From 0b65b42b40ba1a64500d2d95db3545d0218255c4 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Thu, 30 May 2024 13:36:32 +0000 Subject: [PATCH 07/20] feat: use database schema 0.3.1 --- workflow/rules/build-database.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/build-database.smk b/workflow/rules/build-database.smk index 5318c948..6932a688 100644 --- a/workflow/rules/build-database.smk +++ b/workflow/rules/build-database.smk @@ -105,7 +105,7 @@ rule get_dbt_template: params: dbt = "data/processed/{name}/dbt/antiSMASH_{version}", dbt_repo = "https://github.com/NBChub/bgcflow_dbt-duckdb", - release = "0.2.1", + release = "0.3.1", cutoff = "0.30", as_version = "{version}" shell: From 0ac724262aa77e2ccea5b06af99357d6f68e5788 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Mon, 3 Jun 2024 18:43:10 +0000 Subject: [PATCH 08/20] feat: enable parameter to change taxon in antismash --- workflow/rules/antismash.smk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflow/rules/antismash.smk b/workflow/rules/antismash.smk index 4a7f4261..2746391d 100644 --- a/workflow/rules/antismash.smk +++ b/workflow/rules/antismash.smk @@ -112,6 +112,7 @@ elif antismash_major_version >= 7: folder=directory("data/interim/antismash/{version}/{strains}/"), antismash_db_path=antismash_db_path, genefinding="none", + taxon="bacteria", shell: """ set +e @@ -132,7 +133,7 @@ elif antismash_major_version >= 7: # Run AntiSMASH antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \ - --database {params.antismash_db_path} \ + --database {params.antismash_db_path} --taxon {params.taxon} \ --cb-general --cb-subclusters --cb-knownclusters -c {threads} $antismash_input --logfile {log} 2>> {log} # Check if the run failed due to changed detection results or changed protocluster types @@ -142,7 +143,7 @@ elif antismash_major_version >= 7: # Use genbank input instead echo "Previous JSON result is invalid, starting AntiSMASH from scratch..." >> {log} antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \ - --database {params.antismash_db_path} \ + --database {params.antismash_db_path} --taxon {params.taxon} \ --cb-general --cb-subclusters --cb-knownclusters -c {threads} {input.gbk} --logfile {log} 2>> {log} fi """ From 340c9478bb42236adbced2401450a1571b64f92d Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Mon, 3 Jun 2024 18:43:36 +0000 Subject: [PATCH 09/20] chore: include .gbff as recognized format --- workflow/rules/convert_genbank.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/convert_genbank.smk b/workflow/rules/convert_genbank.smk index 865d8311..cf69c5b0 100644 --- a/workflow/rules/convert_genbank.smk +++ b/workflow/rules/convert_genbank.smk @@ -9,7 +9,7 @@ if len(CUSTOM_GENBANK) > 0: log: "logs/prokka/copy_custom_fasta/copy_custom_fasta-{custom_genbank}.log" shell: """ - if [[ {input} == *.gb || {input} == *.gbk || {input} == *.genbank ]] + if [[ {input} == *.gb || {input} == *.gbk || {input} == *.genbank || {input} == *.gbff ]] then cp {input} {output} 2>> {log} else From db9d42cad1e1068c5a0ff0aceb5cf03cee19c725 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Tue, 4 Jun 2024 15:39:31 +0000 Subject: [PATCH 10/20] fix: update lsabgc environment --- workflow/envs/lsabgc.post-deploy.sh | 2 +- workflow/envs/lsabgc.yaml | 393 ++++++++++++++++++---------- 2 files changed, 263 insertions(+), 132 deletions(-) diff --git a/workflow/envs/lsabgc.post-deploy.sh b/workflow/envs/lsabgc.post-deploy.sh index b0838581..024956bc 100644 --- a/workflow/envs/lsabgc.post-deploy.sh +++ b/workflow/envs/lsabgc.post-deploy.sh @@ -3,7 +3,7 @@ resource_dir="resources" output_lsabgc="$resource_dir/lsaBGC" repository="https://github.com/Kalan-Lab/lsaBGC" -version="1.40.0" +version="1.52" release="$repository/archive/refs/tags/v$version.tar.gz" log="logs/lsabgc/install.log" diff --git a/workflow/envs/lsabgc.yaml b/workflow/envs/lsabgc.yaml index 89b980a2..6dd8d1bf 100644 --- a/workflow/envs/lsabgc.yaml +++ b/workflow/envs/lsabgc.yaml @@ -1,257 +1,383 @@ -name: lsabgc_env +name: lsaBGC channels: - - conda-forge - - bioconda - defaults + - bioconda + - conda-forge dependencies: - - cython==3.0.0 - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu + - _openmp_mutex=4.5=2_kmp_llvm - _r-mutex=1.0.1=anacondar_1 - alsa-lib=1.2.3.2=h166bdaf_0 - appdirs=1.4.4=pyh9f0ad1d_0 - - archspec=0.2.1=pyhd8ed1ab_0 + - aragorn=1.2.41=h031d066_2 + - archspec=0.2.3=pyhd8ed1ab_0 + - argcomplete=3.3.0=pyhd8ed1ab_0 + - argh=0.31.2=pyhd8ed1ab_0 + - argtable2=2.13=hd590300_1004 - aria2=1.36.0=h8b6cd97_3 + - arpack=3.7.0=hdefa2d7_2 + - attr=2.5.1=h166bdaf_1 + - barrnap=0.9=hdfd78af_4 - bc=1.07.1=h7f98852_0 + - bedtools=2.31.1=hf5e1c6e_1 - binutils=2.39=hdd6e379_1 - binutils_impl_linux-64=2.39=he00db2b_1 - binutils_linux-64=2.39=h5fc0e48_13 + - biocode=0.11.0=pyhdfd78af_0 - bioconductor-ggtree=3.2.0=r41hdfd78af_0 - bioconductor-treeio=1.18.0=r41hdfd78af_0 - biopython=1.79=py39hb9d737c_3 - - blast=2.14.1=pl5321h6f7f691_0 + - blast=2.15.0=pl5321h6f7f691_1 + - blast-legacy=2.2.26=h9ee0642_3 - boost-cpp=1.74.0=h6cacc03_7 - bowtie2=2.4.5=py39h3321a2d_4 - - brotli-python=1.1.0=py39h3d6467e_0 + - brotli=1.1.0=hd590300_1 + - brotli-bin=1.1.0=hd590300_1 + - brotli-python=1.1.0=py39h3d6467e_1 - bwidget=1.9.14=ha770c72_1 - - bzip2=1.0.8=h7f98852_4 - - c-ares=1.19.1=hd590300_0 + - bzip2=1.0.8=hd590300_5 + - c-ares=1.28.1=hd590300_0 - c-compiler=1.3.0=h7f98852_0 - - ca-certificates=2023.7.22=hbcca054_0 + - ca-certificates=2024.6.2=hbcca054_0 - cairo=1.16.0=ha12eb4b_1010 - capnproto=0.10.2=h6239696_0 - - certifi=2023.7.22=pyhd8ed1ab_0 - - charset-normalizer=3.2.0=pyhd8ed1ab_0 - - click=8.1.7=unix_pyh707e725_0 + - cd-hit=4.8.1=h43eeafb_10 + - certifi=2024.2.2=pyhd8ed1ab_0 + - charset-normalizer=3.3.2=pyhd8ed1ab_0 + - clustalo=1.2.4=hdbdd923_8 + - clustalw=2.1=h4ac6f70_10 - colorama=0.4.6=pyhd8ed1ab_0 + - contourpy=1.2.1=py39h7633fee_0 - coreutils=9.1=h166bdaf_0 - - curl=7.86.0=h7bff187_1 + - curl=7.87.0=h5eee18b_0 - cxx-compiler=1.3.0=h4bd325d_0 - - dataclasses=0.8=pyhc8e2a94_3 + - cycler=0.12.1=pyhd8ed1ab_0 - dbus=1.13.6=h5008d03_3 - - decorator=5.1.1=pyhd8ed1ab_0 + - dendropy=5.0.1=pyhdfd78af_0 - diamond=2.0.15=hb97b32f_1 - docopt=0.6.2=py_1 - - dos2unix=7.4.1=0 - - entrez-direct=16.2=he881be0_1 + - dos2unix=7.5.2=ha770c72_3 + - entrez-direct=21.6=he881be0_0 - ete3=3.1.2=pyh9f0ad1d_0 - - expat=2.5.0=hcb278e6_1 + - expat=2.6.2=h59595ed_0 - fastme=2.1.6.1=h031d066_3 - - fasttree=2.1.11=h031d066_2 + - fasttree=2.1.11=h031d066_3 + - fftw=3.3.10=nompi_hc118613_108 - file=5.39=h753d276_1 - filetype=1.2.0=pyhd8ed1ab_0 - - fisher=0.1.14=py39h44dd56e_0 + - fisher=0.1.14=py39h44dd56e_1 - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 - font-ttf-inconsolata=3.000=h77eed37_0 - font-ttf-source-code-pro=2.038=h77eed37_0 - - font-ttf-ubuntu=0.83=hab24e00_0 + - font-ttf-ubuntu=0.83=h77eed37_2 - fontconfig=2.14.2=h14ed4e7_0 - fonts-conda-ecosystem=1=0 - fonts-conda-forge=1=0 + - fonttools=4.53.0=py39hd3abc70_0 - freetype=2.12.1=h267a509_2 - fribidi=1.0.10=h36c2ea0_0 - - future=0.18.3=pyhd8ed1ab_0 + - future=1.0.0=pyhd8ed1ab_0 - gawk=5.1.0=h7f98852_0 - gcc=9.5.0=h1fea6ba_13 - gcc_impl_linux-64=9.5.0=h99780fb_19 - gcc_linux-64=9.5.0=h4258300_13 - gdbm=1.18=h0a1914f_2 - gecco=0.9.8=pyhdfd78af_0 - - gensim=4.3.2=py39hddac248_0 - - gettext=0.21.1=h27087fc_0 + - gensim=4.3.2=py39hddac248_1 + - gettext=0.22.5=h59595ed_2 + - gettext-tools=0.22.5=h59595ed_2 + - gffutils=0.13=pyh7cba7a3_0 - gfortran_impl_linux-64=9.5.0=hf1096a2_19 - gfortran_linux-64=9.5.0=hdb51d14_13 + - glib=2.74.1=h6239696_0 + - glib-tools=2.74.1=h6239696_0 - glpk=5.0=h445213a_0 - - gmp=6.2.1=h58526e2_0 - - graphite2=1.3.13=h58526e2_1001 + - gmp=6.3.0=h59595ed_1 + - graphite2=1.3.13=h59595ed_1003 - gsl=2.7=he838d99_0 - gst-plugins-base=1.18.5=hf529b03_3 - - gstreamer=1.18.5=h9f60fe5_3 - - gtotree=1.8.2=h9ee0642_0 + - gstreamer=1.20.3=hd4edc92_2 + - gtotree=1.8.6=h9ee0642_0 - gxx=9.5.0=h1fea6ba_13 - gxx_impl_linux-64=9.5.0=h99780fb_19 - gxx_linux-64=9.5.0=h43f449f_13 - - gzip=1.12=h166bdaf_0 + - gzip=1.13=hd590300_0 - harfbuzz=4.2.0=h40b6f09_0 - hmmer=3.3.2=hdbdd923_4 - - htslib=1.14=h9093b5e_0 + - htslib=1.17=h6bc39ce_1 - icu=69.1=h9c3ff4c_0 - - idna=3.4=pyhd8ed1ab_0 - - importlib-metadata=6.8.0=pyha770c72_0 - - importlib_metadata=6.8.0=hd8ed1ab_0 - - importlib_resources=6.1.0=pyhd8ed1ab_0 - - iqtree=2.2.5=h21ec9f0_0 - - jbig=2.1=h7f98852_2003 - - joblib=1.3.2=pyhd8ed1ab_0 + - idna=3.7=pyhd8ed1ab_0 + - importlib-metadata=7.1.0=pyha770c72_0 + - importlib-resources=6.4.0=pyhd8ed1ab_0 + - importlib_metadata=7.1.0=hd8ed1ab_0 + - importlib_resources=6.4.0=pyhd8ed1ab_0 + - infernal=1.1.5=pl5321h031d066_1 + - intbitset=3.0.2=py39hd1e30aa_1 + - iqtree=2.3.4=h21ec9f0_0 + - jack=1.9.18=hfd4fe87_1001 + - jinja2=3.1.4=pyhd8ed1ab_0 + - joblib=1.4.2=pyhd8ed1ab_0 - jpeg=9e=h0b41bf4_3 - - kernel-headers_linux-64=2.6.32=he073ed8_16 + - kernel-headers_linux-64=2.6.32=he073ed8_17 - keyutils=1.6.1=h166bdaf_0 + - kiwisolver=1.4.5=py39h7633fee_1 - kofamscan=1.3.0=hdfd78af_2 - krb5=1.19.3=h3790be6_0 + - lcms2=2.14=h6ed2654_0 - ld_impl_linux-64=2.39=hcc3a1bd_1 - - lerc=2.2.1=h9c3ff4c_0 - - libblas=3.9.0=18_linux64_openblas - - libcblas=3.9.0=18_linux64_openblas - - libclang=13.0.1=default_h7634d5b_3 - - libcurl=7.86.0=h7bff187_1 - - libdeflate=1.7=h7f98852_5 + - lerc=4.0.0=h27087fc_0 + - libasprintf=0.22.5=h661eb56_2 + - libasprintf-devel=0.22.5=h661eb56_2 + - libblas=3.9.0=22_linux64_openblas + - libbrotlicommon=1.1.0=hd590300_1 + - libbrotlidec=1.1.0=hd590300_1 + - libbrotlienc=1.1.0=hd590300_1 + - libcap=2.64=ha37c62d_0 + - libcblas=3.9.0=22_linux64_openblas + - libclang=13.0.1=default_h7634d5b_6 + - libcups=2.3.3=h3e49a29_2 + - libcurl=7.87.0=h91b91d3_0 + - libdb=6.2.32=h9c3ff4c_0 + - libdeflate=1.14=h166bdaf_0 - libedit=3.1.20191231=he28a2e2_2 - - libev=4.33=h516909a_1 + - libev=4.33=hd590300_2 - libevent=2.1.10=h9b69904_4 - - libexpat=2.5.0=hcb278e6_1 + - libexpat=2.6.2=h59595ed_0 - libffi=3.4.2=h7f98852_5 + - libflac=1.3.4=h27087fc_0 - libgcc-devel_linux-64=9.5.0=h0a57e50_19 - - libgcc-ng=13.2.0=h807b86a_2 - - libgfortran-ng=13.2.0=h69a702a_2 - - libgfortran5=13.2.0=ha4646dd_2 + - libgcc-ng=13.2.0=h77fa898_7 + - libgettextpo=0.22.5=h59595ed_2 + - libgettextpo-devel=0.22.5=h59595ed_2 + - libgfortran=3.0.0=1 + - libgfortran-ng=13.2.0=h69a702a_7 + - libgfortran5=13.2.0=hca663fb_7 - libglib=2.74.1=h7a41b64_0 - - libgomp=13.2.0=h807b86a_2 + - libgomp=13.2.0=h77fa898_7 - libhwloc=2.8.0=h32351e8_1 - - libiconv=1.17=h166bdaf_0 - - libidn2=2.3.4=h166bdaf_0 - - liblapack=3.9.0=18_linux64_openblas + - libiconv=1.17=hd590300_2 + - libidn11=1.34=h1cef754_0 + - libidn2=2.3.7=hd590300_0 + - liblapack=3.9.0=22_linux64_openblas - libllvm13=13.0.1=hf817b99_2 + - libllvm14=14.0.6=hcd5def8_4 - libmagic=5.39=h753d276_1 - libnghttp2=1.51.0=hdcd2b5c_0 - - libnsl=2.0.0=h7f98852_0 + - libnsl=2.0.1=hd590300_0 - libogg=1.3.4=h7f98852_1 - - libopenblas=0.3.24=pthreads_h413a1c8_0 + - libopenblas=0.3.27=pthreads_h413a1c8_0 - libopus=1.3.1=h7f98852_1 - - libpng=1.6.39=h753d276_0 + - libpng=1.6.43=h2797004_0 - libpq=14.5=h72a31a5_3 - libsanitizer=9.5.0=h2f262e1_19 - - libsqlite=3.43.0=h2797004_0 + - libsndfile=1.0.31=h9c3ff4c_1 + - libsqlite=3.45.3=h2797004_0 - libssh2=1.10.0=haa6b8db_3 - libstdcxx-devel_linux-64=9.5.0=h0a57e50_19 - - libstdcxx-ng=13.2.0=h7e041cc_2 - - libtiff=4.3.0=hf544144_1 + - libstdcxx-ng=13.2.0=hc0a3c3a_7 + - libtiff=4.4.0=h82bc61c_5 + - libtool=2.4.7=h27087fc_0 + - libudev1=253=h0b41bf4_0 - libunistring=0.9.10=h7f98852_0 - libuuid=2.38.1=h0b41bf4_0 - libvorbis=1.3.7=h9c3ff4c_0 - - libwebp-base=1.3.2=hd590300_0 + - libwebp-base=1.4.0=hd590300_0 - libxcb=1.13=h7f98852_1004 + - libxcrypt=4.4.36=hd590300_1 - libxkbcommon=1.0.3=he3ba5ed_0 - libxml2=2.9.14=haae042b_4 - libxslt=1.1.33=h0ef7038_3 - - libzlib=1.2.13=hd590300_5 + - libzlib=1.2.13=h4ab18f5_6 + - llvm-openmp=18.1.6=ha31de31_0 + - llvmlite=0.42.0=py39h174d805_1 - lxml=4.8.0=py39hb9d737c_3 - mafft=7.505=hec16e2b_0 - make=4.3=hd18ef5c_1 - markdown-it-py=3.0.0=pyhd8ed1ab_0 + - markupsafe=2.1.5=py39hd1e30aa_0 - mash=2.3=hd3113c8_6 + - matplotlib-base=3.8.4=py39h10d1fc8_2 - mcl=14.137=pl5321h031d066_9 - - mdurl=0.1.0=pyhd8ed1ab_0 - - mmseqs2=14.7e284=pl5321h6a68c12_2 + - mdurl=0.1.2=pyhd8ed1ab_0 + - metis=5.1.0=h59595ed_1007 + - minced=0.4.2=hdfd78af_1 + - mkl=2024.1.0=ha957f24_693 + - mmseqs2=15.6f452=pl5321h6a68c12_2 + - mpfr=4.2.1=h9458935_1 - mpi=1.0=openmpi + - munkres=1.1.4=pyh9f0ad1d_0 - muscle=5.1=h4ac6f70_3 - - mypy=1.5.1=py39hd1e30aa_1 + - mypy=1.10.0=py39hd3abc70_0 - mypy_extensions=1.0.0=pyha770c72_0 - mysql-common=8.0.32=h14678bc_0 - mysql-libs=8.0.32=h54cf53e_0 - n50=1.5.8=pl5321hdfd78af_0 - ncbi-genome-download=0.3.3=pyh7cba7a3_0 - - ncbi-vdb=3.0.8=hdbdd923_0 + - ncbi-vdb=3.1.1=h4ac6f70_0 - ncurses=6.2=h58526e2_4 - - networkx=3.1=pyhd8ed1ab_0 + - networkx=3.2.1=pyhd8ed1ab_0 - nspr=4.35=h27087fc_0 - - nss=3.92=h1d7d5a4_0 - - numpy=1.26.0=py39h474f0d3_0 - - openmpi=4.1.5=h414af15_101 + - nss=3.100=hca3bf56_0 + - numba=0.59.1=py39h615d6bd_0 + - numpy=1.26.4=py39h474f0d3_0 + - openjdk=11.0.1=h516909a_1016 + - openjpeg=2.5.0=h7d73246_1 + - openmpi=4.1.6=hc5af2df_101 - openssl=1.1.1w=hd590300_0 - orthofinder=2.5.4=hdfd78af_0 - - ossuuid=1.6.2=hf484d3e_1000 - - packaging=23.1=pyhd8ed1ab_0 + - packaging=24.0=pyhd8ed1ab_0 - pal2nal=14.1=pl5321hdfd78af_3 + - paml=4.10.7=h031d066_1 + - panaroo=1.5.0=pyhdfd78af_0 - pandas=1.4.2=py39h1832856_2 - pango=1.50.7=hbd2fdc8_0 - - parallel=20230722=ha770c72_0 - - patsy=0.5.3=pyhd8ed1ab_0 + - parallel=20240522=ha770c72_0 + - patsy=0.5.6=pyhd8ed1ab_0 - pcre=8.45=h9c3ff4c_0 - pcre2=10.37=hc3806b6_1 - - perl=5.32.1=4_hd590300_perl5 - - perl-alien-build=2.48=pl5321hec16e2b_0 - - perl-alien-libxml2=0.17=pl5321hec16e2b_0 + - peewee=3.17.3=py39h21eaaa1_0 + - perl=5.32.1=7_hd590300_perl5 + - perl-algorithm-diff=1.201=pl5321hd8ed1ab_0 - perl-archive-tar=2.40=pl5321hdfd78af_0 + - perl-base=2.23=pl5321hd8ed1ab_0 + - perl-bio-asn1-entrezgene=1.73=pl5321hdfd78af_3 + - perl-bio-coordinate=1.007001=pl5321hdfd78af_3 + - perl-bio-featureio=1.6.905=pl5321hdfd78af_4 + - perl-bio-samtools=1.43=pl5321he4a0461_4 + - perl-bio-searchio-hmmer=1.7.3=pl5321hdfd78af_0 + - perl-bio-tools-phylo-paml=1.7.3=pl5321hdfd78af_3 + - perl-bio-tools-run-alignment-clustalw=1.7.4=pl5321hdfd78af_3 + - perl-bio-tools-run-alignment-tcoffee=1.7.4=pl5321hdfd78af_5 + - perl-bioperl=1.7.8=hdfd78af_1 + - perl-bioperl-core=1.7.8=pl5321hdfd78af_1 + - perl-bioperl-run=1.007003=pl5321hdfd78af_0 - perl-business-isbn=3.007=pl5321hd8ed1ab_0 - perl-business-isbn-data=20210112.006=pl5321hd8ed1ab_0 - perl-capture-tiny=0.48=pl5321ha770c72_1 - perl-carp=1.50=pl5321hd8ed1ab_0 + - perl-class-data-inheritable=0.09=pl5321ha770c72_0 - perl-common-sense=3.75=pl5321hd8ed1ab_0 - perl-compress-raw-bzip2=2.201=pl5321h166bdaf_0 - perl-compress-raw-zlib=2.202=pl5321h166bdaf_0 - perl-constant=1.33=pl5321hd8ed1ab_0 - - perl-encode=3.19=pl5321h166bdaf_0 + - perl-data-dumper=2.183=pl5321hd590300_0 + - perl-db_file=1.858=pl5321h166bdaf_0 + - perl-devel-stacktrace=2.04=pl5321ha770c72_0 + - perl-digest-hmac=1.04=pl5321hdfd78af_0 + - perl-digest-md5=2.58=pl5321h166bdaf_0 + - perl-encode=3.21=pl5321hd590300_0 + - perl-encode-locale=1.05=pl5321hdfd78af_7 + - perl-exception-class=1.45=pl5321ha770c72_0 - perl-exporter=5.74=pl5321hd8ed1ab_0 - perl-exporter-tiny=1.002002=pl5321hd8ed1ab_0 - perl-extutils-makemaker=7.70=pl5321hd8ed1ab_0 - - perl-fastx-reader=1.11.0=pl5321hdfd78af_0 - - perl-ffi-checklib=0.28=pl5321hdfd78af_0 - - perl-file-chdir=0.1011=pl5321hd8ed1ab_0 + - perl-fastx-reader=1.12.0=pl5321hdfd78af_0 + - perl-file-listing=6.16=pl5321hdfd78af_0 - perl-file-path=2.18=pl5321hd8ed1ab_0 + - perl-file-slurp-tiny=0.004=pl5321hdfd78af_2 + - perl-file-sort=1.01=pl5321hdfd78af_3 + - perl-file-spec=3.48_01=pl5321hdfd78af_2 - perl-file-temp=0.2304=pl5321hd8ed1ab_0 - perl-file-which=1.24=pl5321hd8ed1ab_0 - perl-getopt-long=2.54=pl5321hdfd78af_0 - - perl-importer=0.026=pl5321hd8ed1ab_0 + - perl-html-parser=3.81=pl5321h4ac6f70_1 + - perl-html-tagset=3.20=pl5321hdfd78af_4 + - perl-http-cookies=6.10=pl5321hdfd78af_0 + - perl-http-daemon=6.16=pl5321hdfd78af_0 + - perl-http-date=6.06=pl5321hdfd78af_0 + - perl-http-message=6.36=pl5321hdfd78af_0 + - perl-http-negotiate=6.01=pl5321hdfd78af_4 + - perl-inc-latest=0.500=pl5321ha770c72_0 - perl-io-compress=2.201=pl5321hdbdd923_2 + - perl-io-html=1.004=pl5321hdfd78af_0 + - perl-io-socket-ssl=2.075=pl5321hd8ed1ab_0 + - perl-io-string=1.08=pl5321hdfd78af_4 + - perl-io-tty=1.16=pl5321h166bdaf_0 - perl-io-zlib=1.14=pl5321hdfd78af_0 + - perl-ipc-run=20200505.0=pl5321hdfd78af_0 - perl-json=4.10=pl5321hdfd78af_0 - perl-json-pp=4.11=pl5321hd8ed1ab_0 - perl-json-xs=2.34=pl5321h4ac6f70_6 + - perl-libwww-perl=6.67=pl5321hdfd78af_0 + - perl-libxml-perl=0.08=pl5321hdfd78af_3 - perl-list-moreutils=0.430=pl5321hdfd78af_0 - perl-list-moreutils-xs=0.430=pl5321h031d066_2 + - perl-lwp-mediatypes=6.04=pl5321hdfd78af_1 + - perl-mime-base64=3.16=pl5321h166bdaf_0 + - perl-module-build=0.4234=pl5321ha770c72_0 + - perl-net-http=6.22=pl5321hdfd78af_0 + - perl-net-ssleay=1.92=pl5321haa6b8db_1 + - perl-ntlm=1.09=pl5321hdfd78af_5 - perl-parent=0.241=pl5321hd8ed1ab_0 - - perl-path-tiny=0.124=pl5321hd8ed1ab_0 - perl-pathtools=3.75=pl5321h166bdaf_0 - perl-perlio-encoding=0.18=pl5321hdfd78af_2 - perl-pod-escapes=1.07=pl5321hdfd78af_2 - perl-pod-usage=2.03=pl5321hdfd78af_0 - perl-scalar-list-utils=1.63=pl5321h166bdaf_0 - - perl-scope-guard=0.21=pl5321hd8ed1ab_0 + - perl-socket=2.027=pl5321h031d066_4 - perl-storable=3.15=pl5321h166bdaf_0 - - perl-sub-info=0.002=pl5321hd8ed1ab_0 - - perl-term-table=0.016=pl5321hdfd78af_0 + - perl-sub-uplevel=0.2800=pl5321h166bdaf_0 - perl-test=1.26=pl5321hd8ed1ab_0 + - perl-test-deep=1.130=pl5321hd8ed1ab_0 + - perl-test-differences=0.71=pl5321ha770c72_0 + - perl-test-exception=0.43=pl5321hd8ed1ab_0 - perl-test-fatal=0.016=pl5321ha770c72_0 - perl-test-harness=3.44=pl5321hd8ed1ab_0 + - perl-test-most=0.38=pl5321hdfd78af_0 + - perl-test-warn=0.37=pl5321hd8ed1ab_0 - perl-test-warnings=0.031=pl5321ha770c72_0 - - perl-test2-suite=0.000145=pl5321hdfd78af_0 - perl-text-asciitable=0.22=pl5321hdfd78af_3 + - perl-text-diff=1.45=pl5321hd8ed1ab_0 + - perl-time-local=1.35=pl5321hdfd78af_0 + - perl-timedate=2.33=pl5321hdfd78af_2 + - perl-tree-dag_node=1.32=pl5321hdfd78af_0 - perl-try-tiny=0.31=pl5321ha770c72_0 - perl-types-serialiser=1.01=pl5321hdfd78af_0 - perl-uri=5.17=pl5321ha770c72_0 - - perl-xml-libxml=2.0207=pl5321h661654b_0 + - perl-url-encode=0.03=pl5321h9ee0642_0 + - perl-www-robotrules=6.02=pl5321hdfd78af_4 + - perl-xml-dom=1.46=pl5321hdfd78af_1 + - perl-xml-dom-xpath=0.14=pl5321hdfd78af_2 - perl-xml-namespacesupport=1.12=pl5321hd8ed1ab_0 + - perl-xml-parser=2.44_01=pl5321hc3e0081_1003 + - perl-xml-regexp=0.04=pl5321hdfd78af_3 - perl-xml-sax=1.02=pl5321hd8ed1ab_0 - perl-xml-sax-base=1.09=pl5321hd8ed1ab_0 - - pixman=0.40.0=h36c2ea0_0 - - polars=0.19.3=py39h903e532_0 + - perl-xml-sax-expat=0.51=pl5321hd8ed1ab_0 + - perl-xml-simple=2.25=pl5321hdfd78af_2 + - perl-xml-xpathengine=0.14=pl5321hdfd78af_3 + - pillow=9.2.0=py39hf3a2cdf_3 + - pip=22.1.2=pyhd8ed1ab_0 + - pixman=0.43.2=h59595ed_0 + - plotly=5.22.0=pyhd8ed1ab_0 + - poa=2.0=h031d066_5 + - polars=0.20.31=py39ha963410_0 - pomegranate=0.13.3=py39h1a9c180_3 - - prodigal=2.6.3=h031d066_6 - - psutil=5.9.5=py39hd1e30aa_1 + - prank=170427=h4ac6f70_0 + - prodigal=2.6.3=h031d066_8 + - prokka=1.14.6=pl5321hdfd78af_5 + - psutil=5.9.8=py39hd1e30aa_0 - pthread-stubs=0.4=h36c2ea0_1001 - - pygments=2.16.1=pyhd8ed1ab_0 - - pyhmmer=0.10.2=py39hf95cd2a_0 - - pyqt=5.12.3=py39h03dd644_4 - - pyrodigal=3.0.0=py39hf95cd2a_0 + - pulseaudio=14.0=hbc9ff1d_7 + - pyfaidx=0.8.1.1=pyhdfd78af_0 + - pygments=2.18.0=pyhd8ed1ab_0 + - pyhmmer=0.10.12=py39hf95cd2a_0 + - pyparsing=3.1.2=pyhd8ed1ab_0 + - pyqt=5.15.4=py39h5a03fae_0 + - pyqt5-sip=12.9.0=py39h5a03fae_0 + - pyrodigal=2.3.0=py39hf95cd2a_1 - pysam=0.16.0.1=py39h051187c_3 - pysocks=1.7.1=pyha2e5f31_6 - python=3.9.9=h62f1059_0_cpython - - python-crfsuite=0.9.9=py39h7633fee_1 - - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python-crfsuite=0.9.9=py39h7633fee_2 + - python-dateutil=2.9.0=pyhd8ed1ab_0 + - python-edlib=1.3.9=py39h1f90b4d_6 + - python-igraph=0.10.2=py39h000617a_0 - python_abi=3.9=4_cp39 - - pytz=2023.3.post1=pyhd8ed1ab_0 + - pytz=2024.1=pyhd8ed1ab_0 + - pyvcf3=1.0.3=pyhdfd78af_0 - pyyaml=6.0.1=py39hd1e30aa_1 - - qt=5.12.9=ha98a1a1_5 + - qt-main=5.15.2=hdf1cb14_3 - r-ape=5.7_1=r41h358215d_0 - r-aplot=0.1.10=r41hc72bb7e_0 - r-base=4.1.2=h2553ce4_1 @@ -263,7 +389,7 @@ dependencies: - r-combinat=0.0_8=r41hc72bb7e_1004 - r-commonmark=1.9.0=r41h133d619_0 - r-cowplot=1.1.1=r41hc72bb7e_1 - - r-cpp11=0.4.3=r41hc72bb7e_0 + - r-cpp11=0.4.7=r41hc72bb7e_0 - r-crayon=1.5.2=r41hc72bb7e_1 - r-curl=4.3.3=r41h06615bd_1 - r-data.table=1.14.8=r41h133d619_0 @@ -346,39 +472,51 @@ dependencies: - r-xfun=0.39=r41ha503ecb_0 - r-xml2=1.3.3=r41h044e5c7_2 - r-yulab.utils=0.0.6=r41hc72bb7e_0 - - raxml=8.2.12=h031d066_6 - - raxml-ng=1.2.0=h6d1f11b_1 + - raxml=8.2.13=h031d066_1 + - raxml-ng=1.2.2=h6d1f11b_0 - readline=8.1=h46c0cb4_0 - - requests=2.31.0=pyhd8ed1ab_0 - - rich=13.5.3=pyhd8ed1ab_0 + - requests=2.32.3=pyhd8ed1ab_0 + - rich=13.7.1=pyhd8ed1ab_0 - ruby=3.1.0=h86e321c_1 - samtools=1.12=h9aed4be_1 - scikit-learn=1.2.2=py39hc236052_2 - - scipy=1.11.2=py39h474f0d3_1 + - scipy=1.13.1=py39haf93ffa_0 - sed=4.8=he412f7d_0 + - setuptools=58.2.0=py39hf3d152e_0 + - simplejson=3.19.2=py39hd1e30aa_0 + - sip=6.5.1=py39he80948d_2 - six=1.16.0=pyh6c4a22f_0 - sklearn-crfsuite=0.3.6=pyh9f0ad1d_0 - - smart-open=6.4.0=pyhd8ed1ab_0 - - smart_open=6.4.0=pyhd8ed1ab_0 + - smart-open=7.0.4=hd8ed1ab_0 + - smart_open=7.0.4=pyhd8ed1ab_0 - sqlite=3.37.0=h9cd32fc_0 - - statsmodels=0.14.0=py39h0f8d45d_1 - - sysroot_linux-64=2.12=he073ed8_16 + - statsmodels=0.14.2=py39hd92a3bb_0 + - suitesparse=5.10.1=h9e50725_1 + - sysroot_linux-64=2.12=he073ed8_17 + - t-coffee=12.00.7fb08c2=h26a2512_0 - tabulate=0.9.0=pyhd8ed1ab_1 - tar=1.34=hb2e2bae_1 - - taxonkit=0.15.0=h9ee0642_0 + - taxadb=0.12.1=pyh5e36f6f_0 + - taxonkit=0.16.0=h9ee0642_1 - tbb=2021.7.0=h924138e_1 - - threadpoolctl=3.2.0=pyha21a80b_0 - - tk=8.6.13=h2797004_0 - - tktable=2.10=h0c5db8f_4 + - tbl2asn-forever=25.7.2f=h031d066_4 + - tenacity=8.3.0=pyhd8ed1ab_0 + - texttable=1.7.0=pyhd8ed1ab_0 + - threadpoolctl=3.5.0=pyhc1e730c_0 + - tk=8.6.13=noxft_h4845f30_101 + - tktable=2.10=h0c5db8f_5 + - toml=0.10.2=pyhd8ed1ab_0 - tomli=2.0.1=pyhd8ed1ab_0 - - tqdm=4.66.1=pyhd8ed1ab_0 - - trimal=1.4.1=h4ac6f70_8 - - typing_extensions=4.8.0=pyha770c72_0 - - tzdata=2023c=h71feb2d_0 - - unzip=6.0=h7f98852_3 - - urllib3=2.0.5=pyhd8ed1ab_0 + - tqdm=4.66.4=pyhd8ed1ab_0 + - trimal=1.4.1=h4ac6f70_9 + - typing_extensions=4.12.1=pyha770c72_0 + - tzdata=2024a=h0c530f3_0 + - unicodedata2=15.1.0=py39hd1e30aa_0 + - urllib3=2.2.1=pyhd8ed1ab_0 + - viennarna=2.6.4=py39pl5321h4e691d4_1 - wget=1.20.3=ha56f1ee_1 - - wheel=0.41.2=pyhd8ed1ab_0 + - wheel=0.43.0=pyhd8ed1ab_1 + - wrapt=1.16.0=py39hd1e30aa_0 - xlsxwriter=3.0.3=pyhd8ed1ab_0 - xorg-kbproto=1.0.7=h7f98852_1002 - xorg-libice=1.0.10=h7f98852_0 @@ -395,12 +533,5 @@ dependencies: - xz=5.2.6=h166bdaf_0 - yaml=0.2.5=h7f98852_2 - zipp=3.17.0=pyhd8ed1ab_0 - - zlib=1.2.13=hd590300_5 - - zstd=1.5.5=hfc55251_0 - - pip: - - pip==23.2.1 - - pyqt5-sip==4.19.18 - - pyqtchart==5.12 - - pyqtwebengine==5.12.1 - - setuptools==68.2.2 - - sonicparanoid==2.0.2 + - zlib=1.2.13=h4ab18f5_6 + - zstd=1.5.6=ha6fb4c9_0 From b43da51697d7dfd3cc710bcceda8bc6b478905d8 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Tue, 4 Jun 2024 15:40:42 +0000 Subject: [PATCH 11/20] chore: add taxon message for antismash run --- workflow/rules/antismash.smk | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/rules/antismash.smk b/workflow/rules/antismash.smk index 2746391d..94a9725b 100644 --- a/workflow/rules/antismash.smk +++ b/workflow/rules/antismash.smk @@ -132,6 +132,7 @@ elif antismash_major_version >= 7: fi # Run AntiSMASH + echo "Running AntiSMASH {params.taxon} mode..." >> {log} antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \ --database {params.antismash_db_path} --taxon {params.taxon} \ --cb-general --cb-subclusters --cb-knownclusters -c {threads} $antismash_input --logfile {log} 2>> {log} From 66ce0475ae031f8cb900c170fbfbb5fc20ae3dc4 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Tue, 4 Jun 2024 18:12:36 +0000 Subject: [PATCH 12/20] fix: pin setuptools < 70.0.0 --- .github/workflows/build.yml | 2 +- workflow/envs/lsabgc.yaml | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0690b0d0..027259fa 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -45,7 +45,7 @@ jobs: - name: Set up Micromamba uses: mamba-org/setup-micromamba@v1 with: - micromamba-version: '1.5.0-1' + micromamba-version: '1.5.8-1' environment-file: ${{ matrix.environment }} init-shell: bash cache-environment: true diff --git a/workflow/envs/lsabgc.yaml b/workflow/envs/lsabgc.yaml index 6dd8d1bf..efe90db5 100644 --- a/workflow/envs/lsabgc.yaml +++ b/workflow/envs/lsabgc.yaml @@ -535,3 +535,6 @@ dependencies: - zipp=3.17.0=pyhd8ed1ab_0 - zlib=1.2.13=h4ab18f5_6 - zstd=1.5.6=ha6fb4c9_0 + - pip + - pip: + - setuptools==69.5.1 From 016d0701898ccdb7be5ab5dda7e5b9cf22f6485a Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Tue, 4 Jun 2024 18:31:07 +0000 Subject: [PATCH 13/20] test: correct action --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 027259fa..23e0084d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,7 +4,7 @@ on: push: branches: - main - pull_request_target: + pull_request: branches: - main From 38b7193c5a6b39a977489272b8492fa78d5c1803 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Thu, 13 Jun 2024 20:57:51 +0000 Subject: [PATCH 14/20] feat: add antismash parameters based on antismash database --- workflow/rules/antismash.smk | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/workflow/rules/antismash.smk b/workflow/rules/antismash.smk index 94a9725b..79dbf8c9 100644 --- a/workflow/rules/antismash.smk +++ b/workflow/rules/antismash.smk @@ -118,8 +118,7 @@ elif antismash_major_version >= 7: set +e # Find the latest existing JSON output for this strain - latest_version=$(ls -d data/interim/antismash/*/{wildcards.strains}/{wildcards.strains}.json | grep {wildcards.strains} | sort -r | head -n 1 | cut -d '/' -f 4) 2>> {log} - + latest_version=$(find data/interim/antismash/*/{wildcards.strains} -name "{wildcards.strains}.json" | sort -r | head -n 1 | cut -d '/' -f 4) 2>> {log} if [ -n "$latest_version" ]; then # Use existing JSON result as starting point old_json="data/interim/antismash/$latest_version/{wildcards.strains}/{wildcards.strains}.json" @@ -131,11 +130,26 @@ elif antismash_major_version >= 7: antismash_input="{input.gbk}" fi + # Store common parameters in a variable + antismash_params="--genefinding-tool {params.genefinding} \ + --output-dir {params.folder} \ + --database {params.antismash_db_path} \ + --taxon {params.taxon} \ + --cb-knownclusters \ + --cb-subclusters \ + --cc-mibig \ + --clusterhmmer \ + --tigrfam \ + --pfam2go \ + --rre \ + --asf \ + --tfbs \ + -c {threads} \ + --logfile {log}" + # Run AntiSMASH echo "Running AntiSMASH {params.taxon} mode..." >> {log} - antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \ - --database {params.antismash_db_path} --taxon {params.taxon} \ - --cb-general --cb-subclusters --cb-knownclusters -c {threads} $antismash_input --logfile {log} 2>> {log} + antismash $antismash_params $antismash_input 2>> {log} # Check if the run failed due to changed detection results or changed protocluster types if grep -q -e "ValueError: Detection results have changed. No results can be reused" \ @@ -143,9 +157,7 @@ elif antismash_major_version >= 7: then # Use genbank input instead echo "Previous JSON result is invalid, starting AntiSMASH from scratch..." >> {log} - antismash --genefinding-tool {params.genefinding} --output-dir {params.folder} \ - --database {params.antismash_db_path} --taxon {params.taxon} \ - --cb-general --cb-subclusters --cb-knownclusters -c {threads} {input.gbk} --logfile {log} 2>> {log} + antismash $antismash_params {input.gbk} 2>> {log} fi """ From cb90fa3ed46407c83a0170ea9f042978f8871dae Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Thu, 20 Jun 2024 12:56:51 +0200 Subject: [PATCH 15/20] fix: correct bgc downstream preparation and make sure all changes registered --- .../data/bgc_downstream_prep_selection.py | 275 ++++++++++-------- .../data/get_antismash_overview_gather.py | 14 +- workflow/rules/antismash.smk | 9 +- workflow/rules/bgc_analytics.smk | 2 +- 4 files changed, 169 insertions(+), 131 deletions(-) diff --git a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py index bff37fa5..44963ae9 100644 --- a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py +++ b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py @@ -3,6 +3,7 @@ import sys from pathlib import Path +from alive_progress import alive_bar from Bio import SeqIO log_format = "%(levelname)-8s %(asctime)s %(message)s" @@ -10,141 +11,173 @@ logging.basicConfig(format=log_format, datefmt=date_format, level=logging.DEBUG) -def generate_symlink(path, genome_id, output_dir, selected_bgcs=False): +def generate_symlink(selected_bgcs, genome_id, output_dir): """ - Given an antiSMASH directory, check for changed name + Given an antiSMASH directory, check for changed name and generate a symlink. + + Parameters: + selected_bgcs (str): Path to the selected BGCs. + genome_id (str): ID of the genome. + output_dir (str): Path to the output directory. + + Returns: + dict: A dictionary containing the change log. """ outpath = Path(output_dir) / genome_id outpath.mkdir(parents=True, exist_ok=True) - logging.debug(f"Deducting genome id as {genome_id}") - ctr = 0 change_log = None - matches = selected_bgcs.stem - for gbk in path.glob("*.gbk"): - if gbk.stem in matches: - logging.debug(f"Found match: {gbk.stem}") - filename = gbk.name - ctr = ctr + 1 - logging.info(f"Parsing file: {gbk.name}") - region = SeqIO.parse(str(gbk), "genbank") - for record in region: - logging.debug(f"Processing: {gbk.name}: {record.id}") - record_log = {} - if "structured_comment" in record.annotations: - try: - original_id = record.annotations["structured_comment"][ - "antiSMASH-Data" - ]["Original ID"].split()[0] - except KeyError: - original_id = record.id - logging.warning( - f"Found shortened record.id: {record.id} <- {original_id}." - ) - else: - raise ValueError(f"No Structured Comments in record: {gbk.name}") - - if (":" in str(record.description)) or (":" in original_id): - logging.warning( - f"Illegal character ':' found in genbank description, removing: {record.description}" - ) - # Remove colon from description - record.description = record.description.replace(":", "") - original_id = original_id.replace(":", "") - - # Rename antiSMASH comment - if "structured_comment" in record.annotations: - if ( - "Original ID" - in record.annotations["structured_comment"][ - "antiSMASH-Data" - ] - ): - record.annotations["structured_comment"]["antiSMASH-Data"][ - "Original ID" - ] = original_id - - # Write new GenBank file - new_filename = filename.replace(record.id, original_id) - with open(outpath / new_filename, "w") as output_handle: - SeqIO.write(record, output_handle, "genbank") - link = outpath / new_filename - else: - # generate symlink - new_filename = filename.replace(record.id, original_id) - target_path = Path.cwd() / gbk # target for symlink - - link = outpath / new_filename - - logging.info(f"Generating symlink: {link}") - try: - link.symlink_to(target_path) - except FileExistsError: - logging.warning( - f"Previous symlink exist, updating target: {link} -> {target_path}" - ) - link.unlink() - link.symlink_to(target_path) - - # Assert that the symlink was correctly generated - assert link.is_symlink(), f"Failed to create symlink: {link}" - assert ( - link.resolve() == target_path.resolve() - ), f"Symlink {link} does not point to the correct target: {target_path}" + gbk = Path(selected_bgcs) + filename = gbk.name + logging.info(f"{genome_id} - Parsing file: {gbk.name}") + region = SeqIO.parse(str(gbk), "genbank") + for record in region: + record_log = {} + if "structured_comment" in record.annotations: + try: + original_id = record.annotations["structured_comment"][ + "antiSMASH-Data" + ]["Original ID"].split()[0] + except KeyError: + original_id = record.id + logging.warning( + f" - Found shortened record.id: {record.id} <- {original_id}." + ) + else: + raise ValueError(f"No Structured Comments in record: {gbk.name}") + + if (":" in str(record.description)) or (":" in original_id): + logging.warning( + f" - Illegal character ':' found in genbank description, removing: {record.description}" + ) + # Remove colon from description + record.description = record.description.replace(":", "") + original_id = original_id.replace(":", "") + + # Rename antiSMASH comment + if "structured_comment" in record.annotations: + if ( + "Original ID" + in record.annotations["structured_comment"]["antiSMASH-Data"] + ): + record.annotations["structured_comment"]["antiSMASH-Data"][ + "Original ID" + ] = original_id + + # Write new GenBank file + new_filename = filename.replace(record.id, original_id) + with open(outpath / new_filename, "w") as output_handle: + SeqIO.write(record, output_handle, "genbank") + link = outpath / new_filename + else: + # generate symlink + new_filename = filename.replace(record.id, original_id) + target_path = Path.cwd() / gbk # target for symlink + + link = outpath / new_filename + + logging.info(f" - Generating symlink: {link}") + try: + link.symlink_to(target_path) + except FileExistsError: + logging.warning( + f" - Previous symlink exist, updating target: {link} -> {target_path}" + ) + link.unlink() + link.symlink_to(target_path) + + # Assert that the symlink was correctly generated + assert link.is_symlink(), f" - Failed to create symlink: {link}" + assert ( + link.resolve() == target_path.resolve() + ), f" - Symlink {link} does not point to the correct target: {target_path}" + + record_log["record_id"] = record.id + record_log["original_id"] = original_id + record_log["target_path"] = str(gbk) + record_log["symlink_path"] = str(link) + + change_log = {filename: record_log} + return change_log - record_log["record_id"] = record.id - record_log["original_id"] = original_id - record_log["target_path"] = str(gbk) - record_log["symlink_path"] = str(link) - change_log = {filename: record_log} - return change_log +def bgc_downstream_prep(input_file, output_dir, input_dir="."): + """ + Prepare the downstream BGCs. + Parameters: + input_file (str): Path to the input file. + output_dir (str): Path to the output directory. + input_dir (str, optional): Path to the input directory. Defaults to current directory. -def bgc_downstream_prep(input_file, output_dir): - logging.info(f"Reading input file: {input_file}") + Returns: + None + """ + original_input_dir = Path(input_dir) + logging.info(f"Reading input file: {input_file} from {original_input_dir}") with open(input_file, "r") as file: - file_paths = [Path(f) for f in file.read().splitlines()] + file_paths = [original_input_dir / f for f in file.read().splitlines()] change_log_containers = {} - for num, selected_bgcs in enumerate(file_paths): - input_dir = selected_bgcs.parent - logging.info(f"Reading input directory: {input_dir}") - path = Path(input_dir) - if not path.is_dir(): - raise FileNotFoundError(f"No such file or directory: {path}") - - # check if it has complete antiSMASH results - if (path / f"{path.name}.json").is_file(): - logging.info("Found full antiSMASH record") - genome_id = path.name - else: - logging.warning("No full antiSMASH record found, unknown genome id") - genome_id = "unknown_genome_id" - - assert selected_bgcs.exists(), f"File does not exist: {selected_bgcs}" - region_change_log = generate_symlink(path, genome_id, output_dir, selected_bgcs) - change_log_containers[num] = { - "genome_id": genome_id, - "value": region_change_log, - } + input_dirs = set([file.parent for file in file_paths]) + change_log_ctr = 0 + with alive_bar(len(input_dirs), title="Downstream prepping genomes:") as bar: + for num, input_dir in enumerate(input_dirs): + logging.info( + f"{num} - Processing {input_dir.name}: Reading input directory: {input_dir}" + ) + path = Path(input_dir) + if not path.is_dir(): + raise FileNotFoundError(f"No such file or directory: {path}") + # check if it has complete antiSMASH results + if (path / f"{path.name}.json").is_file(): + logging.info("Found full antiSMASH record") + genome_id = path.name + else: + logging.warning("No full antiSMASH record found, unknown genome id") + genome_id = "unknown_genome_id" + genbanks_list = [g for g in file_paths if genome_id in str(g)] + gbk_ctr = 0 + for selected_bgcs in genbanks_list: + if selected_bgcs in file_paths: + assert ( + selected_bgcs.exists() + ), f"File does not exist: {selected_bgcs}" + region_change_log = generate_symlink( + selected_bgcs, genome_id, output_dir + ) + change_log_containers[change_log_ctr] = { + "genome_id": genome_id, + "value": region_change_log, + } + gbk_ctr += 1 + change_log_ctr += 1 + logging.debug( + f"Finished creating {gbk_ctr}/{len(genbanks_list)} symlinks for {genome_id}\n" + ) + bar() + logging.info("Writing change logs...") change_logs = {} genome_ids = set(v["genome_id"] for v in change_log_containers.values()) - for genome_id in genome_ids: - change_log = {} - for v in change_log_containers.values(): - if v["genome_id"] == genome_id: - entry_name = list(v["value"].keys())[0] - change_log[entry_name] = v["value"][entry_name] - change_logs[genome_id] = change_log - logging.debug(f"Change log for {genome_id}: {change_log}") - - for genome_id in change_logs.keys(): - outpath = Path(output_dir) / genome_id - with open( - outpath / f"{genome_id}-change_log.json", "w", encoding="utf8" - ) as json_file: - json.dump({genome_id: change_logs[genome_id]}, json_file, indent=4) - logging.info(f"{genome_id}: Job done!\n") + with alive_bar(len(genome_ids), title="Merging changelogs:") as bar: + for genome_id in genome_ids: + change_log = {} + for v in change_log_containers.values(): + if v["genome_id"] == genome_id: + entry_name = list(v["value"].keys())[0] + change_log[entry_name] = v["value"][entry_name] + change_logs[genome_id] = change_log + logging.debug(f"Change log for {genome_id}: {len(change_log)}") + bar() + + with alive_bar(len(change_logs.keys()), title="Writing changelogs:") as bar: + for genome_id in change_logs.keys(): + outpath = Path(output_dir) / genome_id + with open( + outpath / f"{genome_id}-change_log.json", "w", encoding="utf8" + ) as json_file: + json.dump({genome_id: change_logs[genome_id]}, json_file, indent=4) + logging.info(f"{genome_id}: Job done!\n") + bar() if __name__ == "__main__": diff --git a/workflow/bgcflow/bgcflow/data/get_antismash_overview_gather.py b/workflow/bgcflow/bgcflow/data/get_antismash_overview_gather.py index f9719c9c..a8a8e51b 100644 --- a/workflow/bgcflow/bgcflow/data/get_antismash_overview_gather.py +++ b/workflow/bgcflow/bgcflow/data/get_antismash_overview_gather.py @@ -30,7 +30,9 @@ def correct_bgc_id_overview(overview_file, mapping_file, genome_id=False): new_dict : dict Corrected BGC overview dictionary with updated BGC IDs """ - logging.info(f"Correcting shortened bgc ids for {genome_id}...") + logging.info( + f"Correcting shortened bgc ids for {genome_id} using mapping from {mapping_file}..." + ) overview_path = Path(overview_file) mapping_path = Path(mapping_file) @@ -86,7 +88,7 @@ def gather_bgc_overview(input_json, mapping_dir, table): None """ input_json = Path(input_json) - logging.info(input_json) + if input_json.is_file() and input_json.suffix == ".json": logging.info(f"Getting BGC overview from a single file: {input_json}") input_json_files = input_json @@ -104,6 +106,7 @@ def gather_bgc_overview(input_json, mapping_dir, table): input_json_files = [ Path(path) for path in paths if Path(path).suffix == ".json" ] + logging.info(f"Found entries of {len(input_json_files)} region files...") else: input_json_files = [ Path(file) @@ -120,6 +123,9 @@ def gather_bgc_overview(input_json, mapping_dir, table): genome_id = mapping_file.name.replace("_bgc_overview.json", "") mapping_path = Path(mapping_dir) / f"{genome_id}/{genome_id}-change_log.json" corrected = correct_bgc_id_overview(mapping_file, mapping_path, genome_id) + logging.debug( + f"Adding {len(corrected)} entries from {genome_id} to the merged table..." + ) merged_dict.update(corrected) df = pd.DataFrame.from_dict(merged_dict).T @@ -130,13 +136,13 @@ def gather_bgc_overview(input_json, mapping_dir, table): lambda x: 1 if x is not None and x > 1 else x ) - logging.debug(f"Writing file to: {table}") + logging.debug(f"Writing file containing {len(df)} entries to: {table}") # Save dataframes to csv tables df_table = Path(table) df_table.parent.mkdir(parents=True, exist_ok=True) df.to_csv(table) - logging.info("Job done") + return None diff --git a/workflow/rules/antismash.smk b/workflow/rules/antismash.smk index 79dbf8c9..93e62990 100644 --- a/workflow/rules/antismash.smk +++ b/workflow/rules/antismash.smk @@ -118,12 +118,11 @@ elif antismash_major_version >= 7: set +e # Find the latest existing JSON output for this strain - latest_version=$(find data/interim/antismash/*/{wildcards.strains} -name "{wildcards.strains}.json" | sort -r | head -n 1 | cut -d '/' -f 4) 2>> {log} - if [ -n "$latest_version" ]; then + latest_json=$(find data/interim/antismash/*/* -name "{wildcards.strains}.json" | sort -V | tail -n 1) 2>> {log} + if [ -n "$latest_json" ]; then # Use existing JSON result as starting point - old_json="data/interim/antismash/$latest_version/{wildcards.strains}/{wildcards.strains}.json" - echo "Using existing JSON from $old_json as starting point..." >> {log} - antismash_input="--reuse-result $old_json" + echo "Using existing JSON from $latest_json as starting point..." >> {log} + antismash_input="--reuse-result $latest_json" else # No existing JSON result found, use genbank input echo "No existing JSON result found, starting AntiSMASH from scratch..." >> {log} diff --git a/workflow/rules/bgc_analytics.smk b/workflow/rules/bgc_analytics.smk index af3f4651..a80d311f 100644 --- a/workflow/rules/bgc_analytics.smk +++ b/workflow/rules/bgc_analytics.smk @@ -42,7 +42,7 @@ rule antismash_overview_gather: "logs/bgc_analytics/antismash_overview_gather-{version}-{name}.log", shell: """ - TMPDIR="data/interim/tmp/{wildcards.name}/{wildcards.version}" + TMPDIR="data/interim/bgcs/{wildcards.name}/tmp/{wildcards.version}" mkdir -p $TMPDIR INPUT_JSON="$TMPDIR/df_regions_antismash.txt" echo '{input.bgc_overview}' > $INPUT_JSON From 241b8ca6251f2c5be09ac0e3442d11353c0a3fe1 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Thu, 20 Jun 2024 14:30:11 +0200 Subject: [PATCH 16/20] test: fix micromamba version --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 23e0084d..543b41c8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -45,7 +45,7 @@ jobs: - name: Set up Micromamba uses: mamba-org/setup-micromamba@v1 with: - micromamba-version: '1.5.8-1' + micromamba-version: '1.5.8-0' environment-file: ${{ matrix.environment }} init-shell: bash cache-environment: true From 591f1b4431efcb697dd9c21990f3b40eaef55761 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Thu, 20 Jun 2024 16:09:29 +0000 Subject: [PATCH 17/20] test: use snakemake version from wrapper --- .github/workflows/push.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 4e26ffdb..b1dce9c3 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -148,7 +148,6 @@ jobs: - run: pip install bgcflow_wrapper - run: pip install pytest-cov - run: pip install alive-progress - - run: pip install snakemake==8.5.2 - name: Test coverage run: pytest --cov=.tests/unit .tests/unit/ - name: Build coverage file From 8de0983d5aac9f89d1b1af1455f4f5f53d20ca4b Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Thu, 20 Jun 2024 17:48:22 +0000 Subject: [PATCH 18/20] fix: pin numpyt to version 1.26.4 for checkm --- workflow/envs/checkm.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflow/envs/checkm.yaml b/workflow/envs/checkm.yaml index 520530c6..6b24663c 100644 --- a/workflow/envs/checkm.yaml +++ b/workflow/envs/checkm.yaml @@ -4,6 +4,8 @@ channels: - bioconda - defaults dependencies: + - python==3.11 + - numpy==1.26.4 - checkm-genome==1.2.2 - wget - tar From 3010397459e661c086a1de9cac57695b9ef9925c Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Thu, 20 Jun 2024 17:49:49 +0000 Subject: [PATCH 19/20] test: update expected ncbi metadata --- .../Lactobacillus_delbrueckii/tables/df_ncbi_meta.csv | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.tests/unit/extract_ncbi_information/expected/data/processed/Lactobacillus_delbrueckii/tables/df_ncbi_meta.csv b/.tests/unit/extract_ncbi_information/expected/data/processed/Lactobacillus_delbrueckii/tables/df_ncbi_meta.csv index 5130be45..aac6d46d 100644 --- a/.tests/unit/extract_ncbi_information/expected/data/processed/Lactobacillus_delbrueckii/tables/df_ncbi_meta.csv +++ b/.tests/unit/extract_ncbi_information/expected/data/processed/Lactobacillus_delbrueckii/tables/df_ncbi_meta.csv @@ -1,5 +1,5 @@ -genome_id,BioProject,assembly,assembly_level,assembly_type,biosample,date,genbank,genome_representation,genus,organism,refseq,refseq_category,refseq_genbank_identity,release_type,species,strain,submitter,tax_id -GCA_000056065.1,PRJNA16871,ASM5606v1,Complete Genome,na,SAMEA3138258,2006-05-26,GCA_000056065.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002 (firmicutes),GCF_000056065.1,,yes,major,delbrueckii,ATCC 11842,Genoscope,390333 -GCA_000182835.1,PRJNA49147,ASM18283v1,Complete Genome,na,SAMN02603937,2010-11-19,GCA_000182835.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus ND02 (firmicutes),GCF_000182835.1,,yes,major,delbrueckii,ND02,"The Key Laboratory of Dairy Biotechnology and Bioengineering, Education Ministry of P. R. China, Department of Food Science and Engineering, Inner Mongolia Agricultural University, China",767455 -GCA_000191165.1,PRJNA16120,ASM19116v1,Complete Genome,na,SAMN02603124,2011-03-03,GCA_000191165.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus 2038 (firmicutes),GCF_000191165.1,,yes,major,delbrueckii,2038,Chinese National HGC,353496 -GCA_000014405.1,PRJNA403,ASM1440v1,Complete Genome,na,SAMN02598530,2006-10-13,GCA_000014405.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus ATCC BAA-365 (firmicutes),GCF_000014405.1,,yes,major,delbrueckii,ATCC BAA-365,"US DOE Joint Genome Institute (JGI), The Lactic Acid Bacteria Genome Consortium and Fidelity Systems Inc.",321956 +genome_id,assembly,organism,genus,species,strain,tax_id,refseq_category,refseq,genbank,assembly_type,release_type,assembly_level,genome_representation,refseq_genbank_identity,biosample,submitter,date,BioProject +GCA_000056065.1,ASM5606v1,Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002 (firmicutes),Lactobacillus,delbrueckii,ATCC 11842,390333,,GCF_000056065.1,GCA_000056065.1,na,major,Complete Genome,full,yes,SAMEA3138258,Genoscope,2006-05-26,PRJNA16871 +GCA_000182835.1,ASM18283v1,Lactobacillus delbrueckii subsp. bulgaricus ND02 (firmicutes),Lactobacillus,delbrueckii,ND02,767455,,GCF_000182835.1,GCA_000182835.1,na,major,Complete Genome,full,yes,SAMN02603937,"The Key Laboratory of Dairy Biotechnology and Bioengineering, Education Ministry of P. R. China, Department of Food Science and Engineering, Inner Mongolia Agricultural University, China",2010-11-19,PRJNA49147 +GCA_000191165.1,ASM19116v1,Lactobacillus delbrueckii subsp. bulgaricus 2038 (firmicutes),Lactobacillus,delbrueckii,2038,353496,,GCF_000191165.1,GCA_000191165.1,na,major,Complete Genome,full,yes,SAMN02603124,Chinese National HGC,2011-03-03,PRJNA16120 +GCA_000014405.1,ASM1440v1,Lactobacillus delbrueckii subsp. bulgaricus ATCC BAA-365 (firmicutes),Lactobacillus,delbrueckii,ATCC BAA-365,321956,,GCF_000014405.1,GCA_000014405.1,na,major,Complete Genome,full,yes,SAMN02598530,"US DOE Joint Genome Institute (JGI), The Lactic Acid Bacteria Genome Consortium and Fidelity Systems Inc.",2006-10-13,PRJNA403 From db70072925f36ad794f39df7b147a9e7e60129d8 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Thu, 20 Jun 2024 18:23:07 +0000 Subject: [PATCH 20/20] test: drop build test for antiSMASH 6 and lsabgc --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 543b41c8..3d7cff49 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: matrix: environment: - workflow/envs/antismash.yaml - - workflow/envs/antismash_v6.yaml + #- workflow/envs/antismash_v6.yaml - workflow/envs/arts.yaml - workflow/envs/automlst_wrapper.yaml - workflow/envs/bgc_analytics.yaml @@ -36,7 +36,7 @@ jobs: - workflow/envs/roary.yaml - workflow/envs/seqfu.yaml - workflow/envs/utilities.yaml - - workflow/envs/lsabgc.yaml + #- workflow/envs/lsabgc.yaml steps: - name: Checkout repository and submodules uses: actions/checkout@v4