Preparation to release v1.0.0 #348
Merged · 20 commits · Jun 20, 2024

Commits (20)
fc034e5 · docs: update citation (matinnuhamunada, May 6, 2024)
5deef98 · fix: upgrade bgc genome preparation script to handle weird input (matinnuhamunada, May 8, 2024)
d7998aa · fix: update get_antismash_inputs function to retrieve region gbks (matinnuhamunada, May 8, 2024)
6355679 · fix: change input requirement for bgc downstream preparation (matinnuhamunada, May 17, 2024)
8629004 · chore: correct typos (matinnuhamunada, May 29, 2024)
ac20d03 · fix: reinclude full antiSMASH gbks for downstream process (matinnuhamunada, May 30, 2024)
0b65b42 · feat: use database schema 0.3.1 (matinnuhamunada, May 30, 2024)
0ac7242 · feat: enable parameter to change taxon in antismash (matinnuhamunada, Jun 3, 2024)
340c947 · chore: include .gbff as recognized format (matinnuhamunada, Jun 3, 2024)
db9d42c · fix: update lsabgc environment (matinnuhamunada, Jun 4, 2024)
b43da51 · chore: add taxon message for antismash run (matinnuhamunada, Jun 4, 2024)
66ce047 · fix: pin setuptools < 70.0.0 (matinnuhamunada, Jun 4, 2024)
016d070 · test: correct action (matinnuhamunada, Jun 4, 2024)
38b7193 · feat: add antismash parameters based on antismash database (matinnuhamunada, Jun 13, 2024)
cb90fa3 · fix: correct bgc downstream preparation and make sure all changes reg… (matinnuhamunada, Jun 20, 2024)
241b8ca · test: fix micromamba version (matinnuhamunada, Jun 20, 2024)
591f1b4 · test: use snakemake version from wrapper (matinnuhamunada, Jun 20, 2024)
8de0983 · fix: pin numpy to version 1.26.4 for checkm (matinnuhamunada, Jun 20, 2024)
3010397 · test: update expected ncbi metadata (matinnuhamunada, Jun 20, 2024)
db70072 · test: drop build test for antiSMASH 6 and lsabgc (matinnuhamunada, Jun 20, 2024)
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
@@ -4,7 +4,7 @@ on:
push:
branches:
- main
pull_request_target:
pull_request:
branches:
- main

@@ -15,7 +15,7 @@ jobs:
matrix:
environment:
- workflow/envs/antismash.yaml
- workflow/envs/antismash_v6.yaml
#- workflow/envs/antismash_v6.yaml
- workflow/envs/arts.yaml
- workflow/envs/automlst_wrapper.yaml
- workflow/envs/bgc_analytics.yaml
@@ -36,7 +36,7 @@ jobs:
- workflow/envs/roary.yaml
- workflow/envs/seqfu.yaml
- workflow/envs/utilities.yaml
- workflow/envs/lsabgc.yaml
#- workflow/envs/lsabgc.yaml
steps:
- name: Checkout repository and submodules
uses: actions/checkout@v4
@@ -45,7 +45,7 @@
- name: Set up Micromamba
uses: mamba-org/setup-micromamba@v1
with:
micromamba-version: '1.5.0-1'
micromamba-version: '1.5.8-0'
environment-file: ${{ matrix.environment }}
init-shell: bash
cache-environment: true
1 change: 0 additions & 1 deletion .github/workflows/push.yml
@@ -148,7 +148,6 @@ jobs:
- run: pip install bgcflow_wrapper
- run: pip install pytest-cov
- run: pip install alive-progress
- run: pip install snakemake==8.5.2
- name: Test coverage
run: pytest --cov=.tests/unit .tests/unit/
- name: Build coverage file
@@ -1,5 +1,5 @@
genome_id,BioProject,assembly,assembly_level,assembly_type,biosample,date,genbank,genome_representation,genus,organism,refseq,refseq_category,refseq_genbank_identity,release_type,species,strain,submitter,tax_id
GCA_000056065.1,PRJNA16871,ASM5606v1,Complete Genome,na,SAMEA3138258,2006-05-26,GCA_000056065.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002 (firmicutes),GCF_000056065.1,,yes,major,delbrueckii,ATCC 11842,Genoscope,390333
GCA_000182835.1,PRJNA49147,ASM18283v1,Complete Genome,na,SAMN02603937,2010-11-19,GCA_000182835.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus ND02 (firmicutes),GCF_000182835.1,,yes,major,delbrueckii,ND02,"The Key Laboratory of Dairy Biotechnology and Bioengineering, Education Ministry of P. R. China, Department of Food Science and Engineering, Inner Mongolia Agricultural University, China",767455
GCA_000191165.1,PRJNA16120,ASM19116v1,Complete Genome,na,SAMN02603124,2011-03-03,GCA_000191165.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus 2038 (firmicutes),GCF_000191165.1,,yes,major,delbrueckii,2038,Chinese National HGC,353496
GCA_000014405.1,PRJNA403,ASM1440v1,Complete Genome,na,SAMN02598530,2006-10-13,GCA_000014405.1,full,Lactobacillus,Lactobacillus delbrueckii subsp. bulgaricus ATCC BAA-365 (firmicutes),GCF_000014405.1,,yes,major,delbrueckii,ATCC BAA-365,"US DOE Joint Genome Institute (JGI), The Lactic Acid Bacteria Genome Consortium and Fidelity Systems Inc.",321956
genome_id,assembly,organism,genus,species,strain,tax_id,refseq_category,refseq,genbank,assembly_type,release_type,assembly_level,genome_representation,refseq_genbank_identity,biosample,submitter,date,BioProject
GCA_000056065.1,ASM5606v1,Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002 (firmicutes),Lactobacillus,delbrueckii,ATCC 11842,390333,,GCF_000056065.1,GCA_000056065.1,na,major,Complete Genome,full,yes,SAMEA3138258,Genoscope,2006-05-26,PRJNA16871
GCA_000182835.1,ASM18283v1,Lactobacillus delbrueckii subsp. bulgaricus ND02 (firmicutes),Lactobacillus,delbrueckii,ND02,767455,,GCF_000182835.1,GCA_000182835.1,na,major,Complete Genome,full,yes,SAMN02603937,"The Key Laboratory of Dairy Biotechnology and Bioengineering, Education Ministry of P. R. China, Department of Food Science and Engineering, Inner Mongolia Agricultural University, China",2010-11-19,PRJNA49147
GCA_000191165.1,ASM19116v1,Lactobacillus delbrueckii subsp. bulgaricus 2038 (firmicutes),Lactobacillus,delbrueckii,2038,353496,,GCF_000191165.1,GCA_000191165.1,na,major,Complete Genome,full,yes,SAMN02603124,Chinese National HGC,2011-03-03,PRJNA16120
GCA_000014405.1,ASM1440v1,Lactobacillus delbrueckii subsp. bulgaricus ATCC BAA-365 (firmicutes),Lactobacillus,delbrueckii,ATCC BAA-365,321956,,GCF_000014405.1,GCA_000014405.1,na,major,Complete Genome,full,yes,SAMN02598530,"US DOE Joint Genome Institute (JGI), The Lactic Acid Bacteria Genome Consortium and Fidelity Systems Inc.",2006-10-13,PRJNA403
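For context, the block above updates the expected NCBI metadata fixture used by the unit tests (run from `.tests/unit/` in the push workflow above): the columns are reordered, not changed in content. A minimal sketch of how such a fixture comparison might look, assuming pandas; the file paths are illustrative, not from this PR:

```python
import pandas as pd

# Hypothetical paths; the real fixture lives under .tests/unit/ in the repo.
expected = pd.read_csv("expected/df_ncbi_meta.csv")
observed = pd.read_csv("output/df_ncbi_meta.csv")

# Since this PR reorders columns, compare on aligned columns rather than raw bytes.
assert set(observed.columns) == set(expected.columns)
pd.testing.assert_frame_equal(
    observed[expected.columns].sort_values("genome_id").reset_index(drop=True),
    expected.sort_values("genome_id").reset_index(drop=True),
)
```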
4 changes: 2 additions & 2 deletions README.md
@@ -13,7 +13,7 @@
At present, `BGCFlow` is only tested and confirmed to work on **Linux** systems with the `conda`/`mamba` package manager.

## Publication
> Matin Nuhamunada, Omkar S. Mohite, Patrick V. Phaneuf, Bernhard O. Palsson, and Tilmann Weber. (2023). BGCFlow: Systematic pangenome workflow for the analysis of biosynthetic gene clusters across large genomic datasets. bioRxiv 2023.06.14.545018; doi: [https://doi.org/10.1101/2023.06.14.545018](https://doi.org/10.1101/2023.06.14.545018)
> Matin Nuhamunada, Omkar S Mohite, Patrick V Phaneuf, Bernhard O Palsson, Tilmann Weber, BGCFlow: systematic pangenome workflow for the analysis of biosynthetic gene clusters across large genomic datasets, Nucleic Acids Research, 2024, gkae314, [https://doi.org/10.1093/nar/gkae314](https://doi.org/10.1093/nar/gkae314)

## Pre-requisites
`BGCFlow` requires `gcc` and the `conda`/`mamba` package manager. See the [installation instructions](https://github.com/NBChub/bgcflow/wiki/00-Installation-Guide) for details.
@@ -27,7 +27,7 @@ A quick and easy way to use `BGCFlow` using the command line interface wrapper:

```bash
# create and activate a new conda environment
conda create -n bgcflow -c conda-forge python=3.11 pip openjdk -y # also install java for metabase
mamba create -n bgcflow -c conda-forge python=3.11 pip openjdk -y # also install java for metabase
conda activate bgcflow

# install `BGCFlow` wrapper
277 changes: 156 additions & 121 deletions workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
@@ -3,146 +3,181 @@
import sys
from pathlib import Path

from alive_progress import alive_bar
from Bio import SeqIO

log_format = "%(levelname)-8s %(asctime)s %(message)s"
date_format = "%d/%m %H:%M:%S"
logging.basicConfig(format=log_format, datefmt=date_format, level=logging.DEBUG)


def generate_symlink(path, genome_id, output_dir, selected_bgcs=False):
def generate_symlink(selected_bgcs, genome_id, output_dir):
"""
Given an antiSMASH directory, check for changed name
Given an antiSMASH directory, check for changed name and generate a symlink.

Parameters:
selected_bgcs (str): Path to the selected BGCs.
genome_id (str): ID of the genome.
output_dir (str): Path to the output directory.

Returns:
dict: A dictionary containing the change log.
"""
outpath = Path(output_dir) / genome_id
outpath.mkdir(parents=True, exist_ok=True)
logging.debug(f"Deducting genome id as {genome_id}")
ctr = 0
matches = selected_bgcs.stem
for gbk in path.glob("*.gbk"):
if gbk.stem in matches:
logging.debug(f"Found match: {gbk.stem}")
filename = gbk.name
ctr = ctr + 1
logging.info(f"Parsing file: {gbk.name}")
region = SeqIO.parse(str(gbk), "genbank")
for record in region:
logging.debug(f"Processing: {gbk.name}: {record.id}")
record_log = {}
if "structured_comment" in record.annotations:
try:
original_id = record.annotations["structured_comment"][
"antiSMASH-Data"
]["Original ID"].split()[0]
except KeyError:
original_id = record.id
logging.warning(
f"Found shortened record.id: {record.id} <- {original_id}."
)
else:
raise ValueError(f"No Structured Comments in record: {gbk.name}")

if (":" in str(record.description)) or (":" in original_id):
logging.warning(
f"Illegal character ':' found in genbank description, removing: {record.description}"
)
# Remove colon from description
record.description = record.description.replace(":", "")
original_id = original_id.replace(":", "")

# Rename antiSMASH comment
if "structured_comment" in record.annotations:
if (
"Original ID"
in record.annotations["structured_comment"][
"antiSMASH-Data"
]
):
record.annotations["structured_comment"]["antiSMASH-Data"][
"Original ID"
] = original_id

# Write new GenBank file
new_filename = filename.replace(record.id, original_id)
with open(outpath / new_filename, "w") as output_handle:
SeqIO.write(record, output_handle, "genbank")
link = outpath / new_filename
else:
# generate symlink
new_filename = filename.replace(record.id, original_id)
target_path = Path.cwd() / gbk # target for symlink

link = outpath / new_filename

logging.info(f"Generating symlink: {link}")
try:
link.symlink_to(target_path)
except FileExistsError:
logging.warning(
f"Previous symlink exist, updating target: {link} -> {target_path}"
)
link.unlink()
link.symlink_to(target_path)

# Assert that the symlink was correctly generated
assert link.is_symlink(), f"Failed to create symlink: {link}"
assert (
link.resolve() == target_path.resolve()
), f"Symlink {link} does not point to the correct target: {target_path}"
change_log = None
gbk = Path(selected_bgcs)
filename = gbk.name
logging.info(f"{genome_id} - Parsing file: {gbk.name}")
region = SeqIO.parse(str(gbk), "genbank")
for record in region:
record_log = {}
if "structured_comment" in record.annotations:
try:
original_id = record.annotations["structured_comment"][
"antiSMASH-Data"
]["Original ID"].split()[0]
except KeyError:
original_id = record.id
logging.warning(
f" - Found shortened record.id: {record.id} <- {original_id}."
)
else:
raise ValueError(f"No Structured Comments in record: {gbk.name}")

if (":" in str(record.description)) or (":" in original_id):
logging.warning(
f" - Illegal character ':' found in genbank description, removing: {record.description}"
)
# Remove colon from description
record.description = record.description.replace(":", "")
original_id = original_id.replace(":", "")

# Rename antiSMASH comment
if "structured_comment" in record.annotations:
if (
"Original ID"
in record.annotations["structured_comment"]["antiSMASH-Data"]
):
record.annotations["structured_comment"]["antiSMASH-Data"][
"Original ID"
] = original_id

# Write new GenBank file
new_filename = filename.replace(record.id, original_id)
with open(outpath / new_filename, "w") as output_handle:
SeqIO.write(record, output_handle, "genbank")
link = outpath / new_filename
else:
# generate symlink
new_filename = filename.replace(record.id, original_id)
target_path = Path.cwd() / gbk # target for symlink

link = outpath / new_filename

logging.info(f" - Generating symlink: {link}")
try:
link.symlink_to(target_path)
except FileExistsError:
logging.warning(
f" - Previous symlink exist, updating target: {link} -> {target_path}"
)
link.unlink()
link.symlink_to(target_path)

# Assert that the symlink was correctly generated
assert link.is_symlink(), f" - Failed to create symlink: {link}"
assert (
link.resolve() == target_path.resolve()
), f" - Symlink {link} does not point to the correct target: {target_path}"

record_log["record_id"] = record.id
record_log["original_id"] = original_id
record_log["target_path"] = str(gbk)
record_log["symlink_path"] = str(link)

change_log = {filename: record_log}
return change_log

record_log["record_id"] = record.id
record_log["original_id"] = original_id
record_log["target_path"] = str(gbk)
record_log["symlink_path"] = str(link)

change_log = {filename: record_log}
return change_log
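To make the refactored interface concrete, here is a minimal usage sketch of the new `generate_symlink` signature. The import path, file names, and IDs are illustrative assumptions, not code from this PR:

```python
# Hypothetical import; adjust to how the module is exposed in your checkout.
from bgcflow.data.bgc_downstream_prep_selection import generate_symlink

# One antiSMASH region GenBank plus the genome it belongs to (illustrative paths).
selected_bgcs = "antismash/GCA_000056065.1/NC_008054.1.region001.gbk"
genome_id = "GCA_000056065.1"
output_dir = "data/interim/bgcs"

change_log = generate_symlink(selected_bgcs, genome_id, output_dir)
# The returned change log is keyed by the region filename, e.g.:
# {"NC_008054.1.region001.gbk": {"record_id": "...", "original_id": "...",
#                                "target_path": "...", "symlink_path": "..."}}
```

Note that unlike the old version, which globbed `*.gbk` files in an antiSMASH directory and matched them against `selected_bgcs.stem`, the new function takes the region file directly and processes exactly one path per call.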
def bgc_downstream_prep(input_file, output_dir):
logging.info(f"Reading input file: {input_file}")
def bgc_downstream_prep(input_file, output_dir, input_dir="."):
"""
Prepare the downstream BGCs.

Parameters:
input_file (str): Path to the input file.
output_dir (str): Path to the output directory.
input_dir (str, optional): Path to the input directory. Defaults to current directory.

Returns:
None
"""
original_input_dir = Path(input_dir)
logging.info(f"Reading input file: {input_file} from {original_input_dir}")
with open(input_file, "r") as file:
file_paths = [Path(f) for f in file.read().splitlines()]
file_paths = [original_input_dir / f for f in file.read().splitlines()]
change_log_containers = {}
for num, selected_bgcs in enumerate(file_paths):
input_dir = selected_bgcs.parent
logging.info(f"Reading input directory: {input_dir}")
path = Path(input_dir)
if not path.is_dir():
raise FileNotFoundError(f"No such file or directory: {path}")

# check if it has complete antiSMASH results
if (path / f"{path.name}.json").is_file():
logging.info("Found full antiSMASH record")
genome_id = path.name
else:
logging.warning("No full antiSMASH record found, unknown genome id")
genome_id = "unknown_genome_id"

assert selected_bgcs.exists(), f"File does not exist: {selected_bgcs}"
region_change_log = generate_symlink(path, genome_id, output_dir, selected_bgcs)
change_log_containers[num] = {
"genome_id": genome_id,
"value": region_change_log,
}
input_dirs = set([file.parent for file in file_paths])
change_log_ctr = 0
with alive_bar(len(input_dirs), title="Downstream prepping genomes:") as bar:
for num, input_dir in enumerate(input_dirs):
logging.info(
f"{num} - Processing {input_dir.name}: Reading input directory: {input_dir}"
)
path = Path(input_dir)
if not path.is_dir():
raise FileNotFoundError(f"No such file or directory: {path}")
# check if it has complete antiSMASH results
if (path / f"{path.name}.json").is_file():
logging.info("Found full antiSMASH record")
genome_id = path.name
else:
logging.warning("No full antiSMASH record found, unknown genome id")
genome_id = "unknown_genome_id"
genbanks_list = [g for g in file_paths if genome_id in str(g)]
gbk_ctr = 0
for selected_bgcs in genbanks_list:
if selected_bgcs in file_paths:
assert (
selected_bgcs.exists()
), f"File does not exist: {selected_bgcs}"
region_change_log = generate_symlink(
selected_bgcs, genome_id, output_dir
)
change_log_containers[change_log_ctr] = {
"genome_id": genome_id,
"value": region_change_log,
}
gbk_ctr += 1
change_log_ctr += 1
logging.debug(
f"Finished creating {gbk_ctr}/{len(genbanks_list)} symlinks for {genome_id}\n"
)
bar()

logging.info("Writing change logs...")
change_logs = {}
genome_ids = set(v["genome_id"] for v in change_log_containers.values())
for genome_id in genome_ids:
change_log = {}
for v in change_log_containers.values():
if v["genome_id"] == genome_id:
entry_name = list(v["value"].keys())[0]
change_log[entry_name] = v["value"][entry_name]
change_logs[genome_id] = change_log
logging.debug(change_logs)

for genome_id in change_logs.keys():
outpath = Path(output_dir) / genome_id
with open(
outpath / f"{genome_id}-change_log.json", "w", encoding="utf8"
) as json_file:
json.dump({genome_id: change_logs[genome_id]}, json_file, indent=4)
logging.info(f"{genome_id}: Job done!\n")
with alive_bar(len(genome_ids), title="Merging changelogs:") as bar:
for genome_id in genome_ids:
change_log = {}
for v in change_log_containers.values():
if v["genome_id"] == genome_id:
entry_name = list(v["value"].keys())[0]
change_log[entry_name] = v["value"][entry_name]
change_logs[genome_id] = change_log
logging.debug(f"Change log for {genome_id}: {len(change_log)}")
bar()

with alive_bar(len(change_logs.keys()), title="Writing changelogs:") as bar:
for genome_id in change_logs.keys():
outpath = Path(output_dir) / genome_id
with open(
outpath / f"{genome_id}-change_log.json", "w", encoding="utf8"
) as json_file:
json.dump({genome_id: change_logs[genome_id]}, json_file, indent=4)
logging.info(f"{genome_id}: Job done!\n")
bar()


if __name__ == "__main__":
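The body of the `__main__` block is collapsed in this diff view. As a rough sketch, it presumably wires `sys.argv` to `bgc_downstream_prep`; the argument order below is an assumption, not the PR's actual code:

```python
if __name__ == "__main__":
    # Hypothetical CLI wiring; the real block is collapsed in the diff above.
    # Usage: python bgc_downstream_prep_selection.py <input_file> <output_dir> [input_dir]
    input_file = sys.argv[1]  # text file listing region .gbk paths, one per line
    output_dir = sys.argv[2]  # destination for symlinks and per-genome change logs
    input_dir = sys.argv[3] if len(sys.argv) > 3 else "."
    bgc_downstream_prep(input_file, output_dir, input_dir)
```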