diff --git a/.gitignore b/.gitignore index fb57152..d49c332 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ __pycache__/ strainy/.DS_Store .DS_Store +build/ +dist/ +strainy.egg-info/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..194ec27 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,67 @@ +FROM ubuntu:22.04 +MAINTAINER Mikhail Kolmogorov, mikolmogorov@gmail.com + +# update and install dependencies +RUN apt-get update && \ + DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata && \ + apt-get -y install cmake git make gcc g++ autoconf bzip2 lzma-dev zlib1g-dev tabix libbz2-dev && \ + apt-get -y install libcurl4-openssl-dev libpthread-stubs0-dev liblzma-dev libhdf5-dev && \ + apt-get -y install python3-pip python3-virtualenv virtualenv python3-dev && \ + apt-get -y install wget libz-dev libncurses5-dev libgsl-dev && \ + apt-get -y install graphviz libgraphviz-dev pkg-config && \ + apt-get clean && \ + apt-get purge && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +RUN ln -s /usr/bin/python3 /usr/bin/python + +RUN python3 --version && \ + python3 -m pip install --upgrade pip + +ARG MM_VER=2.28 +RUN wget https://github.com/lh3/minimap2/releases/download/v$MM_VER/minimap2-$MM_VER.tar.bz2 && \ + tar xvf minimap2-$MM_VER.tar.bz2 && \ + rm minimap2-$MM_VER.tar.bz2 && \ + cd minimap2-$MM_VER && \ + make && \ + cp minimap2 /usr/bin/ + +### samtools +# 1.15 +WORKDIR /opt/samtools +RUN wget https://github.com/samtools/samtools/releases/download/1.15/samtools-1.15.tar.bz2 && \ + tar xvf samtools-1.15.tar.bz2 && \ + rm -r /opt/samtools/samtools-1.15.tar.bz2 && \ + cd samtools-1.15/ && \ + autoheader && \ + autoconf -Wno-header && \ + ./configure && \ + make && \ + cp samtools /usr/bin/samtools + +### bcftools +WORKDIR /opt/bcftools +RUN wget https://github.com/samtools/bcftools/releases/download/1.15/bcftools-1.15.tar.bz2 && \ + tar xvf bcftools-1.15.tar.bz2 && \ + cd bcftools-1.15/ && \ + autoheader && \ + autoconf -Wno-header && \ + ./configure --enable-libgsl && \ + make && \ + cp bcftools /usr/bin/bcftools + +#build and install Flye +WORKDIR /opt/flye +RUN git clone https://github.com/fenderglass/Flye && \ + cd Flye && \ + python setup.py install + +#Install strainy & dependencies +WORKDIR /opt/strainy +RUN git clone -b docker https://github.com/katerinakazantseva/strainy && \ + cd strainy && \ + python3 -m pip install -r requirements.txt && \ + python3 setup.py install + + +ENV PYTHONUNBUFFERED "1" diff --git a/README.md b/README.md index a430c19..0501430 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Third, Strainy enables assembly-based analysis, which is useful in the absence o ## Contents * [Installation](#installation) * [Quick Usage](#quick-usage) +* [Docker container](#docker-container) * [Strainy input](#strainy-input) * [Preparing de novo metagenomic assemblies](#preparing-de-novo-metagenomic-assemblies) * [Parameters description](#parameters-description) @@ -48,7 +49,7 @@ Find details [**here**](https://github.com/conda/conda/issues/11216) ## Quick usage -After successful installation, you should be able to run: +After repository cloning/installation, you should be able to run: ``` conda activate strainy @@ -65,6 +66,15 @@ See below for the details on how to interpret strainy output. Also see a more detailed [tutorial](#strainy-tutorial) for the example of how to use Strainy. +## Docker container + +Alternatively, you can use a Docker container (using the example provided in `test_set` Strainy directory): + +``` +ST_DIR=`pwd` +docker run -v $ST_DIR:$ST_DIR -u `id -u`:`id -g` mkolmogo/strainy:1.0 strainy --gfa $ST_DIR/test_set/toy.gfa --fastq $ST_DIR/test_set/toy.fastq.gz -o $ST_DIR/out_strainy --threads 8 --mode hifi +``` + ## Strainy input Strainy supports PacBio HiFi, Nanopore R9 (Guppy5+) and R10 sequencing. diff --git a/requirements.txt b/requirements.txt index f54046e..90d351b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,11 @@ +networkx>=2.6,<3.4 +numpy>=1.24,<1.27 +pandas>=1.3,<3 +pysam>=0.20,<0.23 +scipy>=1.8,<1.13 biopython gfapy karateclub matplotlib -networkx -numpy -pandas pygraphviz -pysam edlib diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..edd536d --- /dev/null +++ b/setup.py @@ -0,0 +1,31 @@ +import os +import sys +import subprocess +import shutil + +try: + import setuptools +except ImportError: + sys.exit("setuptools package not found. " + "Please use 'pip install setuptools' first") + +from setuptools import setup + +# Make sure we're running from the setup.py directory. +script_dir = os.path.dirname(os.path.realpath(__file__)) +if script_dir != os.getcwd(): + os.chdir(script_dir) + +from strainy.__version__ import __version__ + + +setup(name='strainy', + version=__version__, + description='Metagenomic strain phasing and assmebly using long reads', + url='https://github.com/katerinakazantseva/strainy', + author='Ekaterina Kazantseva & Ataberk Donmez', + author_email = 'ekaterina.v.kazantseva@gmail.com', + license='CC BY-NC-SA 4.0', + packages=['strainy', 'strainy/clustering', 'strainy/gfa_operations', 'strainy/reports', 'strainy/simplification'], + entry_points={'console_scripts': ['strainy = strainy.main:main']}, + ) diff --git a/strainy.py b/strainy.py index 34ee921..48ea0d5 100755 --- a/strainy.py +++ b/strainy.py @@ -1,15 +1,12 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 + +""" +This script sets up environment paths +and invokes Strainy without installation. +""" -import sys import os -import platform -import re -import subprocess -import argparse -import logging -import shutil -import cProfile -import gfapy +import sys #Setting executable paths and additional import dirs @@ -23,126 +20,13 @@ os.environ["PATH"] = bin_absolute + os.pathsep + os.environ["PATH"] ### - -from strainy.phase import phase_main -from strainy.transform import transform_main -from strainy.params import StRainyArgs, init_global_args_storage -from strainy.logging import set_thread_logging -from strainy.preprocessing import preprocess_cmd_args - - -logger = logging.getLogger() - -def get_processor_name(): - if platform.system() == "Windows": - return platform.processor() - elif platform.system() == "Darwin": - os.environ['PATH'] = os.environ['PATH'] + os.pathsep + '/usr/sbin' - command ="sysctl -n machdep.cpu.brand_string" - return subprocess.check_output(command).strip() - elif platform.system() == "Linux": - command = "cat /proc/cpuinfo" - all_info = subprocess.check_output(command, shell=True).decode().strip() - for line in all_info.split("\n"): - if "model name" in line: - return re.sub( ".*model name.*:", "", line,1) - return "" - - def main(): - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - requiredNamed = parser.add_argument_group('Required named arguments') - requiredNamed.add_argument("-o", "--output", help="output directory",required=True) - requiredNamed.add_argument("-g", "--gfa", help="input gfa to uncollapse",required=True) - requiredNamed.add_argument("-m", "--mode", help="type of reads", choices=["hifi", "nano"], - required=True) - requiredNamed.add_argument("-q", "--fastq",help="fastq file with reads to phase / assemble", - required=True) - parser.add_argument("-s", "--stage", help="stage to run: either phase, transform or e2e (phase + transform)", - choices=["phase", "transform", "e2e"], default="e2e") - parser.add_argument("--snp", help="path to vcf file with SNP calls to use", default=None) - parser.add_argument("-t", "--threads", help="number of threads to use", type=int, default=4) - parser.add_argument("-f", "--fasta", required=False, help=argparse.SUPPRESS) - parser.add_argument("-b", "--bam", help="path to indexed alignment in bam format",required=False) - parser.add_argument("--link-simplify", required=False, action="store_true", default=False, - dest="link_simplify",help="Enable agressive graph simplification") - parser.add_argument("--debug", required=False, action="store_true", default=False, - help="Generate extra output for debugging") - parser.add_argument("--unitig-split-length", - help="The length (in kb) which the unitigs that are longer will be split, set 0 to disable", - required=False, - type=int, - default=50) - parser.add_argument("--only-split",help="Do not run stRainy, only split long gfa unitigs", default='False', - required=False) - parser.add_argument("-d","--cluster-divergence",help="cluster divergence", type=float, default=0, required=False) - parser.add_argument("-a","--allele-frequency",help="Set allele frequency for internal caller only (pileup)", - type=float, default=0.2, required=False) - parser.add_argument("--min-unitig-length", - help="The length (in kb) which the unitigs that are shorter will not be phased", - required=False, - type=float, - default=1) - parser.add_argument("--min-unitig-coverage", - help="The minimum coverage threshold for phasing unitigs, unitigs with lower coverage will not be phased", - required=False, - type=int, - default=20) - parser.add_argument("--max-unitig-coverage", - help="The maximum coverage threshold for phasing unitigs, unitigs with higher coverage will not be phased", - required=False, - type=int, - default=500) - - args = parser.parse_args() - args.strainy_root = strainy_root - #setting up global arguments storage - input_graph = gfapy.Gfa.from_file(args.gfa) - args.graph_edges = input_graph.segment_names - args.edges_to_phase = [] - init_global_args_storage(args) - BIN_TOOLS = ["samtools", "bcftools", "minimap2", StRainyArgs().flye] - for tool in BIN_TOOLS: - if not shutil.which(tool): - print("{} not installed".format(tool), file=sys.stderr) - return 1 - - os.makedirs(StRainyArgs().output, exist_ok=True) - os.makedirs(StRainyArgs().output_intermediate, exist_ok=True) - if os.path.isdir(StRainyArgs().log_phase): - shutil.rmtree(StRainyArgs().log_phase) - os.makedirs(StRainyArgs().log_phase, exist_ok=True) - set_thread_logging(StRainyArgs().log_phase, "phase_root", None) + #Strainy entry point + import strainy.main + sys.exit(strainy.main.main()) - preprocess_cmd_args(args) - if StRainyArgs().debug: - print(f'Using processor(s): {get_processor_name()}') - - # set one more time for the modified args - init_global_args_storage(args) - if args.only_split=='True': - sys.exit() - elif args.stage == "phase": - sys.exit(phase_main(args)) - elif args.stage == "transform": - sys.exit(transform_main(args)) - elif args.stage == "e2e": - pr_phase = cProfile.Profile() - pr_phase.enable() - phase_main(args) - logger.info("Phase stage completed, starting transform now...") - pr_phase.disable() - pr_phase.dump_stats(f'{StRainyArgs().output_intermediate}/phase.prof') - - pr_transform = cProfile.Profile() - pr_transform.enable() - transform_main(args) - logger.info("Transform stage completed, exiting...") - pr_transform.disable() - pr_transform.dump_stats(f'{StRainyArgs().output_intermediate}/transform.prof') if __name__ == "__main__": main() - diff --git a/strainy/__version__.py b/strainy/__version__.py new file mode 100644 index 0000000..f901408 --- /dev/null +++ b/strainy/__version__.py @@ -0,0 +1 @@ +__version__ = "1.1" diff --git a/strainy/gfa_operations/__init__.py b/strainy/gfa_operations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/strainy/main.py b/strainy/main.py new file mode 100644 index 0000000..abe72a8 --- /dev/null +++ b/strainy/main.py @@ -0,0 +1,134 @@ +import sys +import os +import platform +import re +import subprocess +import argparse +import gfapy +import logging +import shutil + +from strainy.phase import phase_main +from strainy.transform import transform_main +from strainy.params import StRainyArgs, init_global_args_storage +from strainy.logging import set_thread_logging +from strainy.preprocessing import preprocess_cmd_args +from strainy.__version__ import __version__ + + +logger = logging.getLogger() + +def get_processor_name(): + if platform.system() == "Windows": + return platform.processor() + elif platform.system() == "Darwin": + os.environ['PATH'] = os.environ['PATH'] + os.pathsep + '/usr/sbin' + command ="sysctl -n machdep.cpu.brand_string" + return subprocess.check_output(command).strip() + elif platform.system() == "Linux": + command = "cat /proc/cpuinfo" + all_info = subprocess.check_output(command, shell=True).decode().strip() + for line in all_info.split("\n"): + if "model name" in line: + return re.sub( ".*model name.*:", "", line,1) + return "" + + +def _version(): + return __version__ + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + requiredNamed = parser.add_argument_group('Required named arguments') + requiredNamed.add_argument("-o", "--output", help="output directory",required=True) + requiredNamed.add_argument("-g", "--gfa", help="input gfa to uncollapse",required=True) + requiredNamed.add_argument("-m", "--mode", help="type of reads", choices=["hifi", "nano"], required=True) + requiredNamed.add_argument("-q", "--fastq", + help="fastq file with reads to phase / assemble", + required=True) + + parser.add_argument("-s", "--stage", help="stage to run: either phase, transform or e2e (phase + transform)", + choices=["phase", "transform", "e2e"], default="e2e") + parser.add_argument("--snp", help="path to vcf file with SNP calls to use", default=None) + parser.add_argument("-t", "--threads", help="number of threads to use", type=int, default=4) + parser.add_argument("-f", "--fasta", required=False, help=argparse.SUPPRESS) + parser.add_argument("-b", "--bam", help="path to indexed alignment in bam format",required=False) + parser.add_argument("--link-simplify", required=False, action="store_true", default=False, dest="link_simplify", + help="Enable agressive graph simplification") + parser.add_argument("--debug", required=False, action="store_true", default=False, + help="Generate extra output for debugging") + parser.add_argument("--unitig-split-length", + help="The length (in kb) which the unitigs that are longer will be split, set 0 to disable", + required=False, + type=int, + default=50) + parser.add_argument("--only-split",help="Do not run stRainy, only split long gfa unitigs", default='False', required=False) + parser.add_argument("-d","--cluster-divergence",help="cluster divergence", type=float, default=0, required=False) + parser.add_argument("-a","--allele-frequency",help="Set allele frequency for internal caller only (pileup)", type=float, default=0.2, required=False) + parser.add_argument("--min-unitig-length", + help="The length (in kb) which the unitigs that are shorter will not be phased", + required=False, + type=float, + default=1) + parser.add_argument("--min-unitig-coverage", + help="The minimum coverage threshold for phasing unitigs, unitigs with lower coverage will not be phased", + required=False, + type=int, + default=20) + parser.add_argument("--max-unitig-coverage", + help="The maximum coverage threshold for phasing unitigs, unitigs with higher coverage will not be phased", + required=False, + type=int, + default=500) + parser.add_argument("-v", "--version", action="version", version=_version()) + + args = parser.parse_args() + #args.strainy_root = strainy_root + #setting up global arguments storage + input_graph = gfapy.Gfa.from_file(args.gfa) + args.graph_edges = input_graph.segment_names + args.edges_to_phase = [] + init_global_args_storage(args) + BIN_TOOLS = ["samtools", "bcftools", "minimap2"] + for tool in BIN_TOOLS: + if not shutil.which(tool): + print("{} not installed".format(tool), file=sys.stderr) + return 1 + + os.makedirs(StRainyArgs().output, exist_ok=True) + os.makedirs(StRainyArgs().output_intermediate, exist_ok=True) + if os.path.isdir(StRainyArgs().log_phase): + shutil.rmtree(StRainyArgs().log_phase) + os.makedirs(StRainyArgs().log_phase, exist_ok=True) + set_thread_logging(StRainyArgs().log_phase, "phase_root", None) + + preprocess_cmd_args(args) + + if StRainyArgs().debug: + print(f'Using processor(s): {get_processor_name()}') + + # set one more time for the modified args + init_global_args_storage(args) + if args.only_split=='True': + sys.exit() + elif args.stage == "phase": + sys.exit(phase_main(args)) + elif args.stage == "transform": + sys.exit(transform_main(args)) + elif args.stage == "e2e": + import cProfile + pr_phase = cProfile.Profile() + pr_phase.enable() + phase_main(args) + logger.info("Phase stage completed, starting transform now...") + pr_phase.disable() + pr_phase.dump_stats(f'{StRainyArgs().output_intermediate}/phase.prof') + + pr_transform = cProfile.Profile() + pr_transform.enable() + transform_main(args) + logger.info("Transform stage completed, exiting...") + pr_transform.disable() + pr_transform.dump_stats(f'{StRainyArgs().output_intermediate}/transform.prof') diff --git a/strainy/params.py b/strainy/params.py index 6877536..8d533be 100644 --- a/strainy/params.py +++ b/strainy/params.py @@ -19,7 +19,7 @@ def init_global_args_storage(args): _glob_args.mode = args.mode _glob_args.snp = args.snp _glob_args.threads = args.threads - _glob_args.flye = os.path.join(args.strainy_root, "submodules", "Flye", "bin", "flye") + #_glob_args.flye = os.path.join(args.strainy_root, "submodules", "Flye", "bin", "flye") _glob_args.log_phase = os.path.join(args.output, "log_phase") _glob_args.log_transform = os.path.join(args.output, "log_transform") _glob_args.phased_unitig_info_table_path = os.path.join(args.output, "phased_unitig_info_table.csv") diff --git a/submodules/Flye b/submodules/Flye index 6288291..72b2afa 160000 --- a/submodules/Flye +++ b/submodules/Flye @@ -1 +1 @@ -Subproject commit 62882917cdedaf5ed86c5f5b9a662262c83b09cd +Subproject commit 72b2afaa211eac5a8b8d45e8debe69de628aed28