diff --git a/.github/workflows/testmac.yml b/.github/workflows/testmac.yml index 11d656e7a05..802d656ccfb 100644 --- a/.github/workflows/testmac.yml +++ b/.github/workflows/testmac.yml @@ -15,7 +15,7 @@ on: jobs: testmac: name: Test on Mac - runs-on: macos-12 + runs-on: macos-15 steps: - name: Use cache @@ -26,14 +26,14 @@ jobs: lib include bin - key: ${{ runner.os }}-12-${{ github.ref }} + key: ${{ runner.os }}-15-${{ github.ref }} # Restore keys are a "list", but really only a multiline string is # accepted. Also we match by prefix. And the most recent cache is # used, not the most specific. # See: https://docs.github.com/en/actions/guides/caching-dependencies-to-speed-up-workflows#matching-a-cache-key restore-keys: | - ${{ runner.os }}-12-${{ github.base_ref }} - ${{ runner.os }}-12 + ${{ runner.os }}-15-${{ github.base_ref }} + ${{ runner.os }}-15 - name: Checkout code without submodules uses: actions/checkout@v2 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 65d745c2e2b..3fbdc26907a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -86,8 +86,6 @@ local-build-test-job: script: - THREADS=8 - nvm version - - python3 ./configure.py - - source ./source_me.sh - make get-deps - make -j${THREADS} - echo Testing @@ -96,6 +94,8 @@ local-build-test-job: - make test - make static -j${THREADS} # Also test as a backend for the tube map + # Tube map expects vg on PATH + - export PATH="$(pwd)/bin:${PATH}" - git clone https://github.com/vgteam/sequenceTubeMap.git - cd sequenceTubeMap # Tube map expects local IPv6 but Kubernetes won't let us have it diff --git a/.gitmodules b/.gitmodules index 6b6337be74d..f1fb6d32eef 100644 --- a/.gitmodules +++ b/.gitmodules @@ -63,7 +63,7 @@ url = https://github.com/adamnovak/backward-cpp.git [submodule "deps/elfutils"] path = deps/elfutils - url = git://sourceware.org/git/elfutils.git + url = https://sourceware.org/git/elfutils.git [submodule "deps/structures"] path = deps/structures url = https://github.com/vgteam/structures.git diff --git a/Dockerfile b/Dockerfile index ff1433eccfc..cf4279d6045 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,7 +48,6 @@ RUN apt-get -qq -y update && apt-get -qq -y upgrade && apt-get -qq -y install \ ###DEPS_END### # Prepare to build submodule dependencies -COPY source_me.sh /vg/source_me.sh COPY deps /vg/deps # To increase portability of the docker image, when building for amd64, set the # target CPU architecture to Nehalem (2008) rather than auto-detecting the @@ -59,17 +58,17 @@ RUN if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then sed -i s/m RUN find . -name CMakeCache.txt | xargs rm -f # Build the dependencies COPY Makefile /vg/Makefile -RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" CFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) deps +RUN CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" CFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) deps # Bring in the sources, which we need in order to build. COPY src /vg/src # Build all the object files for vg, but don't link. # Also pass the arch here -RUN . 
./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) objs +RUN CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) objs # Do the final build and link, knowing the version. Trim down the resulting binary but make sure to include enough debug info for profiling. -RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) static && strip -d bin/vg +RUN CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) static && strip -d bin/vg # Ship the scripts COPY scripts /vg/scripts diff --git a/Makefile b/Makefile index c0becd6407a..b2f25faaffc 100644 --- a/Makefile +++ b/Makefile @@ -22,12 +22,14 @@ LIB_DIR:=lib # INC_DIR must be a relative path INC_DIR:=include CWD:=$(shell pwd) -CXX ?= g++ PKG_CONFIG ?= pkg-config SFX := EXE:=vg$(SFX) +# Expose the compilers we want to use to all build commands as environment variables. +export CC CXX + all: $(BIN_DIR)/$(EXE) # Magic dependencies (see ) @@ -494,12 +496,12 @@ $(LIB_DIR)/libvg.$(SHARED_SUFFIX): $(LIBVG_SHARED_DEPS) # Each test set can have its own binary, and not link everything static $(UNITTEST_EXE): $(UNITTEST_BIN_DIR)/%: $(UNITTEST_OBJ_DIR)/%.o $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $@ $< $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $@ $< $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) # For a normal dynamic build we remove the static build marker $(BIN_DIR)/$(EXE): $(LIB_DIR)/libvg.a $(EXE_DEPS) -rm -f $(LIB_DIR)/vg_is_static - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) # We keep a file that we touch on the last static build. # If the vg linkables are newer than the last static build, we do a build $(LIB_DIR)/vg_is_static: $(OBJ_DIR)/main.o $(LIB_DIR)/libvg.a $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(DEPS) $(LINK_DEPS) @@ -521,16 +523,15 @@ static-docker: static scripts/* # We have system-level deps to install # We want the One True Place for them to be in the Dockerfile. 
get-deps: - sudo apt-get install -qq -y --no-upgrade $(shell cat Dockerfile | sed -n '/^###DEPS_BEGIN###/,$${p;/^###DEPS_END###/q}' | grep -v '^ *#' | grep -v "^RUN" | tr '\n' ' ' | tr -d '\\') + sudo DEBIAN_FRONTEND=$(DEBIAN_FRONTEND) apt-get install -qq -y --no-upgrade $(shell cat Dockerfile | sed -n '/^###DEPS_BEGIN###/,$${p;/^###DEPS_END###/q}' | grep -v '^ *#' | grep -v "^RUN" | tr '\n' ' ' | tr -d '\\') # And we have submodule deps to build deps: $(DEPS) test: $(BIN_DIR)/$(EXE) $(LIB_DIR)/libvg.a test/build_graph $(BIN_DIR)/shuf $(BIN_DIR)/vcf2tsv $(FASTAHACK_DIR)/fastahack $(BIN_DIR)/rapper - . ./source_me.sh && cd test && prove -v t + cd test && prove -v t # Hide the compiler configuration from the doc tests, so that the ones that - # build code can't pick up libraries out of the bg build itself. Definitely - # don't source source_me.sh! + # build code can't pick up libraries out of the vg build itself. # But still supply vg on the PATH. Hope it knows where its own libraries are. CFLAGS= CXXFLAGS= CPPFLAGS= LDFLAGS= INCLUDE_FLAGS= LIBRARY_PATH= LD_LIBRARY_PATH= DYLD_LIBRARY_PATH= DYLD_FALLBACK_LIBRARY_PATH= LD_INCLUDE_PATH= CC= CXX= CXX_STANDARD= PATH=$(CWD)/bin:$(PATH) doc/test-docs.sh @@ -557,68 +558,67 @@ else endif test/build_graph: test/build_graph.cpp $(LIB_DIR)/libvg.a $(SRC_DIR)/vg.hpp - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o test/build_graph test/build_graph.cpp $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o test/build_graph test/build_graph.cpp $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(FILTER) # TODO: The normal and debug jemalloc builds can't safely be run at the same time. $(LIB_DIR)/libjemalloc.a: $(JEMALLOC_DIR)/src/*.c - +. ./source_me.sh && rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ + +rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ $(LIB_DIR)/libjemalloc_debug.a: $(JEMALLOC_DIR)/src/*.c - +. ./source_me.sh && rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --enable-debug --enable-fill --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/libjemalloc_debug.a && cp -r include/* $(CWD)/$(INC_DIR)/ + +rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --enable-debug --enable-fill --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/libjemalloc_debug.a && cp -r include/* $(CWD)/$(INC_DIR)/ # Use fake patterns to tell Make that this rule generates all these files when run once. # Here % should always match "lib" which is a common substring. 
# See https://stackoverflow.com/a/19822767 $(LIB_DIR)/%sdsl.a $(LIB_DIR)/%divsufsort.a $(LIB_DIR)/%divsufsort64.a : $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) else - +. ./source_me.sh && cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) endif $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.cpp $(SSW_DIR)/*.h - +. ./source_me.sh && cd $(SSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(INC_DIR) + +cd $(SSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(INC_DIR) # We need to hide -Xpreprocessor -fopenmp from Snappy, at least on Mac, because # it will drop the -Xpreprocessor and keep the -fopenmp and upset Clang. $(LIB_DIR)/libsnappy.a: $(SNAPPY_DIR)/*.cc $(SNAPPY_DIR)/*.h - +. ./source_me.sh && cd $(SNAPPY_DIR) && ./autogen.sh && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" ./configure --prefix=$(CWD) $(FILTER) && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" $(MAKE) libsnappy.la $(FILTER) && cp .libs/libsnappy.a $(CWD)/lib/ && cp snappy-c.h snappy-sinksource.h snappy-stubs-public.h snappy.h $(CWD)/include/ + +cd $(SNAPPY_DIR) && ./autogen.sh && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" ./configure --prefix=$(CWD) $(FILTER) && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" $(MAKE) libsnappy.la $(FILTER) && cp .libs/libsnappy.a $(CWD)/lib/ && cp snappy-c.h snappy-sinksource.h snappy-stubs-public.h snappy.h $(CWD)/include/ $(INC_DIR)/gcsa/gcsa.h: $(LIB_DIR)/libgcsa2.a $(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/include/gcsa/*.h) ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) + +cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) else - +. 
./source_me.sh && cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) + +cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/gbwt/dynamic_gbwt.h: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libgbwt.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GBWT_DIR)/src/*.cpp) $(wildcard $(GBWT_DIR)/include/gbwt/*.h) ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/gbwtgraph/gbwtgraph.h: $(LIB_DIR)/libgbwtgraph.a $(LIB_DIR)/libgbwtgraph.a: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(LIB_DIR)/libhandlegraph.a $(wildcard $(GBWTGRAPH_DIR)/src/*.cpp) $(wildcard $(GBWTGRAPH_DIR)/include/gbwtgraph/*.h) ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/kff_io.hpp: $(LIB_DIR)/libkff.a -# We need to drop the hardcoderd CMAKE_CXX_FLAGS. See $(LIB_DIR)/libkff.a: $(KFF_DIR)/kff_io.cpp $(KFF_DIR)/kff_io.hpp.in ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cd $(KFF_DIR) && sed -i.bak '/set(CMAKE_CXX_FLAGS/d' CMakeLists.txt && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. 
&& AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cd $(KFF_DIR) && sed -i.bak '/set(CMAKE_CXX_FLAGS/d' CMakeLists.txt && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/BooPHF.h: $(BBHASH_DIR)/BooPHF.h @@ -628,17 +628,17 @@ $(INC_DIR)/progress_bar.hpp: $(PROGRESS_BAR_DIR)/progress_bar.hpp +cp $(PROGRESS_BAR_DIR)/progress_bar.hpp $(CWD)/$(INC_DIR) $(OBJ_DIR)/progress_bar.o: $(PROGRESS_BAR_DIR)/progress_bar.cpp $(PROGRESS_BAR_DIR)/*.hpp - +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(SHARED_OBJ_DIR)/progress_bar.o: $(PROGRESS_BAR_DIR)/progress_bar.cpp $(PROGRESS_BAR_DIR)/*.hpp - +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(INC_DIR)/Fasta.h: $(FASTAHACK_DIR)/Fasta.h - +. ./source_me.sh && cd $(FASTAHACK_DIR) && cp Fasta.h $(CWD)/$(INC_DIR) + +cd $(FASTAHACK_DIR) && cp Fasta.h $(CWD)/$(INC_DIR) $(OBJ_DIR)/Fasta.o: $(FASTAHACK_DIR)/Fasta.cpp $(INC_DIR)/Fasta.h - +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(FILTER) + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(FILTER) $(SHARED_OBJ_DIR)/Fasta.o: $(FASTAHACK_DIR)/Fasta.cpp $(INC_DIR)/Fasta.h - +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) # We have this target to clean up the old Protobuf we used to have. # We can remove it after we no longer care about building properly on a dirty @@ -664,10 +664,10 @@ $(LIB_DIR)/cleaned_old_elfutils: $(LIB_DIR)/libvgio.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/pkgconfig/htslib.pc $(LIB_DIR)/cleaned_old_protobuf_v003 $(LIBVGIO_DIR)/CMakeLists.txt $(LIBVGIO_DIR)/src/*.cpp $(LIBVGIO_DIR)/include/vg/io/*.hpp $(LIBVGIO_DIR)/deps/vg.proto +rm -f $(CWD)/$(INC_DIR)/vg.pb.h $(CWD)/$(INC_DIR)/vg/vg.pb.h +rm -Rf $(CWD)/$(INC_DIR)/vg/io/ - +. ./source_me.sh && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. 
$(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install + +export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. $(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install $(LIB_DIR)/libhandlegraph.a: $(LIBHANDLEGRAPH_DIR)/src/include/handlegraph/*.hpp $(LIBHANDLEGRAPH_DIR)/src/*.cpp - +. ./source_me.sh && cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. && $(MAKE) $(FILTER) && $(MAKE) install # On Linux, libdeflate builds a .so. @@ -683,7 +683,7 @@ ifeq ($(shell uname -s),Darwin) endif $(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDEFLATE_DIR)/lib/*/*.h $(LIBDEFLATE_DIR)/lib/*.c $(LIBDEFLATE_DIR)/lib/*/*.c - +. ./source_me.sh && cd $(LIBDEFLATE_DIR) && V=1 LDFLAGS="$(LDFLAGS) $(LD_RENAMEABLE_FLAGS)" $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR) + +cd $(LIBDEFLATE_DIR) && V=1 LDFLAGS="$(LDFLAGS) $(LD_RENAMEABLE_FLAGS)" $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR) # We build htslib after libdeflate so it can use libdeflate. # We need to do some wizardry to get it to pick up the right build and target system types on modern autotools. @@ -696,13 +696,13 @@ $(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDE # a system path, in case another htslib is installed on the system. Some HTSlib # headers look for the current HTSlib with <>. $(LIB_DIR)/libhts%a $(LIB_DIR)/pkgconfig/htslib%pc $(LIB_DIR)/libhts%$(SHARED_SUFFIX): $(LIB_DIR)/libdeflate.a $(LIB_DIR)/libdeflate.$(SHARED_SUFFIX) $(HTSLIB_DIR)/*.c $(HTSLIB_DIR)/*.h $(HTSLIB_DIR)/htslib/*.h $(HTSLIB_DIR)/cram/*.c $(HTSLIB_DIR)/cram/*.h - +. ./source_me.sh && cd $(HTSLIB_DIR) && rm -Rf $(CWD)/$(INC_DIR)/htslib $(CWD)/$(LIB_DIR)/libhts* && autoreconf -i && autoheader && autoconf || true - +. ./source_me.sh && cd $(HTSLIB_DIR) && (./configure -n 2>&1 || true) | grep "build system type" | rev | cut -f1 -d' ' | rev >systype.txt - +. 
./source_me.sh && cd $(HTSLIB_DIR) && CFLAGS="-I$(CWD)/$(HTSLIB_DIR) -isystem $(CWD)/$(HTSLIB_DIR) -I$(CWD)/$(INC_DIR) $(CFLAGS)" LDFLAGS="$(LDFLAGS) -L$(CWD)/$(LIB_DIR) $(LD_UTIL_RPATH_FLAGS)" ./configure --with-libdeflate --disable-s3 --disable-gcs --disable-libcurl --disable-plugins --prefix=$(CWD) --host=$$(cat systype.txt) $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(HTSLIB_DIR) && rm -Rf $(CWD)/$(INC_DIR)/htslib $(CWD)/$(LIB_DIR)/libhts* && autoreconf -i && autoheader && autoconf || true + +cd $(HTSLIB_DIR) && (./configure -n 2>&1 || true) | grep "build system type" | rev | cut -f1 -d' ' | rev >systype.txt + +cd $(HTSLIB_DIR) && CFLAGS="-I$(CWD)/$(HTSLIB_DIR) -isystem $(CWD)/$(HTSLIB_DIR) -I$(CWD)/$(INC_DIR) $(CFLAGS)" LDFLAGS="$(LDFLAGS) -L$(CWD)/$(LIB_DIR) $(LD_UTIL_RPATH_FLAGS)" ./configure --with-libdeflate --disable-s3 --disable-gcs --disable-libcurl --disable-plugins --prefix=$(CWD) --host=$$(cat systype.txt) $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && $(MAKE) install # Build and install tabixpp for vcflib. $(LIB_DIR)/libtabixpp.a: $(LIB_DIR)/libhts.a $(TABIXPP_DIR)/*.cpp $(TABIXPP_DIR)/*.hpp - +. ./source_me.sh && cd $(TABIXPP_DIR) && rm -f tabix.o libtabixpp.a && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" INCLUDES="-I$(CWD)/$(INC_DIR)" HTS_HEADERS="" $(MAKE) tabix.o $(FILTER) && ar rcs libtabixpp.a tabix.o + +cd $(TABIXPP_DIR) && rm -f tabix.o libtabixpp.a && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" INCLUDES="-I$(CWD)/$(INC_DIR)" HTS_HEADERS="" $(MAKE) tabix.o $(FILTER) && ar rcs libtabixpp.a tabix.o +cp $(TABIXPP_DIR)/libtabixpp.a $(LIB_DIR) && cp $(TABIXPP_DIR)/tabix.hpp $(INC_DIR) +echo "Name: tabixpp" > $(LIB_DIR)/pkgconfig/tabixpp.pc +echo "Description: Self-packaged tabixpp" >> $(LIB_DIR)/pkgconfig/tabixpp.pc @@ -718,7 +718,7 @@ $(LIB_DIR)/libtabixpp.a: $(LIB_DIR)/libhts.a $(TABIXPP_DIR)/*.cpp $(TABIXPP_DIR) # We need to use /usr first for CMake search or Ubuntu 22.04 will decide pybind11 is installed in / when actually it is only fully installed in /usr. $(LIB_DIR)/libvcflib.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libtabixpp.a $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.hpp $(VCFLIB_DIR)/contrib/*/*.cpp $(VCFLIB_DIR)/contrib/*/*.h +rm -f $(VCFLIB_DIR)/contrib/WFA2-lib/VERSION - +. ./source_me.sh && cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build . + +cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build . +cp $(VCFLIB_DIR)/contrib/filevercmp/*.h* $(INC_DIR) +cp $(VCFLIB_DIR)/contrib/fastahack/*.h* $(INC_DIR) +cp $(VCFLIB_DIR)/contrib/smithwaterman/*.h* $(INC_DIR) @@ -732,10 +732,10 @@ $(BIN_DIR)/vcf2tsv: $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.h $(LIB_DIR)/lib +cp $(VCFLIB_DIR)/build/vcf2tsv $(BIN_DIR) $(FASTAHACK_DIR)/fastahack: $(FASTAHACK_DIR)/*.c $(FASTAHACK_DIR)/*.h $(FASTAHACK_DIR)/*.cpp - +. 
./source_me.sh && cd $(FASTAHACK_DIR) && $(MAKE) $(FILTER) + +cd $(FASTAHACK_DIR) && $(MAKE) $(FILTER) $(LIB_DIR)/libgssw.a: $(GSSW_DIR)/src/gssw.c $(GSSW_DIR)/src/gssw.h - +. ./source_me.sh && cd $(GSSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libgssw.a $(CWD)/$(LIB_DIR)/ && cp src/gssw.h $(CWD)/$(INC_DIR)/ + +cd $(GSSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libgssw.a $(CWD)/$(LIB_DIR)/ && cp src/gssw.h $(CWD)/$(INC_DIR)/ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc +cd $(DEP_DIR)/lru_cache && cp *.h* $(CWD)/$(INC_DIR)/ @@ -744,32 +744,32 @@ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc $(INC_DIR)/dynamic/dynamic.hpp: $(DYNAMIC_DIR)/include/dynamic/*.hpp $(DYNAMIC_DIR)/include/dynamic/*/*.hpp +rm -Rf $(INC_DIR)/dynamic.hpp $(INC_DIR)/dynamic # annoyingly doesn't have an install option on the cmake, so we manually move their external dependency headers - +cd $(CWD)/$(DYNAMIC_DIR) && rm -Rf build && mkdir -p build && cd build && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && cmake -DCMAKE_VERBOSE_MAKEFILE=ON .. && make && cp -r $(CWD)/$(DYNAMIC_DIR)/deps/hopscotch_map/include/* $(CWD)/$(INC_DIR)/ + +cd $(CWD)/$(DYNAMIC_DIR) && rm -Rf build && mkdir -p build && cd build && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE=ON .. && make && cp -r $(CWD)/$(DYNAMIC_DIR)/deps/hopscotch_map/include/* $(CWD)/$(INC_DIR)/ # Do the copy of the main file last so we can tell if this recipe failed and redo it. # Otherwise we get dynamic.hpp without its deps +mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/dynamic/* $(INC_DIR)/dynamic/ $(INC_DIR)/sparsehash/sparse_hash_map: $(wildcard $(SPARSEHASH_DIR)/**/*.cc) $(wildcard $(SPARSEHASH_DIR)/**/*.h) - +. ./source_me.sh && cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install $(INC_DIR)/sparsepp/spp.h: $(wildcard $(SPARSEHASH_DIR)/sparsepp/*.h) +cp -r $(SPARSEPP_DIR)/sparsepp $(INC_DIR)/ #$(INC_DIR)/Variant.h $(LIB_DIR)/libvcfh.a: $(DEP_DIR)/libVCFH/*.cpp $(DEP_DIR)/libVCFH/*.hpp - +. ./source_me.sh && cd $(DEP_DIR)/libVCFH && $(MAKE) $(FILTER) && cp libvcfh.a $(CWD)/$(LIB_DIR)/ && cp vcfheader.hpp $(CWD)/$(INC_DIR)/ + +cd $(DEP_DIR)/libVCFH && $(MAKE) $(FILTER) && cp libvcfh.a $(CWD)/$(LIB_DIR)/ && cp vcfheader.hpp $(CWD)/$(INC_DIR)/ $(LIB_DIR)/libsonlib.a: $(CWD)/$(DEP_DIR)/sonLib/C/inc/*.h $(CWD)/$(DEP_DIR)/sonLib/C/impl/*.c - +. ./source_me.sh && cd $(DEP_DIR)/sonLib && $(MAKE) clean && kyotoTycoonLib="" CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/sonLib.a $(CWD)/$(LIB_DIR)/libsonlib.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib + +cd $(DEP_DIR)/sonLib && $(MAKE) clean && kyotoTycoonLib="" CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/sonLib.a $(CWD)/$(LIB_DIR)/libsonlib.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib $(LIB_DIR)/libpinchesandcacti.a: $(LIB_DIR)/libsonlib.a $(CWD)/$(DEP_DIR)/pinchesAndCacti/inc/*.h $(CWD)/$(DEP_DIR)/pinchesAndCacti/impl/*.c - +. 
./source_me.sh && cd $(DEP_DIR)/pinchesAndCacti && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cd $(CWD)/$(DEP_DIR)/sonLib && cp lib/stPinchesAndCacti.a $(CWD)/$(LIB_DIR)/libpinchesandcacti.a && cp lib/3EdgeConnected.a $(CWD)/$(LIB_DIR)/lib3edgeconnected.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib + +cd $(DEP_DIR)/pinchesAndCacti && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cd $(CWD)/$(DEP_DIR)/sonLib && cp lib/stPinchesAndCacti.a $(CWD)/$(LIB_DIR)/libpinchesandcacti.a && cp lib/3EdgeConnected.a $(CWD)/$(LIB_DIR)/lib3edgeconnected.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib # When building raptor we need to make sure to pre-generate and fix up the lexer # We also need to clear out its cmake stuff in case it found a wrong Bison and cached it. $(LIB_DIR)/libraptor2.a: $(RAPTOR_DIR)/src/* $(wildcard $(RAPTOR_DIR)/build/*) which bison - +. ./source_me.sh && cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" cmake .. && rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) + +cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" .. && rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) +touch $(LIB_DIR)/libraptor2.a # We need rapper from Raptor for the tests @@ -783,7 +783,7 @@ $(INC_DIR)/raptor2/raptor2.h: $(LIB_DIR)/libraptor2.a $(RAPTOR_DIR)/build/* +touch $(INC_DIR)/raptor2/raptor2.h $(LIB_DIR)/libstructures.a: $(STRUCTURES_DIR)/src/include/structures/*.hpp $(STRUCTURES_DIR)/src/*.cpp $(STRUCTURES_DIR)/Makefile - +. ./source_me.sh && cd $(STRUCTURES_DIR) && $(MAKE) clean && $(MAKE) lib/libstructures.a $(FILTER) && cp lib/libstructures.a $(CWD)/$(LIB_DIR)/ && cp -r src/include/structures $(CWD)/$(INC_DIR)/ + +cd $(STRUCTURES_DIR) && $(MAKE) clean && $(MAKE) lib/libstructures.a $(FILTER) && cp lib/libstructures.a $(CWD)/$(LIB_DIR)/ && cp -r src/include/structures $(CWD)/$(INC_DIR)/ $(INC_DIR)/sha1.hpp: $(SHA1_DIR)/sha1.hpp +cp $(SHA1_DIR)/*.h* $(CWD)/$(INC_DIR)/ @@ -813,15 +813,15 @@ $(LIB_DIR)/libdwfl.a: $(LIB_DIR)/libelf.a # running on. $(LIB_DIR)/libelf.a: $(ELFUTILS_DIR)/libebl/*.c $(ELFUTILS_DIR)/libebl/*.h $(ELFUTILS_DIR)/libdw/*.c $(ELFUTILS_DIR)/libdw/*.h $(ELFUTILS_DIR)/libelf/*.c $(ELFUTILS_DIR)/libelf/*.h $(ELFUTILS_DIR)/src/*.c $(ELFUTILS_DIR)/src/*.h $(LIB_DIR)/cleaned_old_elfutils +cd $(CWD)/$(INC_DIR)/ && rm -Rf elfutils gelf.h libelf.h dwarf.h libdwflP.h libdwfl.h libebl.h libelf.h - +. ./source_me.sh && cd $(ELFUTILS_DIR) && autoreconf -i -f && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" ./configure --enable-maintainer-mode --disable-libdebuginfod --disable-debuginfod --prefix=$(CWD) $(FILTER) - +. 
./source_me.sh && cd $(ELFUTILS_DIR)/libelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libelf.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libebl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libebl.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libdwfl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwfl.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libdwelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwelf.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/lib && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libeu.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libcpu && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libcpu.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/backends && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libebl_backends.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libdw && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libdw.a known-dwarf.h $(FILTER) + +cd $(ELFUTILS_DIR) && autoreconf -i -f && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" ./configure --enable-maintainer-mode --disable-libdebuginfod --disable-debuginfod --prefix=$(CWD) $(FILTER) + +cd $(ELFUTILS_DIR)/libelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libelf.a $(FILTER) + +cd $(ELFUTILS_DIR)/libebl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libebl.a $(FILTER) + +cd $(ELFUTILS_DIR)/libdwfl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwfl.a $(FILTER) + +cd $(ELFUTILS_DIR)/libdwelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwelf.a $(FILTER) + +cd $(ELFUTILS_DIR)/lib && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libeu.a $(FILTER) + +cd $(ELFUTILS_DIR)/libcpu && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libcpu.a $(FILTER) + +cd $(ELFUTILS_DIR)/backends && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libebl_backends.a $(FILTER) + +cd $(ELFUTILS_DIR)/libdw && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libdw.a known-dwarf.h $(FILTER) +cd $(ELFUTILS_DIR) && mkdir -p $(CWD)/$(INC_DIR)/elfutils && cp libdw/known-dwarf.h libdw/libdw.h libebl/libebl.h libelf/elf-knowledge.h version.h libdwfl/libdwfl.h libdwelf/libdwelf.h $(CWD)/$(INC_DIR)/elfutils && cp libelf/gelf.h libelf/libelf.h libdw/dwarf.h $(CWD)/$(INC_DIR) && cp libebl/libebl.a libdw/libdw.a libdwfl/libdwfl.a libdwelf/libdwelf.a libelf/libelf.a $(CWD)/$(LIB_DIR)/ $(OBJ_DIR)/sha1.o: $(SHA1_DIR)/sha1.cpp $(SHA1_DIR)/sha1.hpp @@ -830,30 +830,30 @@ $(SHARED_OBJ_DIR)/sha1.o: $(SHA1_DIR)/sha1.cpp $(SHA1_DIR)/sha1.hpp +$(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) $(LIB_DIR)/libfml.a: $(FERMI_DIR)/*.h $(FERMI_DIR)/*.c - . 
./source_me.sh && cd $(FERMI_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp *.h $(CWD)/$(INC_DIR)/ && cp libfml.a $(CWD)/$(LIB_DIR)/ + cd $(FERMI_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp *.h $(CWD)/$(INC_DIR)/ && cp libfml.a $(CWD)/$(LIB_DIR)/ # We don't need to hack the build to point at our htslib because sublinearLS gets its htslib from the include flags we set # But we do need to hack out the return type error to work around https://github.com/yoheirosen/sublinear-Li-Stephens/issues/6 # TODO: This probably means actually calling some things in the library is unsafe! $(LIB_DIR)/libsublinearLS.a: $(LINLS_DIR)/src/*.cpp $(LINLS_DIR)/src/*.hpp $(LIB_DIR)/libhts.a - . ./source_me.sh && cd $(LINLS_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(filter-out -Werror=return-type,$(CFLAGS))" CXXFLAGS="-fPIC $(filter-out -Werror=return-type,$(CXXFLAGS))" INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/ + cd $(LINLS_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(filter-out -Werror=return-type,$(CFLAGS))" CXXFLAGS="-fPIC $(filter-out -Werror=return-type,$(CXXFLAGS))" INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/ $(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/Makefile $(LIBBDSG_DIR)/bdsg/src/*.cpp $(LIBBDSG_DIR)/bdsg/include/bdsg/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/internal/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/overlays/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/sparsepp/spp.h $(INC_DIR)/dynamic/dynamic.hpp $(INC_DIR)/mio/mmap.hpp - +. ./source_me.sh && rm -Rf $(CWD)/$(INC_DIR)/bdsg && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) CXXFLAGS="$(INCLUDE_FLAGS) -fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && cp -r bdsg/include/* $(CWD)/$(INC_DIR) + +rm -Rf $(CWD)/$(INC_DIR)/bdsg && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) CXXFLAGS="$(INCLUDE_FLAGS) -fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && cp -r bdsg/include/* $(CWD)/$(INC_DIR) $(INC_DIR)/mio/mmap.hpp: $(MIO_DIR)/include/mio/* - +. ./source_me.sh && cp -r $(MIO_DIR)/include/mio $(CWD)/$(INC_DIR)/ + +cp -r $(MIO_DIR)/include/mio $(CWD)/$(INC_DIR)/ # It would be better to copy the atomic_queue directory rather than its contents, but to avoid re-writing mmmultimap... $(INC_DIR)/atomic_queue.h: $(ATOMIC_QUEUE_DIR)/include/* - +. ./source_me.sh && cp -r $(ATOMIC_QUEUE_DIR)/include/atomic_queue/* $(CWD)/$(INC_DIR)/ + +cp -r $(ATOMIC_QUEUE_DIR)/include/atomic_queue/* $(CWD)/$(INC_DIR)/ $(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/mmmultimap.hpp: $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h - +. 
./source_me.sh && cp $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(CWD)/$(INC_DIR)/ + +cp $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(CWD)/$(INC_DIR)/ $(INC_DIR)/ips4o.hpp: $(IPS4O_DIR)/ips4o.hpp $(IPS4O_DIR)/ips4o/* - +. ./source_me.sh && cp -r $(IPS4O_DIR)/ips4o* $(CWD)/$(INC_DIR)/ + +cp -r $(IPS4O_DIR)/ips4o* $(CWD)/$(INC_DIR)/ # The xg repo has a cmake build system based all around external projects, and # we need it to use our installed versions of everything instead. @@ -861,7 +861,7 @@ $(INC_DIR)/ips4o.hpp: $(IPS4O_DIR)/ips4o.hpp $(IPS4O_DIR)/ips4o/* $(LIB_DIR)/libxg.a: $(XG_DIR)/src/*.hpp $(XG_DIR)/src/*.cpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/ips4o.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h +rm -f $@ +cp -r $(XG_DIR)/src/*.hpp $(CWD)/$(INC_DIR) - +. ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -DNO_GFAKLUGE -c -o $(XG_DIR)/xg.o $(XG_DIR)/src/xg.cpp $(FILTER) + +$(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -DNO_GFAKLUGE -c -o $(XG_DIR)/xg.o $(XG_DIR)/src/xg.cpp $(FILTER) +ar rs $@ $(XG_DIR)/xg.o # Auto-git-versioning @@ -923,42 +923,42 @@ $(OBJ_DIR)/version.o: $(SRC_DIR)/version.cpp $(SRC_DIR)/version.hpp $(SRC_DIR)/v # Use static pattern rules so the dependency files will not be ignored if the output exists # See $(OBJ) $(OBJ_DIR)/main.o: $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cpp $(OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(SHARED_OBJ): $(SHARED_OBJ_DIR)/%.o : $(SRC_DIR)/%.cpp $(SHARED_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) @touch $@ $(ALGORITHMS_OBJ): $(ALGORITHMS_OBJ_DIR)/%.o : $(ALGORITHMS_SRC_DIR)/%.cpp $(ALGORITHMS_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(ALGORITHMS_SHARED_OBJ): $(ALGORITHMS_SHARED_OBJ_DIR)/%.o : $(ALGORITHMS_SRC_DIR)/%.cpp $(ALGORITHMS_SHARED_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) @touch $@ $(IO_OBJ): $(IO_OBJ_DIR)/%.o : $(IO_SRC_DIR)/%.cpp $(IO_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(IO_SHARED_OBJ): $(IO_SHARED_OBJ_DIR)/%.o : $(IO_SRC_DIR)/%.cpp $(IO_SHARED_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) @touch $@ $(SUBCOMMAND_OBJ): $(SUBCOMMAND_OBJ_DIR)/%.o : $(SUBCOMMAND_SRC_DIR)/%.cpp $(SUBCOMMAND_OBJ_DIR)/%.d $(DEPS) - . 
./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(UNITTEST_OBJ): $(UNITTEST_OBJ_DIR)/%.o : $(UNITTEST_SRC_DIR)/%.cpp $(UNITTEST_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(UNITTEST_SUPPORT_OBJ): $(UNITTEST_SUPPORT_OBJ_DIR)/%.o : $(UNITTEST_SUPPORT_SRC_DIR)/%.cpp $(UNITTEST_SUPPORT_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ # Config objects get individual rules $(CONFIG_OBJ_DIR)/allocator_config_jemalloc.o: $(CONFIG_SRC_DIR)/allocator_config_jemalloc.cpp $(CONFIG_OBJ_DIR)/allocator_config_jemalloc.d $(DEPS) $(LIB_DIR)/libjemalloc.a - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(CONFIG_OBJ_DIR)/allocator_config_jemalloc_debug.o: $(CONFIG_SRC_DIR)/allocator_config_jemalloc_debug.cpp $(CONFIG_OBJ_DIR)/allocator_config_jemalloc_debug.d $(DEPS) $(LIB_DIR)/libjemalloc_debug.a - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(CONFIG_OBJ_DIR)/allocator_config_system.o: $(CONFIG_SRC_DIR)/allocator_config_system.cpp $(CONFIG_OBJ_DIR)/allocator_config_system.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ # Use a fake rule to build .d files, so we don't complain if they don't exist. diff --git a/README.md b/README.md index 200ecff311c..b80954cadff 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ Note that a 64-bit OS is required. Ubuntu 20.04 should work. #### Linux: Build -When you are ready, build with `. ./source_me.sh && make`. You can use `make -j16` to run 16 build threads at a time, which greatly accelerates the process. If you have more CPU cores, you can use higher numbers. +When you are ready, build with `make`. You can use `make -j16` to run 16 build threads at a time, which greatly accelerates the process. If you have more CPU cores, you can use higher numbers. Note that vg can take anywhere from 10 minutes to more than an hour to compile depending on your machine and the number of threads used. @@ -161,7 +161,7 @@ Homebrew provides another package management solution for OSX, and may be prefer With dependencies installed, VG can now be built: - . ./source_me.sh && make + make As with Linux, you can add `-j16` or other numbers at the end to run multiple build tasks at once, if your computer can handle them. 
diff --git a/configure.py b/configure.py deleted file mode 100644 index 1290b07696d..00000000000 --- a/configure.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys -import shutil - - -def make_source_me(): - var_dict = { - "LIBRARY_PATH":"lib", - "LD_LIBRARY_PATH":"lib", - "LD_INCLUDE_PATH":"include", - "C_INCLUDE_PATH":"include", - "CPLUS_INCLUDE_PATH":"include", - "INCLUDE_PATH":"include", - "PATH":"bin" } - - my_wd = os.getcwd() - out_file = "source_me.sh" - - with open(out_file, "w") as ofi: - for i in var_dict: - o_line = "export" + " " + i + "=" + my_wd + "/" + var_dict[i] + ":$" + i + "\n" - ofi.write(o_line) - -def set_compiler(): - compiler_dict = { - "CC": "gcc", - "CXX": "g++" - } - - out_file = "source_me.sh" - with open(out_file, "a") as ofi: - for i in compiler_dict: - o_line = "export" + " " + i + "=" + "$(which " + compiler_dict[i] + ")\n" - ofi.write(o_line) - -def check_deps(): - return - -## pulled these out with this line: for i in `cat source_me.sh | cut -f 2 -d " " | cut -f 1 -d "="`; do echo "\"\$$i\":\"\","; done -if __name__ == "__main__": - make_source_me() - set_compiler() -## "$CC":"", - #"$CXX":"", diff --git a/deps/gbwt b/deps/gbwt index 029c2b45667..2aab62f0664 160000 --- a/deps/gbwt +++ b/deps/gbwt @@ -1 +1 @@ -Subproject commit 029c2b456675eab0fbb55de21c8002e1660cb0d3 +Subproject commit 2aab62f0664b2ce8eb4cffd6b3872c89b85fdfa6 diff --git a/deps/gbwtgraph b/deps/gbwtgraph index de61d340695..26a940cdf3e 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit de61d340695f64b8aa978816b195d6687e7fda5a +Subproject commit 26a940cdf3eb25974be7c49f598c2dd99e6ecf95 diff --git a/deps/gcsa2 b/deps/gcsa2 index 7dbf9305817..8b6b049ab64 160000 --- a/deps/gcsa2 +++ b/deps/gcsa2 @@ -1 +1 @@ -Subproject commit 7dbf93058171b5af420f1b6166d4d2bf54cd8748 +Subproject commit 8b6b049ab6444e891bb7c38ec8f38169ce62e5fb diff --git a/deps/libbdsg b/deps/libbdsg index 757896a678e..e98cda26c25 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 757896a678eb7af7215147ef38cdb9315775dfe0 +Subproject commit e98cda26c2522bee80292705494dca8dcffa1c5c diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 863d0118cf3..ef23c5fe989 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 863d0118cf303f9c9c55576e0f6b2f70ecd9689a +Subproject commit ef23c5fe9899f2b0afa53a32162ed0b06aff0e89 diff --git a/deps/sublinear-Li-Stephens b/deps/sublinear-Li-Stephens index 241001025d7..ef54e081fae 160000 --- a/deps/sublinear-Li-Stephens +++ b/deps/sublinear-Li-Stephens @@ -1 +1 @@ -Subproject commit 241001025d7cf516c0c9577f94e858e76560269f +Subproject commit ef54e081faebb3fef7ff54cf5d4f8068836951f7 diff --git a/doc/wiki b/doc/wiki index f70ea363837..f28a1e56005 160000 --- a/doc/wiki +++ b/doc/wiki @@ -1 +1 @@ -Subproject commit f70ea36383784fc731454c1b09014f65ad4f74d5 +Subproject commit f28a1e56005c729cf5c2dad6a251447bedba2949 diff --git a/scripts/setup-server b/scripts/setup-server index e3965b61ca9..794f12d1917 100644 --- a/scripts/setup-server +++ b/scripts/setup-server @@ -36,7 +36,7 @@ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 100 --slave git clone --recursive https://github.com/vgteam/vg.git # build vg -cd vg && source ./source_me.sh && make -j 32 static && sudo cp bin/vg /usr/local/bin/ +cd vg && make -j 32 static && sudo cp bin/vg /usr/local/bin/ sudo cp scripts/vg_sim_pos_compare.py /usr/local/bin/ cd ~ diff --git a/source_me.sh b/source_me.sh old mode 100755 new mode 100644 index 
0bb948c9280..d650cc14afe --- a/source_me.sh +++ b/source_me.sh @@ -1,23 +1,4 @@ -export LIBRARY_PATH=`pwd`/lib:$LIBRARY_PATH -export LD_LIBRARY_PATH=`pwd`/lib:$LD_LIBRARY_PATH -export DYLD_LIBRARY_PATH=`pwd`/lib:$DYLD_LIBRARY_PATH -export LD_INCLUDE_PATH=`pwd`/include:$LD_INCLUDE_PATH -# Setting include directories via C_INCLUDE_PATH/CPLUS_INCLUDE_PATH will -# automatically get them demoted to the end of the search list even if a -I -# option is passed to try and bump them up earlier, before other -I options. -# We leave the Makefile in charge of finding all the include directories. -export CFLAGS="-I $(pwd)/include ${CFLAGS}" -export CXXFLAGS="-I $(pwd)/include -I$(pwd)/include/dynamic ${CXXFLAGS}" -export PATH=`pwd`/bin:`pwd`/scripts:"$PATH" -export CC=$(which gcc) -export CXX=$(which g++) - -# -# disable until file arguments work as in normal bash :( -# -# add bash autocompletion -#if test -n "$BASH_VERSION" -#then -# -# . ./autocomp.bash -#fi +# We used to have a script here to set up all the include and library search +# paths for the vg build. But now the Makefile knows how to do it all for the +# build, and the vg binary knows where to look for its dynamic libraries. +echo 1>&2 "Sourcing source_me.sh is no longer necessary" diff --git a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp index 2ea619ea952..86fac693ba1 100644 --- a/src/algorithms/subgraph.cpp +++ b/src/algorithms/subgraph.cpp @@ -1,5 +1,6 @@ #include "subgraph.hpp" #include "../path.hpp" +#include "../crash.hpp" namespace vg { namespace algorithms { @@ -290,83 +291,116 @@ void extract_path_range(const PathPositionHandleGraph& source, path_handle_t pat } } -/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph -/// based on their order in the path position index provided by the source graph -/// will clear any path found in both graphs before writing the new steps into it -/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset -/// in the source graph (unless the subpath was not cut up at all) -void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph, - bool subpath_naming) { - std::unordered_map<std::string, std::map<uint64_t, handle_t> > subpaths; +void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph) { + + // We want to organize all visits by base path. This key type holds the + // sense, sample and locus names, haplotype, and phase block. + using base_metadata_t = std::tuple<PathSense, std::string, std::string, size_t, size_t>; + + // This stores, for each source graph base path, for each start offset, the handle at that offset on the path. + std::unordered_map<base_metadata_t, std::map<uint64_t, handle_t> > subpaths; + + // This stores information about base paths that don't have subranges, and + // their full lengths and circularity flags, so we can avoid generating new + // subrange metadata when we just have all of a path. + std::unordered_map<base_metadata_t, std::pair<size_t, bool>> full_path_info; + subgraph.for_each_handle([&](const handle_t& h) { handlegraph::nid_t id = subgraph.get_id(h); if (source.has_node(id)) { handle_t handle = source.get_handle(id); source.for_each_step_position_on_handle(handle, [&](const step_handle_t& step, const bool& is_rev, const uint64_t& pos) { path_handle_t path = source.get_path_handle_of_step(step); - std::string path_name = source.get_path_name(path); - subpaths[path_name][pos] = is_rev ? 
subgraph.flip(h) : h; + // Figure out the base path this visit is on + base_metadata_t key = {source.get_sense(path), source.get_sample_name(path), source.get_locus_name(path), source.get_haplotype(path), source.get_phase_block(path)}; + // Figure out the subrange of the base path it is relative to + subrange_t path_subrange = source.get_subrange(path); + uint64_t visit_offset = pos; + if (path_subrange != PathMetadata::NO_SUBRANGE) { + // If we have the position relative to a subrange, adjust by that subrange's offset. + visit_offset += path_subrange.first; + } + subpaths[key][visit_offset] = is_rev ? subgraph.flip(h) : h; + + if (path_subrange == PathMetadata::NO_SUBRANGE) { + // There's no subrange set, so this path is full-length in the source graph. + // See if we know of this path as a full-length path or not + auto it = full_path_info.find(key); + if (it == full_path_info.end()) { + // We haven't recorded its length and circularity yet, so do it. + full_path_info.emplace_hint(it, key, std::make_pair(source.get_path_length(path), source.get_is_circular(path))); + } + } return true; }); } }); - function<path_handle_t(const string&, bool, size_t)> new_subpath = - [&subgraph](const string& path_name, bool is_circular, size_t subpath_offset) { - PathSense sense; - string sample; - string locus; - size_t haplotype; - size_t phase_block; - subrange_t subrange; - PathMetadata::parse_path_name(path_name, sense, sample, locus, haplotype, phase_block, subrange); - if (subrange == PathMetadata::NO_SUBRANGE) { - subrange.first = subpath_offset; - } else { - subrange.first += subpath_offset; - } - subrange.first = subpath_offset; - subrange.second = PathMetadata::NO_END_POSITION; - string subpath_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange); - if (subgraph.has_path(subpath_name)) { - subgraph.destroy_path(subgraph.get_path_handle(subpath_name)); - } - return subgraph.create_path_handle(subpath_name, is_circular); - }; + for (auto& base_and_visits : subpaths) { + // For each base path + const base_metadata_t& base_path_metadata = base_and_visits.first; + const auto& start_to_handle = base_and_visits.second; + // If we didn't put anything in the visit collection, it shouldn't be here. + crash_unless(!start_to_handle.empty()); - for (auto& subpath : subpaths) { - const std::string& path_name = subpath.first; - path_handle_t source_path_handle = source.get_path_handle(path_name); - // destroy the path if it exists - if (subgraph.has_path(path_name)) { - subgraph.destroy_path(subgraph.get_path_handle(path_name)); - } - // create a new path. give it a subpath name if the flag's on and its smaller than original - path_handle_t path; - if (!subpath_naming || subpath.second.size() == source.get_step_count(source_path_handle) || - subpath.second.empty()) { - path = subgraph.create_path_handle(path_name, source.get_is_circular(source_path_handle)); - } else { - path = new_subpath(path_name, source.get_is_circular(source_path_handle), subpath.second.begin()->first); - } - for (auto p = subpath.second.begin(); p != subpath.second.end(); ++p) { - const handle_t& handle = p->second; - if (p != subpath.second.begin() && subpath_naming) { - auto prev = p; - --prev; - const handle_t& prev_handle = prev->second; - // distance from map - size_t delta = p->first - prev->first; - // what the distance should be if they're contiguous depends on relative orienations - size_t cont_delta = subgraph.get_length(prev_handle); - if (delta != cont_delta) { - // we have a discontinuity! 
we'll make a new path can continue from there - assert(subgraph.get_step_count(path) > 0); - path = new_subpath(path_name, subgraph.get_is_circular(path), p->first); + // We're going to walk over all the visits and find contiguous runs + auto run_start = start_to_handle.begin(); + auto run_end = run_start; + size_t start_coordinate = run_start->first; + while (run_end != start_to_handle.end()) { + // Until we run out of runs + // Figure out where this node ends on the path + size_t stop_coordinate = run_end->first + subgraph.get_length(run_end->second); + + // Look ahead + ++run_end; + + if (run_end != start_to_handle.end() && run_end->first == stop_coordinate) { + // The next visit is still contiguous, so advance. + continue; + } + + // Otherwise we've reached a break in continuity. We have a + // contiguous run from run_start to run_end, visiting the subrange + // start_coordinate to stop_coordinate. + + // Find out if we cover a full source graph path. + subrange_t run_subrange = {start_coordinate, stop_coordinate}; + bool is_circular = false; + if (start_coordinate == 0) { + // We might be a full path + auto found_length_and_circularity = full_path_info.find(base_path_metadata); + if (found_length_and_circularity != full_path_info.end() && found_length_and_circularity->second.first == stop_coordinate) { + // We are a full path + run_subrange = PathMetadata::NO_SUBRANGE; + // We can be circular. + is_circular = found_length_and_circularity->second.second; } } - //fill in the path information - subgraph.append_step(path, handle); + + // Make a path with all the metadata + path_handle_t new_path = subgraph.create_path( + std::get<0>(base_path_metadata), + std::get<1>(base_path_metadata), + std::get<2>(base_path_metadata), + std::get<3>(base_path_metadata), + std::get<4>(base_path_metadata), + run_subrange, + is_circular + ); + + for (auto it = run_start; it != run_end; ++it) { + // Copy the path's visits + subgraph.append_step(new_path, it->second); + } + + // Set up the next subpath. + // Set where it starts. + run_start = run_end; + if (run_start != start_to_handle.end()) { + // And if it will exist, set its start coordinate. + start_coordinate = run_start->first; + } } } } diff --git a/src/algorithms/subgraph.hpp b/src/algorithms/subgraph.hpp index fef9e174235..321baf20e40 100644 --- a/src/algorithms/subgraph.hpp +++ b/src/algorithms/subgraph.hpp @@ -34,13 +34,15 @@ void extract_id_range(const HandleGraph& source, const nid_t& id1, const nid_t& /// if end < 0, then it will walk to the end of the path void extract_path_range(const PathPositionHandleGraph& source, path_handle_t path_handle, int64_t start, int64_t end, MutableHandleGraph& subgraph); -/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph -/// based on their order in the path position index provided by the source graph -/// will clear any path found in both graphs before writing the new steps into it -/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset -/// in the source graph (unless the subpath was not cut up at all) -void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph, - bool subpath_naming = false); +/// Add subpaths to the subgraph for all paths visiting its nodes in the source +/// graph. +/// +/// Always generates correct path metadata, and a path for each contiguous +/// fragment of any base path. 
Assumes the source graph does not contain any +/// overlapping path fragments on a given base path, and that the subgraph does +/// not already contain any paths on a base path also present in the source +/// graph. +void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph); /// We can accumulate a subgraph without accumulating all the edges between its nodes /// this helper ensures that we get the full set diff --git a/src/chunker.cpp b/src/chunker.cpp index 00fb6f6a3e9..eb76a3cd09d 100644 --- a/src/chunker.cpp +++ b/src/chunker.cpp @@ -5,6 +5,7 @@ #include "algorithms/subgraph.hpp" #include "vg.hpp" #include "clip.hpp" +#include "crash.hpp" //#define debug @@ -22,16 +23,9 @@ PathChunker::~PathChunker() { void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only, MutablePathMutableHandleGraph& subgraph, Region& out_region) { - // This method still depends on VG - // (not a super high priority to port, as calling can now be done at genome scale and we no longer - // have to chunk up paths) - VG* vg_subgraph = dynamic_cast(&subgraph); - if (vg_subgraph == nullptr) { - vg_subgraph = new VG(); - assert(subgraph.get_node_count() == 0); - } // extract our path range into the graph + // TODO: Handle incoming names with subranges. path_handle_t path_handle = graph->get_path_handle(region.seq); step_handle_t start_step = graph->get_step_at_position(path_handle, region.start); handle_t start_handle = graph->get_handle_of_step(start_step); @@ -53,250 +47,98 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_ if (graph->get_is_reverse(step_handle)) { step_handle = graph->flip(step_handle); } - if (!vg_subgraph->has_node(graph->get_id(step_handle))) { - vg_subgraph->create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); + if (!subgraph.has_node(graph->get_id(step_handle))) { + subgraph.create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); } }; // expand the context and get path information // if forward_only true, then we only go forward. if (context > 0) { - algorithms::expand_subgraph_by_steps(*graph, *vg_subgraph, context, forward_only); + algorithms::expand_subgraph_by_steps(*graph, subgraph, context, forward_only); } if (length > 0) { - algorithms::expand_subgraph_by_length(*graph, *vg_subgraph, context, forward_only); + algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only); } else if (context == 0 && length == 0) { - algorithms::add_connecting_edges_to_subgraph(*graph, *vg_subgraph); + algorithms::add_connecting_edges_to_subgraph(*graph, subgraph); } - algorithms::add_subpaths_to_subgraph(*graph, *vg_subgraph, true); - - // merge back our reference path to use the old chopping code - // todo: work with subpaths somehow? 
- if (!vg_subgraph->has_path(region.seq)) { - map ref_subpaths; - vg_subgraph->for_each_path_handle([&](path_handle_t path_handle) { - string path_name = vg_subgraph->get_path_name(path_handle); - subrange_t subrange; - path_name = Paths::strip_subrange(path_name, &subrange); - if (subrange != PathMetadata::NO_SUBRANGE && path_name == region.seq) { - ref_subpaths[subrange.first] = path_handle; - } - }); - path_handle_t new_ref_path = vg_subgraph->create_path_handle(region.seq, graph->get_is_circular(path_handle)); - for (auto& ref_subpath : ref_subpaths) { - vg_subgraph->for_each_step_in_path(ref_subpath.second, [&] (step_handle_t subpath_step) { - vg_subgraph->append_step(new_ref_path, vg_subgraph->get_handle_of_step(subpath_step)); - }); - vg_subgraph->destroy_path(ref_subpath.second); - } - } - - // build the vg of the subgraph - vg_subgraph->remove_orphan_edges(); - - // get our range endpoints before context expansion - list& mappings = vg_subgraph->paths.get_path(region.seq); - assert(!mappings.empty()); - size_t mappings_size = mappings.size(); - int64_t input_start_node = graph->get_id(start_handle); - int64_t input_end_node = graph->get_id(end_handle); - -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "Path range in expanded subgraph is " << *mappings.begin() << "-" << *mappings.rbegin() << endl; - } -#endif - - // replaces old xg position_in_path() to check node counts in path - function(const PathHandleGraph&, handle_t, path_handle_t)> path_steps_of_handle = - [] (const PathHandleGraph& graph, handle_t handle, path_handle_t path_handle) { - vector node_steps = graph.steps_of_handle(handle); - vector node_path_steps; - for (auto step : node_steps) { - if (graph.get_path_handle_of_step(step) == path_handle) { - node_path_steps.push_back(step); - } - } - return node_path_steps; - }; - - // we have no direct way of getting our steps out of the subgraph, so we - // go through node ids. the problem is that cycles can introduce - // ambiguity. we check for that here (only to punt on it later) - vector start_node_path_steps = path_steps_of_handle(*graph, start_handle, path_handle); - vector end_node_path_steps = path_steps_of_handle(*graph, end_handle, path_handle); - bool end_points_on_cycle = start_node_path_steps.size() > 1 || end_node_path_steps.size() > 1; - - // keep track of the edges in our original path - set, pair>> path_edge_set = - // walking out with the context length (as supported below) won't always work as expansion - // can grab an arbitrary amount of path regardless of context. so we load up the entire path: - // (todo: could sniff out limits from subgraph...) - get_path_edge_index(graph->path_begin(path_handle), graph->path_back(path_handle), std::max(context, length)); - - // the distance between them and the nodes in our input range - size_t left_padding = 0; - size_t right_padding = 0; - // do we need to rewrite back to our graph? - bool rewrite_paths = false; + algorithms::add_subpaths_to_subgraph(*graph, subgraph); + + // Now we need to figure out how we expanded the target path region we + // asked for. + // + // We don't just want the lowest and highest bounds of any subpath, we want + // the lowest and highest bound of the subpaths that actually overlap the + // targeted region. + + // Find the lowest and highest offsets visited by any subpath of the target path we extracted on. 
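+    // Each subpath that add_subpaths_to_subgraph() creates carries subrange
+    // metadata recording the half-open interval of base path offsets it
+    // covers, so the bounds can be read from the metadata instead of by
+    // walking mappings as the old VG-specific code did.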
+ PathSense sense = graph->get_sense(path_handle); + std::string sample = graph->get_sample_name(path_handle); + std::string locus = graph->get_locus_name(path_handle); + size_t haplotype = graph->get_haplotype(path_handle); + size_t phase_block = graph->get_phase_block(path_handle); - if (!end_points_on_cycle) { - // start and end of our expanded chunk - auto start_it = mappings.begin(); - auto end_it = --mappings.end(); - - // find our input range in the expanded path. we know these nodes only appear once. - for (; start_it != mappings.end() && start_it->node_id() != input_start_node; ++start_it); - for (; end_it != mappings.begin() && end_it->node_id() != input_end_node; --end_it); - - // walk back our start point as we can without rank discontinuities. doesn't matter - // if we encounter cycles here, because we keep a running path length - auto cur_it = start_it; - auto prev_it = cur_it; - if (prev_it != mappings.begin()) { - for (; prev_it != mappings.begin(); --prev_it) { - cur_it = prev_it; - --cur_it; - handle_t prev_handle = vg_subgraph->get_handle(prev_it->node_id(), - prev_it->is_reverse()); - handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), - cur_it->is_reverse()); - edge_t edge = vg_subgraph->edge_handle(cur_handle, prev_handle); - if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), - make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "found discontinuity between when left scanning path in subgraph: " << *cur_it << " and " << *prev_it << endl; - } -#endif - break; - } - left_padding += cur_it->length; - } - } - start_it = prev_it; - // walk forward the end point - cur_it = end_it; - prev_it = cur_it; - for (++cur_it; cur_it != mappings.end(); ++prev_it, ++cur_it) { - handle_t prev_handle = vg_subgraph->get_handle(prev_it->node_id(), - prev_it->is_reverse()); - handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), - cur_it->is_reverse()); - edge_t edge = vg_subgraph->edge_handle(prev_handle, cur_handle); - if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), - make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "found discontinuity between when right scanning path in subgraph: " << *prev_it << " and " << *cur_it << endl; + // Find the outer bounds of selected subpaths of the target path + size_t min_start = std::numeric_limits::max(); + size_t max_end = 0; - } -#endif - break; - } - right_padding += cur_it->length; + subgraph.for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t subpath) { + if (subgraph.get_haplotype(subpath) != haplotype || subgraph.get_phase_block(subpath) != phase_block) { + // Skip this subpath since it's not the right phase/fragment + return true; } - end_it = prev_it; - rewrite_paths = start_it != mappings.begin() || end_it != --mappings.end(); - - // cut out nodes before and after discontinuity - mappings.erase(mappings.begin(), start_it); - mappings.erase(++end_it, mappings.end()); - } - // We're clipping at a cycle in the reference path. Just preserve the path as-is from the - // input region. 
- else { - mappings.clear(); - for (step_handle_t step = start_step; step != end_plus_one_step; step = graph->get_next_step(step)) { - handle_t step_handle = graph->get_handle_of_step(step); - mapping_t mapping; - mapping.set_node_id(graph->get_id(step_handle)); - mapping.set_is_reverse(graph->get_is_reverse(step_handle)); - mappings.push_back(mapping); + subrange_t subpath_subrange = subgraph.get_subrange(subpath); + if (subpath_subrange == PathMetadata::NO_SUBRANGE) { + // Fill in a 0 start + subpath_subrange.first = 0; } - rewrite_paths = true; - } - - // Cut our graph so that our reference path end points are graph tips. This will let the - // snarl finder use the path to find telomeres. - path_handle_t sg_path_handle = vg_subgraph->get_path_handle(region.seq); - Node* start_node = vg_subgraph->get_node(mappings.begin()->node_id()); - auto sg_start_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(start_node->id()), sg_path_handle); - if (rewrite_paths && sg_start_steps.size() == 1) { - if (!mappings.begin()->is_reverse() && vg_subgraph->start_degree(start_node) != 0) { - for (auto edge : vg_subgraph->edges_to(start_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); - } - } else if (mappings.begin()->is_reverse() && vg_subgraph->end_degree(start_node) != 0) { - for (auto edge : vg_subgraph->edges_from(start_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); + if (subpath_subrange.second == PathMetadata::NO_END_POSITION) { + // Compute a length and use that to get the end. + // TODO: Sniff for an efficient/available get_path_length. + size_t path_length = 0; + for (handle_t handle : subgraph.scan_path(subpath)) { + path_length += subgraph.get_length(handle); } + subpath_subrange.second = subpath_subrange.first + path_length; } - } - Node* end_node = vg_subgraph->get_node(mappings.rbegin()->node_id()); - auto sg_end_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(end_node->id()), sg_path_handle); - if (rewrite_paths && sg_end_steps.size() == 1) { - if (!mappings.rbegin()->is_reverse() && vg_subgraph->end_degree(end_node) != 0) { - for (auto edge : vg_subgraph->edges_from(end_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in order to make path end a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); - } - } else if (mappings.rbegin()->is_reverse() && vg_subgraph->start_degree(end_node) != 0) { - for (auto edge : vg_subgraph->edges_to(end_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in order to make path end a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); - } + + if (subpath_subrange.first > region.end || subpath_subrange.second <= region.start) { + // This subpath doesn't actually overlap the selected target path + // region (which is 0-based, end-inclusive), and so shouldn't count + // for extending the selected region along the target path. 
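+        // (Region coordinates are 0-based and end-inclusive while the
+        // subrange is half-open, so e.g. a subrange [200, 300) does not
+        // overlap a region that ends at offset 199.)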
+            return true;
+        }
-    // Sync our updated paths lists back into the Graph protobuf
-    if (rewrite_paths) {
-        vg_subgraph->paths.rebuild_node_mapping();
-        vg_subgraph->paths.rebuild_mapping_aux();
-        vg_subgraph->graph.clear_path();
-        vg_subgraph->paths.to_graph(vg_subgraph->graph);
+        // Min/max in the subrange bounds
+        min_start = std::min(min_start, subpath_subrange.first);
+        max_end = std::max(max_end, subpath_subrange.second);
+
+        return true;
+    });
+
+    // TODO: We assume we actually found some of the target path
+    crash_unless(min_start != std::numeric_limits<size_t>::max());
+
+    // Hackily remove source path subrange offsets if any
+    subrange_t source_subrange = graph->get_subrange(path_handle);
+    if (source_subrange != PathMetadata::NO_SUBRANGE) {
+        // If we requested something on this path region, we can't handle
+        // finding part of an earlier path region.
+        // TODO: Handle it.
+        crash_unless(min_start <= source_subrange.first);
+        min_start -= source_subrange.first;
+        max_end -= source_subrange.first;
     }
-    // copy back out of vg if necessary
-    if (dynamic_cast<VG*>(&subgraph) == nullptr) {
-        handlealgs::copy_path_handle_graph(vg_subgraph, &subgraph);
-        delete vg_subgraph;
-    }
+    // We can't represent a region with a 0 end-exclusive coordinate.
+    crash_unless(max_end != 0);
 
-    // start could fall inside a node. we find out where in the path the
-    // 0-offset point of the node is.
-    int64_t input_start_pos = graph->get_position_of_step(start_step);
-    int64_t input_end_pos = graph->get_position_of_step(end_step);
+    // Produce the output region. Convert coordinates to be 0-based, end-inclusive.
     out_region.seq = region.seq;
-    out_region.start = input_start_pos - left_padding;
-    out_region.end = input_end_pos + graph->get_length(end_handle) + right_padding - 1;
+    out_region.start = min_start;
+    out_region.end = max_end - 1;
 }
 
 void PathChunker::extract_snarls(const Region& region, SnarlManager& snarl_manager, MutablePathMutableHandleGraph& subgraph) {
@@ -350,7 +192,7 @@ void PathChunker::extract_snarls(const Region& region, SnarlManag
     algorithms::add_connecting_edges_to_subgraph(*graph, subgraph);
 
     // now fill in the paths
-    algorithms::add_subpaths_to_subgraph(*graph, subgraph, true);
+    algorithms::add_subpaths_to_subgraph(*graph, subgraph);
 }
 
 void PathChunker::extract_path_component(const string& path_name, MutablePathMut
         path_ids.insert(graph->get_id(handle));
     }
-    extract_component(path_ids, subgraph, true);
+    extract_component(path_ids, subgraph);
     out_region.seq = path_name;
 }
 
-void PathChunker::extract_component(const unordered_set<nid_t>& node_ids, MutablePathMutableHandleGraph& subgraph, bool subpath_naming) {
+void PathChunker::extract_component(const unordered_set<nid_t>& node_ids, MutablePathMutableHandleGraph& subgraph) {
 
     for (nid_t node_id : node_ids) {
         subgraph.create_handle(graph->get_sequence(graph->get_handle(node_id)), node_id);
     }
 
     algorithms::expand_subgraph_by_steps(*graph, subgraph, numeric_limits<uint64_t>::max());
-    algorithms::add_subpaths_to_subgraph(*graph, subgraph, subpath_naming);
+    algorithms::add_subpaths_to_subgraph(*graph, subgraph);
 }
 
 void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length,
@@ -389,7 +231,7 @@ void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int64_t context
     if (length) {
         algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only);
     }
-    algorithms::add_subpaths_to_subgraph(*graph, subgraph, true);
+    algorithms::add_subpaths_to_subgraph(*graph, subgraph);
 
     // build the vg
     out_region.start = subgraph.min_node_id();
diff --git a/src/chunker.hpp b/src/chunker.hpp
index b9d88aeb60d..56922100613 100644
--- a/src/chunker.hpp
+++ b/src/chunker.hpp
@@ -31,9 +31,12 @@ class PathChunker {
 
     /** Extract subgraph corresponding to given path region into its
      * own vg graph, and send it to out_stream. The boundaries of the
-     * extracted graph (which can be different because we expand context and don't
-     * cut nodes) are written to out_region. If forward_only set, context
-     * is only expanded in the forward direction
+     * extracted region of the target path (which can be different because we
+     * expand context and don't cut nodes) are written to out_region. If the
+     * target path goes through the extracted region multiple times, only the
+     * extended bounds of the visit containing the target region are produced.
+     *
+     * If forward_only is set, context is only expanded in the forward direction.
      *
      * NOTE: we follow convention of Region coordinates being 0-based
     * inclusive.
@@ -49,14 +52,14 @@ class PathChunker {
                           MutablePathMutableHandleGraph& subgraph);
 
     /**
-     * Extract a connected component containing a given path
+     * Extract a connected component containing a given path. Processes path metadata and creates subpaths.
     */
    void extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region);
 
    /**
-     * Extract a connected component starting from an id set
+     * Extract a connected component starting from an id set. Processes path metadata and creates subpaths.
     */
-    void extract_component(const unordered_set<nid_t>& node_ids, MutablePathMutableHandleGraph& subgraph, bool subpath_naming);
+    void extract_component(const unordered_set<nid_t>& node_ids, MutablePathMutableHandleGraph& subgraph);
 
    /**
     * Like above, but use (inclusive) id range instead of region on path.
diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp
index aa052db955a..0f15b8139d7 100644
--- a/src/deconstructor.cpp
+++ b/src/deconstructor.cpp
@@ -1168,9 +1168,10 @@ string Deconstructor::get_vcf_header() {
     }
 
     if (sample_to_haps.empty()) {
-        cerr << "Error [vg deconstruct]: No paths found for alt alleles in the graph. Note that "
-             << "exhaustive path-free traversal finding is no longer supported, and vg deconstruct "
-             << "now only works on embedded paths and GBWT threads." << endl;
+        cerr << "Error [vg deconstruct]: No paths other than selected reference(s) found in the graph, "
+             << "so no alt alleles can be generated. Note that exhaustive path-free traversal finding "
+             << "is no longer supported, and vg deconstruct now only works on embedded paths and GBWT "
+             << "threads."
<< endl; exit(1); } diff --git a/src/gbwt_helper.cpp b/src/gbwt_helper.cpp index 0958ba45533..19306a7dfe3 100644 --- a/src/gbwt_helper.cpp +++ b/src/gbwt_helper.cpp @@ -88,26 +88,6 @@ gbwt::size_type gbwt_node_width(const HandleGraph& graph) { return gbwt::bit_length(gbwt::Node::encode(graph.max_node_id(), true)); } -void finish_gbwt_constuction(gbwt::GBWTBuilder& builder, - const std::vector& sample_names, - const std::vector& contig_names, - size_t haplotype_count, bool print_metadata, - const std::string& header) { - - builder.finish(); - builder.index.metadata.setSamples(sample_names); - builder.index.metadata.setHaplotypes(haplotype_count); - builder.index.metadata.setContigs(contig_names); - if (print_metadata) { - #pragma omp critical - { - std::cerr << header << ": "; - gbwt::operator<<(std::cerr, builder.index.metadata); - std::cerr << std::endl; - } - } -} - //------------------------------------------------------------------------------ void load_gbwt(gbwt::GBWT& index, const std::string& filename, bool show_progress) { diff --git a/src/gbwt_helper.hpp b/src/gbwt_helper.hpp index 33e1545066c..57b819a579b 100644 --- a/src/gbwt_helper.hpp +++ b/src/gbwt_helper.hpp @@ -67,13 +67,6 @@ gbwt::vector_type path_predecessors(const PathHandleGraph& graph, const std::str /// Determine the node width in bits for the GBWT nodes based on the given graph. gbwt::size_type gbwt_node_width(const HandleGraph& graph); -/// Finish GBWT construction and optionally print the metadata. -void finish_gbwt_constuction(gbwt::GBWTBuilder& builder, - const std::vector& sample_names, - const std::vector& contig_names, - size_t haplotype_count, bool print_metadata, - const std::string& header = "GBWT"); - //------------------------------------------------------------------------------ /* diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp index 528160485cd..75399a86e64 100644 --- a/src/haplotype_indexer.cpp +++ b/src/haplotype_indexer.cpp @@ -2,20 +2,22 @@ * \file haplotype_indexer.cpp: implementations of haplotype indexing with the GBWT */ +#include "haplotype_indexer.hpp" + #include -#include +#include #include #include #include +#include #include #include "gbwt_helper.hpp" -#include "haplotype_indexer.hpp" - -#include "path.hpp" #include "alignment.hpp" +#include "hash_map.hpp" +#include "path.hpp" using namespace std; @@ -394,7 +396,7 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const std::vecto // And add every path that passes the filter (including haplotype paths) from the source graph. gbwtgraph::store_paths(builder, *graph, {PathSense::GENERIC, PathSense::REFERENCE, PathSense::HAPLOTYPE}, &path_filter); - // Finish the construction for this set of threads and put the index back. + // Finish the construction for this set of paths and put the index back. builder.finish(); builder.swapIndex(*index); } @@ -413,24 +415,45 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandle return build_gbwt({}, "GBWT", &graph); } -std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandleGraph& graph, - const std::vector& aln_filenames, const std::string& aln_format) const { +std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& graph, + const std::vector& aln_filenames, const std::string& aln_format, size_t parallel_jobs) const { + + // Handle multithreading and parallel jobs. + parallel_jobs = std::max(1, parallel_jobs); + int old_threads = omp_get_max_threads(); + omp_set_num_threads(parallel_jobs); - // GBWT metadata. 
-    std::vector<std::string> sample_names, contig_names;
-    std::map<std::string, std::pair<size_t, size_t>> sample_info; // name -> (id, count)
-    contig_names.push_back("0"); // An artificial contig.
-    size_t haplotype_count = 0;
+    // Partition the graph into construction jobs.
+    if (this->show_progress) {
+        #pragma omp critical
+        {
+            std::cerr << "Partitioning the graph into GBWT construction jobs" << std::endl;
+        }
+    }
+    size_t target_size = graph.get_node_count() / parallel_jobs;
+    gbwtgraph::ConstructionJobs jobs = gbwtgraph::gbwt_construction_jobs(graph, target_size);
+    if (this->show_progress) {
+        #pragma omp critical
+        {
+            std::cerr << "Created " << jobs.size() << " parallel construction jobs" << std::endl;
+        }
+    }
 
     // GBWT construction.
-    gbwt::GBWTBuilder builder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval);
-    builder.index.addMetadata();
+    std::vector<std::mutex> builder_mutexes(jobs.size());
+    std::vector<std::unique_ptr<gbwt::GBWTBuilder>> builders(jobs.size());
+    // This is a bit inefficient, as read names are often longer than the SSO threshold for GCC (but not for Clang).
+    // TODO: Maybe use concatenated 0-terminated names?
+    std::vector<std::vector<std::string>> read_names(jobs.size());
+    for (size_t i = 0; i < jobs.size(); i++) {
+        builders[i].reset(new gbwt::GBWTBuilder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval));
+    }
 
     // Actual work.
     if (this->show_progress) {
         #pragma omp critical
         {
-            std::cerr << "Converting " << aln_format << " to threads" << std::endl;
+            std::cerr << "Converting " << aln_format << " to GBWT paths" << std::endl;
         }
     }
     std::function<void(Alignment&)> lambda = [&](Alignment& aln) {
@@ -438,37 +461,98 @@ std::unique_ptr<gbwt::DynamicGBWT> HaplotypeIndexer::build_gbwt(const PathHandle
         for (auto& m : aln.path().mapping()) {
             buffer.push_back(mapping_to_gbwt(m));
         }
-        builder.insert(buffer, true); // Insert in both orientations.
-        size_t sample_id = 0, sample_count = 0;
-        auto iter = sample_info.find(aln.name());
-        if (iter == sample_info.end()) {
-            sample_id = sample_names.size();
-            sample_names.push_back(aln.name());
-            sample_info[aln.name()] = std::pair<size_t, size_t>(sample_id, sample_count);
-            haplotype_count++;
-        } else {
-            sample_id = iter->second.first;
-            sample_count = iter->second.second;
-            iter->second.second++;
+        size_t job_id = 0;
+        if (buffer.size() > 0) {
+            job_id = jobs.job(gbwt::Node::id(buffer.front()));
+            if (job_id >= jobs.size()) {
+                job_id = 0;
+            }
+        }
+        {
+            // Insert the path into the appropriate builder and record the read name.
+            std::lock_guard<std::mutex> lock(builder_mutexes[job_id]);
+            builders[job_id]->insert(buffer, true);
+            read_names[job_id].push_back(aln.name());
         }
-        builder.index.metadata.addPath(sample_id, 0, 0, sample_count);
     };
     for (auto& file_name : aln_filenames) {
         if (aln_format == "GAM") {
             get_input_file(file_name, [&](istream& in) {
-                vg::io::for_each(in, lambda);
+                vg::io::for_each_parallel(in, lambda);
             });
         } else {
             assert(aln_format == "GAF");
-            vg::io::gaf_unpaired_for_each(graph, file_name, lambda);
+            vg::io::gaf_unpaired_for_each_parallel(graph, file_name, lambda);
         }
     }
-
-    // Finish the construction and extract the index.
-    finish_gbwt_constuction(builder, sample_names, contig_names, haplotype_count, this->show_progress);
-    std::unique_ptr<gbwt::DynamicGBWT> built(new gbwt::DynamicGBWT());
-    builder.swapIndex(*built);
-    return built;
+
+    // Finish the construction and convert to compressed GBWT.
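+    // Each builder is finished and compressed independently, so the loop
+    // below can hand one job to each OpenMP thread; the compressed partial
+    // GBWTs are then merged into a single index.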
+    std::vector<gbwt::GBWT> partial_indexes(jobs.size());
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (size_t i = 0; i < jobs.size(); i++) {
+        builders[i]->finish();
+        partial_indexes[i] = gbwt::GBWT(builders[i]->index);
+        builders[i].reset();
+    }
+
+    // Merge the partial indexes.
+    if (this->show_progress) {
+        #pragma omp critical
+        {
+            std::cerr << "Merging the partial GBWTs" << std::endl;
+        }
+    }
+    std::unique_ptr<gbwt::GBWT> result(new gbwt::GBWT(partial_indexes));
+    partial_indexes.clear();
+
+    // Create the metadata.
+    if (this->show_progress) {
+        #pragma omp critical
+        {
+            std::cerr << "Creating metadata" << std::endl;
+        }
+    }
+    result->addMetadata();
+    result->metadata.setContigs({ "unknown" });
+    {
+        // We can use 32-bit values, as GBWT metadata uses them as well.
+        string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>> read_info; // name -> (sample id, fragment count)
+        for (auto& names : read_names) {
+            for (const std::string& name : names) {
+                std::uint32_t sample_id = 0, fragment_count = 0;
+                auto iter = read_info.find(name);
+                if (iter == read_info.end()) {
+                    sample_id = read_info.size();
+                    read_info[name] = std::make_pair(sample_id, fragment_count);
+                } else {
+                    sample_id = iter->second.first;
+                    fragment_count = iter->second.second;
+                    iter->second.second++;
+                }
+                result->metadata.addPath(sample_id, 0, 0, fragment_count);
+            }
+            names = std::vector<std::string>();
+        }
+        std::vector<std::string> sample_names(read_info.size());
+        for (auto& p : read_info) {
+            sample_names[p.second.first] = p.first;
+        }
+        read_info = string_hash_map<std::string, std::pair<std::uint32_t, std::uint32_t>>();
+        result->metadata.setSamples(sample_names);
+        result->metadata.setHaplotypes(sample_names.size());
+    }
+    if (this->show_progress) {
+        #pragma omp critical
+        {
+            std::cerr << "GBWT: ";
+            gbwt::operator<<(std::cerr, result->metadata);
+            std::cerr << std::endl;
+        }
+    }
+
+    // Restore the number of threads.
+    omp_set_num_threads(old_threads);
+    return result;
 }
 
 }
diff --git a/src/haplotype_indexer.hpp b/src/haplotype_indexer.hpp
index f555aec947b..ca212a94267 100644
--- a/src/haplotype_indexer.hpp
+++ b/src/haplotype_indexer.hpp
@@ -135,10 +135,15 @@ class HaplotypeIndexer : public Progressive {
      * the same name, the corresponding GBWT path names will have the same
      * sample identifier but different values in the count field.
      *
-     * aln_format can be "GAM" or "GAF"
+     * aln_format can be "GAM" or "GAF".
+     *
+     * Runs approximately the given number of jobs in parallel. The exact
+     * number depends on the sizes of weakly connected components in the graph.
+     * Each job uses at most 2 threads.
      */
-    std::unique_ptr<gbwt::DynamicGBWT> build_gbwt(const PathHandleGraph& graph,
-        const std::vector<std::string>& aln_filenames, const std::string& aln_format) const;
+    std::unique_ptr<gbwt::DynamicGBWT> build_gbwt(const HandleGraph& graph,
+        const std::vector<std::string>& aln_filenames, const std::string& aln_format,
+        size_t parallel_jobs = 1) const;
 };
 
 }
diff --git a/src/kmer.cpp b/src/kmer.cpp
index 52d63713096..188e3f01ca6 100644
--- a/src/kmer.cpp
+++ b/src/kmer.cpp
@@ -348,7 +348,15 @@ string write_gcsa_kmers_to_tmpfile(const HandleGraph& graph, int kmer_size, size
         temp_file::remove(tmpfile);
         throw ex;
     }
+    if (!out) {
+        std::cerr << "error[write_gcsa_kmers_to_tmpfile]: I/O error while writing kmers to " << tmpfile << std::endl;
+        exit(1);
+    }
     out.close();
+    if (!out) {
+        std::cerr << "error[write_gcsa_kmers_to_tmpfile]: I/O error while closing kmer file " << tmpfile << std::endl;
+        exit(1);
+    }
     return tmpfile;
 }
diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp
index bc00016dbce..7a5f3e5bd49 100644
--- a/src/snarl_distance_index.cpp
+++ b/src/snarl_distance_index.cpp
@@ -20,13 +20,13 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po
                             get_id(pos2), get_is_rev(pos2), get_offset(pos2));
 }
 
-void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit) {
+void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool silence_warnings) {
     distance_index->set_snarl_size_limit(size_limit);
 
     //Build the temporary distance index from the graph
     SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit);
-    if (temp_index.use_oversized_snarls) {
+    if (!silence_warnings && temp_index.use_oversized_snarls) {
         cerr << "warning: distance index uses oversized snarls, which may make mapping slow" << endl;
         cerr << "\ttry increasing --snarl-limit when building the distance index" << endl;
     }
diff --git a/src/snarl_distance_index.hpp b/src/snarl_distance_index.hpp
index 6af73878c97..781b30f6708 100644
--- a/src/snarl_distance_index.hpp
+++ b/src/snarl_distance_index.hpp
@@ -22,7 +22,7 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po
 
 //Fill in the index
 //size_limit is a limit on the number of nodes in a snarl, after which the index won't store pairwise distances
-void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000);
+void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000, bool silence_warnings=true);
 
 //Fill in the temporary snarl record with distances
 void populate_snarl_index(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index,
diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp
index 22f6e02651e..970b75d35b6 100644
--- a/src/subcommand/chunk_main.cpp
+++ b/src/subcommand/chunk_main.cpp
@@ -349,6 +349,36 @@ int main_chunk(int argc, char** argv) {
     bool chunk_gam = !gam_files.empty() && gam_split_size == 0;
     bool chunk_graph = gam_and_graph || (!chunk_gam && gam_split_size == 0);
 
+    // parse the regions into a list before loading the graph, if we're
+    // specifying regions by path name.
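+    // Region strings look like "chr1" or "chr1:5000-10000"; following the
+    // Region convention noted in chunker.hpp, coordinates are interpreted as
+    // 0-based and end-inclusive.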
+    vector<Region> regions;
+    if (!region_strings.empty()) {
+        for (auto& region_string : region_strings) {
+            Region region;
+            parse_region(region_string, region);
+            regions.push_back(region);
+        }
+    }
+    if (!path_list_file.empty()) {
+        ifstream pr_stream(path_list_file.c_str());
+        if (!pr_stream) {
+            cerr << "error:[vg chunk] unable to open path regions: " << path_list_file << endl;
+            return 1;
+        }
+        while (pr_stream) {
+            string buf;
+            std::getline(pr_stream, buf);
+            if (!buf.empty()) {
+                Region region;
+                parse_region(buf, region);
+                regions.push_back(region);
+            }
+        }
+    }
+    if (!in_bed_file.empty()) {
+        parse_bed_regions(in_bed_file, regions);
+    }
+
     // Load the snarls
     unique_ptr<SnarlManager> snarl_manager;
     if (!snarl_filename.empty()) {
@@ -377,9 +407,17 @@ int main_chunk(int argc, char** argv) {
             return 1;
         }
         in.close();
+
+        // To support the regions we were asked for, we might need to ensure
+        // the paths they are on are actually indexed for reference style
+        // offset lookups.
+        std::unordered_set<std::string> ensure_indexed;
+        for (auto& region : regions) {
+            ensure_indexed.insert(region.seq);
+        }
         path_handle_graph = vg::io::VPKG::load_one<PathHandleGraph>(xg_file);
-        graph = overlay_helper.apply(path_handle_graph.get());
+        graph = overlay_helper.apply(path_handle_graph.get(), ensure_indexed);
         in.close();
     }
@@ -463,35 +501,7 @@ int main_chunk(int argc, char** argv) {
     // (instead of an index)
     unordered_map node_to_component;
 
-    // parse the regions into a list
-    vector<Region> regions;
-    if (!region_strings.empty()) {
-        for (auto& region_string : region_strings) {
-            Region region;
-            parse_region(region_string, region);
-            regions.push_back(region);
-        }
-    }
-    else if (!path_list_file.empty()) {
-        ifstream pr_stream(path_list_file.c_str());
-        if (!pr_stream) {
-            cerr << "error:[vg chunk] unable to open path regions: " << path_list_file << endl;
-            return 1;
-        }
-        while (pr_stream) {
-            string buf;
-            std::getline(pr_stream, buf);
-            if (!buf.empty()) {
-                Region region;
-                parse_region(buf, region);
-                regions.push_back(region);
-            }
-        }
-    }
-    else if (!in_bed_file.empty()) {
-        parse_bed_regions(in_bed_file, regions);
-    }
-    else if (id_range) {
+    if (id_range) {
         if (n_chunks) {
             // Determine the ranges from the source graph itself.
             // how many nodes per range?
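The branch that follows (its body is elided here) answers that comment by carving the graph's nodes into n_chunks roughly equal ID ranges. As a rough, illustrative sketch only, assuming node IDs are roughly dense (which the real vg code need not assume), the idea is:

    // Sketch: split the inclusive ID range [min_id, max_id] into n_chunks
    // contiguous ranges of roughly equal width. Illustrative only.
    std::vector<std::pair<vg::id_t, vg::id_t>> ranges;
    vg::id_t min_id = graph->min_node_id();
    vg::id_t max_id = graph->max_node_id();
    vg::id_t width = (max_id - min_id + n_chunks) / n_chunks; // ceiling division
    for (vg::id_t low = min_id; low <= max_id; low += width) {
        ranges.emplace_back(low, std::min<vg::id_t>(low + width - 1, max_id));
    }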
@@ -556,9 +566,9 @@ int main_chunk(int argc, char** argv) { delete range_stream; } } - else if (graph != nullptr && (!components || path_components)) { - // every path - graph->for_each_path_handle([&](path_handle_t path_handle) { + if (graph != nullptr && path_components) { + // every reference or generic path (guaranteed to be reference indexed) + graph->for_each_path_matching({PathSense::REFERENCE, PathSense::GENERIC}, {}, {}, [&](path_handle_t path_handle) { Region region; region.seq = graph->get_path_name(path_handle); if (!Paths::is_alt(region.seq)) { @@ -596,7 +606,7 @@ int main_chunk(int argc, char** argv) { if (!id_range) { for (auto& region : regions) { if (!graph->has_path(region.seq)) { - cerr << "error[vg chunk]: input path " << region.seq << " not found in xg index" << endl; + cerr << "error[vg chunk]: input path " << region.seq << " not found in graph" << endl; return 1; } region.start = max((int64_t)0, region.start); @@ -716,7 +726,7 @@ int main_chunk(int argc, char** argv) { map trace_thread_frequencies; if (!component_ids.empty()) { subgraph = vg::io::new_output_graph(output_format); - chunker.extract_component(component_ids[i], *subgraph, false); + chunker.extract_component(component_ids[i], *subgraph); output_regions[i] = region; } else if (id_range == false) { diff --git a/src/subcommand/find_main.cpp b/src/subcommand/find_main.cpp index 48ebda09a6a..2e1fa0355d2 100644 --- a/src/subcommand/find_main.cpp +++ b/src/subcommand/find_main.cpp @@ -1,4 +1,5 @@ #include "subcommand.hpp" +#include "../crash.hpp" #include "../utility.hpp" #include "../mapper.hpp" #include @@ -683,8 +684,9 @@ int main_find(int argc, char** argv) { ofstream out(s.str().c_str()); vg::io::save_handle_graph(&graph, out); out.close(); - // reset our graph - dynamic_cast(graph).clear(); + // reset our graph so it has no nodes or paths anymore + graph.clear(); + crash_unless(graph.get_path_count() == 0); } if (subgraph_k) { prep_graph(); // don't forget to prep the graph, or the kmer set will be wrong[ diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index db1175b426e..4eca4e47a81 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -37,7 +37,7 @@ struct GBWTConfig { build_mode build = build_none; merge_mode merge = merge_none; path_cover_mode path_cover = path_cover_none; - bool metadata_mode = false, thread_mode = false; + bool metadata_mode = false, path_mode = false; // Input GBWT construction. HaplotypeIndexer haplotype_indexer; @@ -55,8 +55,8 @@ struct GBWTConfig { // Other parameters and flags. bool show_progress = false; - bool count_threads = false; - bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, thread_names = false, tags = false; + bool count_paths = false; + bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, path_names = false, tags = false; bool include_named_paths = false; size_t num_paths = default_num_paths(), context_length = default_context_length(); bool num_paths_set = false; @@ -70,7 +70,7 @@ struct GBWTConfig { // File names. std::string gbwt_output; // Output GBWT. - std::string thread_output; // Threads in SDSL format. + std::string path_output; // Paths in SDSL format. std::string graph_output; // Output GBWTGraph. std::string segment_translation; // Segment to node translation output. std::string r_index_name; // Output r-index. 
@@ -163,7 +163,7 @@ void step_4_path_cover(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& con void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_6_r_index(GBWTHandler& gbwts, GBWTConfig& config); void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config); -void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config); +void step_8_paths(GBWTHandler& gbwts, GBWTConfig& config); void report_time_memory(const std::string& what, double start_time, const GBWTConfig& config); void print_metadata(std::ostream& out, const GBWTHandler& gbwts); @@ -233,9 +233,9 @@ int main_gbwt(int argc, char** argv) { step_7_metadata(gbwts, config); } - // Thread options. - if (config.thread_mode) { - step_8_threads(gbwts, config); + // Path options. + if (config.path_mode) { + step_8_paths(gbwts, config); } return 0; @@ -260,7 +260,7 @@ void help_gbwt(char** argv) { std::cerr << " --id-interval N store path ids at one out of N positions (default " << gbwt::DynamicGBWT::SAMPLE_INTERVAL << ")" << std::endl; std::cerr << std::endl; std::cerr << "Multithreading:" << std::endl; - std::cerr << " --num-jobs N use at most N parallel build jobs (for -v, -G, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; + std::cerr << " --num-jobs N use at most N parallel build jobs (for -v, -G, -A, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; std::cerr << " --num-threads N use N parallel search threads (for -b and -r; default " << omp_get_max_threads() << ")" << std::endl; std::cerr << std::endl; std::cerr << "Step 1: GBWT construction (requires -o and one of { -v, -G, -Z, -E, A }):" << std::endl; @@ -330,12 +330,12 @@ void help_gbwt(char** argv) { std::cerr << " -H, --haplotypes print the number of haplotypes" << std::endl; std::cerr << " -S, --samples print the number of samples" << std::endl; std::cerr << " -L, --list-names list contig/sample names (use with -C or -S)" << std::endl; - std::cerr << " -T, --thread-names list thread names" << std::endl; + std::cerr << " -T, --path-names list path names" << std::endl; std::cerr << " --tags list GBWT tags" << std::endl; std::cerr << std::endl; - std::cerr << "Step 8: Threads (one input GBWT):" << std::endl; - std::cerr << " -c, --count-threads print the number of threads" << std::endl; - std::cerr << " -e, --extract FILE extract threads in SDSL format to FILE" << std::endl; + std::cerr << "Step 8: Paths (one input GBWT):" << std::endl; + std::cerr << " -c, --count-paths print the number of paths" << std::endl; + std::cerr << " -e, --extract FILE extract paths in SDSL format to FILE" << std::endl; std::cerr << std::endl; } @@ -424,7 +424,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { constexpr int OPT_PASS_PATHS = 1400; constexpr int OPT_GBZ_FORMAT = 1500; constexpr int OPT_TAGS = 1700; - + + // Deprecated options. + constexpr int OPT_THREAD_NAMES = 2000; + constexpr int OPT_COUNT_THREADS = 2001; + // Make a collection of all the known tags and their descriptions. Use an ordered map so that we can do some typo guessing. // Values are description and list of prohibited characters. 
const std::map>> KNOWN_TAGS = { @@ -519,11 +523,13 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { { "haplotypes", no_argument, 0, 'H' }, { "samples", no_argument, 0, 'S' }, { "list-names", no_argument, 0, 'L' }, - { "thread-names", no_argument, 0, 'T' }, + { "path-names", no_argument, 0, 'T' }, + { "thread-names", no_argument, 0, OPT_THREAD_NAMES }, { "tags", no_argument, 0, OPT_TAGS }, - // Threads - { "count-threads", no_argument, 0, 'c' }, + // Paths + { "count-paths", no_argument, 0, 'c' }, + { "count-threads", no_argument, 0, OPT_COUNT_THREADS }, { "extract", required_argument, 0, 'e' }, { "help", no_argument, 0, 'h' }, @@ -864,7 +870,12 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.metadata_mode = true; break; case 'T': - config.thread_names = true; + config.path_names = true; + config.metadata_mode = true; + break; + case OPT_THREAD_NAMES: + std::cerr << "warning: [vg gbwt] option --thread-names is deprecated; use --path-names instead" << std::endl; + config.path_names = true; config.metadata_mode = true; break; case OPT_TAGS: @@ -872,14 +883,19 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.metadata_mode = true; break; - // Threads + // Paths case 'c': - config.count_threads = true; - config.thread_mode = true; + config.count_paths = true; + config.path_mode = true; + break; + case OPT_COUNT_THREADS: + std::cerr << "warning: [vg gbwt] option --count-threads is deprecated; use --count-paths instead" << std::endl; + config.count_paths = true; + config.path_mode = true; break; case 'e': - config.thread_output = optarg; - config.thread_mode = true; + config.path_output = optarg; + config.path_mode = true; break; case 'h': @@ -999,9 +1015,11 @@ void validate_gbwt_config(GBWTConfig& config) { if (!config.to_remove.empty()) { if (config.build == GBWTConfig::build_gbz) { std::cerr << "error: [vg gbwt] the GBWT extracted from GBZ cannot have paths modified" << std::endl; + std::exit(EXIT_FAILURE); } if (config.build == GBWTConfig::build_gbwtgraph) { std::cerr << "error: [vg gbwt] the GBWT loaded with a GBWTGraph cannot have paths modified" << std::endl; + std::exit(EXIT_FAILURE); } if (!(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none) || !has_gbwt_output) { std::cerr << "error: [vg gbwt] removing a sample requires one input GBWT and output GBWT" << std::endl; @@ -1075,9 +1093,9 @@ void validate_gbwt_config(GBWTConfig& config) { } } - if (config.thread_mode) { + if (config.path_mode) { if (!one_input_gbwt) { - std::cerr << "error: [vg gbwt] thread operations require one input GBWT" << std::endl; + std::cerr << "error: [vg gbwt] path operations require one input GBWT" << std::endl; std::exit(EXIT_FAILURE); } } @@ -1371,7 +1389,8 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co if (config.show_progress) { std::cerr << "Input type: " << (config.gam_format ? "GAM" : "GAF") << std::endl; } - std::unique_ptr temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF")); + std::unique_ptr temp = + config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? 
"GAM" : "GAF"), config.build_jobs); gbwts.use(*temp); } @@ -1456,7 +1475,7 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { gbwts.use_dynamic(); if (!(gbwts.dynamic.hasMetadata() && gbwts.dynamic.metadata.hasPathNames() && gbwts.dynamic.metadata.hasSampleNames())) { - std::cerr << "error: [vg gbwt] the index does not contain metadata with thread and sample names" << std::endl; + std::cerr << "error: [vg gbwt] the index does not contain metadata with path and sample names" << std::endl; std::exit(EXIT_FAILURE); } @@ -1469,11 +1488,11 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { } std::vector path_ids = gbwts.dynamic.metadata.removeSample(sample_id); if (path_ids.empty()) { - std::cerr << "warning: [vg gbwt] no threads associated with sample " << sample_name << std::endl; + std::cerr << "warning: [vg gbwt] no paths associated with sample " << sample_name << std::endl; continue; } if (config.show_progress) { - std::cerr << "Removing " << path_ids.size() << " threads for sample " << sample_name << std::endl; + std::cerr << "Removing " << path_ids.size() << " paths for sample " << sample_name << std::endl; } gbwts.dynamic.remove(path_ids); } @@ -1664,7 +1683,7 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { } } - if (config.thread_names) { + if (config.path_names) { auto& metadata = get_metadata(); if (metadata.hasPathNames()) { // Precompute some metadata @@ -1674,7 +1693,7 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { std::cout << gbwtgraph::compose_path_name(gbwts.compressed, i, sense) << std::endl; } } else { - std::cerr << "error: [vg gbwt] the metadata does not contain thread names" << std::endl; + std::cerr << "error: [vg gbwt] the metadata does not contain path names" << std::endl; } } @@ -1690,19 +1709,19 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { //---------------------------------------------------------------------------- -void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) { - // Extract threads in SDSL format. - if (!config.thread_output.empty()) { +void step_8_paths(GBWTHandler& gbwts, GBWTConfig& config) { + // Extract paths in SDSL format. + if (!config.path_output.empty()) { double start = gbwt::readTimer(); if (config.show_progress) { - std::cerr << "Extracting threads to " << config.thread_output << std::endl; + std::cerr << "Extracting paths to " << config.path_output << std::endl; } gbwts.use_compressed(); if (config.show_progress) { std::cerr << "Starting the extraction" << std::endl; } gbwt::size_type node_width = gbwt::bit_length(gbwts.compressed.sigma() - 1); - gbwt::text_buffer_type out(config.thread_output, std::ios::out, gbwt::MEGABYTE, node_width); + gbwt::text_buffer_type out(config.path_output, std::ios::out, gbwt::MEGABYTE, node_width); for (gbwt::size_type id = 0; id < gbwts.compressed.sequences(); id += 2) { // Ignore reverse complements. gbwt::vector_type sequence = gbwts.compressed.extract(id); for (auto node : sequence) { @@ -1711,11 +1730,11 @@ void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) { out.push_back(gbwt::ENDMARKER); } out.close(); - report_time_memory("Threads extracted", start, config); + report_time_memory("Paths extracted", start, config); } - // There are two sequences for each thread. - if (config.count_threads) { + // There are two sequences for each path. 
+ if (config.count_paths) { gbwts.use_compressed(); std::cout << (gbwts.compressed.sequences() / 2) << std::endl; } diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 3ffb69dc70a..731fdcff26d 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -1,4 +1,4 @@ -// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and GBWT indexes +// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and distance indexes #include #include @@ -11,7 +11,6 @@ #include "subcommand.hpp" #include "../vg.hpp" -#include "../haplotype_indexer.hpp" #include "xg.hpp" #include #include @@ -23,12 +22,10 @@ #include "../integrated_snarl_finder.hpp" #include "../snarl_distance_index.hpp" #include "../source_sink_overlay.hpp" -#include "../gbwt_helper.hpp" #include "../gbwtgraph_helper.hpp" #include "../gcsa_helper.hpp" #include -#include #include #include @@ -47,24 +44,6 @@ void help_index(char** argv) { << "xg options:" << endl << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl << " -L, --xg-alts include alt paths in xg" << endl - << "gbwt options (more in vg gbwt):" << endl - << " -v, --vcf-phasing FILE generate threads from the haplotypes in the VCF file FILE" << endl - << " -W, --ignore-missing don't warn when variants in the VCF are missing from the graph; silently skip them" << endl - << " -T, --store-threads generate threads from the embedded paths" << endl - << " -M, --store-gam FILE generate threads from the alignments in gam FILE (many allowed)" << endl - << " -F, --store-gaf FILE generate threads from the alignments in gaf FILE (many allowed)" << endl - << " -G, --gbwt-name FILE store the threads as GBWT in FILE" << endl - << " -z, --actual-phasing do not make unphased homozygous genotypes phased"<< endl - << " -P, --force-phasing replace unphased genotypes with randomly phased ones" << endl - << " -o, --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved" << endl - << " -B, --batch-size N number of samples per batch (default 200)" << endl - << " -u, --buffer-size N GBWT construction buffer size in millions of nodes (default 100)" << endl - << " -n, --id-interval N store haplotype ids at one out of N positions (default 1024)" << endl - << " -R, --range X..Y process samples X to Y (inclusive)" << endl - << " -r, --rename V=P rename contig V in the VCFs to path P in the graph (may repeat)" << endl - << " --rename-variants when renaming contigs, find variants in the graph based on the new name" << endl - << " -I, --region C:S-E operate on only the given 1-based region of the given VCF contig (may repeat)" << endl - << " -E, --exclude SAMPLE exclude any samples with the given name from haplotype indexing" << endl << "gcsa options:" << endl << " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl //<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl @@ -83,12 +62,6 @@ void help_index(char** argv) { << " if N is 0 then don't store distances, only the snarl tree" << endl; } -void multiple_thread_sources() { - std::cerr << "error: [vg index] cannot generate threads from multiple sources (VCF, GAM, GAF, paths)" << std::endl; - std::cerr << "error: [vg index] GBWT indexes can be built separately and merged with vg gbwt" << std::endl; - std::exit(EXIT_FAILURE); -} - int main_index(int argc, char** argv) { if (argc == 2) { @@ -101,25 +74,18 @@ int main_index(int 
argc, char** argv) { #define OPT_DISTANCE_SNARL_LIMIT 1002 // Which indexes to build. - bool build_xg = false, build_gbwt = false, build_gcsa = false, build_dist = false; + bool build_xg = false, build_gcsa = false, build_dist = false; // Files we should read. string vcf_name, mapping_name; vector dbg_names; // Files we should write. - string xg_name, gbwt_name, gcsa_name, dist_name; - + string xg_name, gcsa_name, dist_name; // General bool show_progress = false; - // GBWT - HaplotypeIndexer haplotype_indexer; - enum thread_source_type { thread_source_none, thread_source_vcf, thread_source_paths, thread_source_gam, thread_source_gaf }; - thread_source_type thread_source = thread_source_none; - vector aln_file_names; - // GCSA gcsa::size_type kmer_size = gcsa::Key::MAX_LENGTH; gcsa::ConstructionParameters params; @@ -152,7 +118,7 @@ int main_index(int argc, char** argv) { {"thread-db", required_argument, 0, 'F'}, {"xg-alts", no_argument, 0, 'L'}, - // GBWT + // GBWT. These have been removed and will return an error. {"vcf-phasing", required_argument, 0, 'v'}, {"ignore-missing", no_argument, 0, 'W'}, {"store-threads", no_argument, 0, 'T'}, @@ -211,7 +177,6 @@ int main_index(int argc, char** argv) { break; case 'p': show_progress = true; - haplotype_indexer.show_progress = true; break; // XG @@ -223,111 +188,26 @@ int main_index(int argc, char** argv) { xg_alts = true; break; - // GBWT - case 'v': - if (thread_source != thread_source_none) { - multiple_thread_sources(); - } - thread_source = thread_source_vcf; - vcf_name = optarg; - break; - case 'W': - haplotype_indexer.warn_on_missing_variants = false; - break; - case 'T': - if (thread_source != thread_source_none) { - multiple_thread_sources(); - } - thread_source = thread_source_paths; - break; - case 'M': - if (thread_source != thread_source_none && thread_source != thread_source_gam) { - multiple_thread_sources(); - } - thread_source = thread_source_gam; - build_gbwt = true; - aln_file_names.push_back(optarg); - break; - case 'F': - if (thread_source != thread_source_none && thread_source != thread_source_gaf) { - multiple_thread_sources(); - } - thread_source = thread_source_gaf; - build_gbwt = true; - aln_file_names.push_back(optarg); - break; - case 'G': - build_gbwt = true; - gbwt_name = optarg; - break; - case 'z': - haplotype_indexer.phase_homozygous = false; - break; - case 'P': - haplotype_indexer.force_phasing = true; - break; - case 'o': - haplotype_indexer.discard_overlaps = true; - break; - case 'B': - haplotype_indexer.samples_in_batch = std::max(parse(optarg), 1ul); - break; - case 'u': - haplotype_indexer.gbwt_buffer_size = std::max(parse(optarg), 1ul); - break; - case 'n': - haplotype_indexer.id_interval = parse(optarg); - break; - case 'R': - { - // Parse first..last - string temp(optarg); - size_t found = temp.find(".."); - if(found == string::npos || found == 0 || found + 2 == temp.size()) { - cerr << "error: [vg index] could not parse range " << temp << endl; - exit(1); - } - haplotype_indexer.sample_range.first = parse(temp.substr(0, found)); - haplotype_indexer.sample_range.second = parse(temp.substr(found + 2)) + 1; - } - break; - case 'r': - { - // Parse the rename old=new - string key_value(optarg); - auto found = key_value.find('='); - if (found == string::npos || found == 0 || found + 1 == key_value.size()) { - cerr << "error: [vg index] could not parse rename " << key_value << endl; - exit(1); - } - // Parse out the two parts - string vcf_contig = key_value.substr(0, found); - string graph_contig = 
key_value.substr(found + 1); - // Add the name mapping - haplotype_indexer.path_to_vcf[graph_contig] = vcf_contig; - } - break; - case OPT_RENAME_VARIANTS: - haplotype_indexer.rename_variants = true; - break; - case 'I': - { - // We want to parse this region specifier - string region(optarg); - - Region parsed; - parse_region(region, parsed); - if (parsed.start <= 0 || parsed.end <= 0) { - // We need both range bounds, and we can't accept 0 since input is 1-based. - cerr << "error: [vg index] could not parse 1-based region " << optarg << endl; - } - - // Make sure to correct the coordinates to 0-based exclusive-end, from 1-based inclusive-end - haplotype_indexer.regions[parsed.seq] = make_pair((size_t) (parsed.start - 1), (size_t) parsed.end); - } - break; + // GBWT. The options remain, but they are no longer supported. + case 'v': // Fall through + case 'W': // Fall through + case 'T': // Fall through + case 'M': // Fall through + case 'F': // Fall through + case 'G': // Fall through + case 'z': // Fall through + case 'P': // Fall through + case 'o': // Fall through + case 'B': // Fall through + case 'u': // Fall through + case 'n': // Fall through + case 'R': // Fall through + case 'r': // Fall through + case OPT_RENAME_VARIANTS: // Fall through + case 'I': // Fall through case 'E': - haplotype_indexer.excluded_samples.insert(optarg); + std::cerr << "error: [vg index] GBWT construction options have been removed; use vg gbwt instead" << std::endl; + std::exit(EXIT_FAILURE); break; // GCSA @@ -391,37 +271,11 @@ int main_index(int argc, char** argv) { } - if (xg_name.empty() && gbwt_name.empty() && - gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) { + if (xg_name.empty() && gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) { cerr << "error: [vg index] index type not specified" << endl; return 1; } - if (build_gbwt && thread_source == thread_source_none) { - cerr << "error: [vg index] cannot build GBWT without threads" << endl; - return 1; - } - - if (thread_source != thread_source_none && !build_gbwt) { - cerr << "error: [vg index] no GBWT output specified for the threads" << endl; - return 1; - } - - if (thread_source == thread_source_gam || thread_source == thread_source_gaf) { - for (const auto& name : aln_file_names) { - if (name == "-") { - cerr << "error: [vg index] GAM (-M) and GAF (-F) input files cannot be read from stdin (-)" << endl; - return 1; - } - } - } - - if (thread_source != thread_source_none && file_names.size() != 1) { - cerr << "error: [vg index] exactly one graph required for generating threads" << std::endl; - cerr << "error: [vg index] you may combine the graphs with vg index -x combined.xg --xg-alts" << std::endl; - return 1; - } - if (file_names.size() <= 0 && dbg_names.empty()){ //cerr << "No graph provided for indexing. Please provide a .vg file or GCSA2-format deBruijn graph to index." << endl; //return 1; @@ -481,30 +335,6 @@ int main_index(int argc, char** argv) { vg::io::save_handle_graph(&xg_index, xg_name); } - // Generate threads - if (thread_source != thread_source_none) { - - // Load the only input graph. - unique_ptr path_handle_graph; - path_handle_graph = vg::io::VPKG::load_one(file_names[0]); - - std::unique_ptr gbwt_index(nullptr); - if (thread_source == thread_source_vcf) { - std::vector parse_files = haplotype_indexer.parse_vcf(vcf_name, *path_handle_graph); - path_handle_graph.reset(); // Save memory by deleting the graph. 
- gbwt_index = haplotype_indexer.build_gbwt(parse_files); - } else if (thread_source == thread_source_paths) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph); - } else if (thread_source == thread_source_gam) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAM"); - } else if (thread_source == thread_source_gaf) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAF"); - } - if (build_gbwt && gbwt_index.get() != nullptr) { - save_gbwt(*gbwt_index, gbwt_name, show_progress); - } - } // End of thread indexing. - // Build GCSA if (build_gcsa) { @@ -701,7 +531,7 @@ int main_index(int argc, char** argv) { SnarlDistanceIndex distance_index; //Fill it in - fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit); + fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit, false); // Save it distance_index.serialize(dist_name); } else { diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index fb668d7edd4..bb028949269 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -18,6 +18,7 @@ #include "../gbwt_helper.hpp" #include "../traversal_clusters.hpp" #include "../io/save_handle_graph.hpp" +#include #include #include #include @@ -57,6 +58,8 @@ void help_paths(char** argv) { << " -G, --generic-paths select the generic, non-reference, non-haplotype paths" << endl << " -R, --reference-paths select the reference paths" << endl << " -H, --haplotype-paths select the haplotype paths" << endl + << " configuration:" << endl + << " -o, --overlay apply a ReferencePathOverlayHelper to the graph" << endl << " -t, --threads N number of threads to use [all available]. applies only to snarl finding within -n" << endl; } @@ -122,6 +125,7 @@ int main_paths(int argc, char** argv) { bool coverage = false; const size_t coverage_bins = 10; bool normalize_paths = false; + bool overlay = false; int c; optind = 2; // force optind past command positional argument @@ -151,6 +155,7 @@ int main_paths(int argc, char** argv) { {"reference-paths", no_argument, 0, 'R'}, {"haplotype-paths", no_argument, 0, 'H'}, {"coverage", no_argument, 0, 'c'}, + {"overlay", no_argument, 0, 'o'}, // Hidden options for backward compatibility. {"threads", no_argument, 0, 'T'}, @@ -160,7 +165,7 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drnaGRHp:ct:", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:drnaGRHp:coTq:t:", long_options, &option_index); // Detect the end of the options. @@ -276,6 +281,10 @@ int main_paths(int argc, char** argv) { output_formats++; break; + case 'o': + overlay = true; + break; + case 'T': std::cerr << "warning: [vg paths] option --threads is obsolete and unnecessary" << std::endl; break; @@ -375,11 +384,23 @@ int main_paths(int argc, char** argv) { // Load whatever indexes we were given // Note: during handlification, distinction between -v and -x options disappeared. - unique_ptr<PathHandleGraph> graph; + unique_ptr<PathHandleGraph> path_handle_graph; if (!graph_file.empty()) { // Load the graph - graph = vg::io::VPKG::load_one<PathHandleGraph>(graph_file); + path_handle_graph = vg::io::VPKG::load_one<PathHandleGraph>(graph_file); + } + bdsg::ReferencePathOverlayHelper overlay_helper; + PathHandleGraph* graph = nullptr; + if (path_handle_graph) { + if (overlay) { + // Try to apply the overlay if the user wanted it and this isn't + // already a graph implementing position lookups.
+ graph = overlay_helper.apply(path_handle_graph.get()); + } else { + graph = path_handle_graph.get(); + } } + unique_ptr<gbwt::GBWT> gbwt_index; if (!gbwt_file.empty()) { // We want a gbwt @@ -418,7 +439,7 @@ int main_paths(int argc, char** argv) { if (extract_as_gam || extract_as_gaf) { // Open up a GAM/GAF output stream aln_emitter = vg::io::get_non_hts_alignment_emitter("-", extract_as_gaf ? "GAF" : "GAM", {}, get_thread_count(), - graph.get()); + graph); } else if (extract_as_vg) { // Open up a VG Graph chunk output stream graph_emitter = unique_ptr<vg::io::ProtobufEmitter<Graph>>(new vg::io::ProtobufEmitter<Graph>(cout)); } @@ -589,7 +610,7 @@ int main_paths(int argc, char** argv) { }; if (drop_paths || retain_paths || normalize_paths) { - MutablePathMutableHandleGraph* mutable_graph = dynamic_cast<MutablePathMutableHandleGraph*>(graph.get()); + MutablePathMutableHandleGraph* mutable_graph = dynamic_cast<MutablePathMutableHandleGraph*>(graph); if (!mutable_graph) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; exit(1); } @@ -610,7 +631,7 @@ int main_paths(int argc, char** argv) { for_each_selected_path([&](const path_handle_t& path_handle) { selected_paths.insert(path_handle); }); - merge_equivalent_traversals_in_graph(mutable_graph, selected_paths); + merge_equivalent_traversals_in_graph(mutable_graph, selected_paths, true); } if (!to_destroy.empty()) { mutable_graph->destroy_paths(to_destroy); diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp index 0e59220fae4..515c2112e16 100644 --- a/src/subcommand/rna_main.cpp +++ b/src/subcommand/rna_main.cpp @@ -54,6 +54,7 @@ void help_rna(char** argv) { << "\nOutput options:" << endl << " -b, --write-gbwt FILE write pantranscriptome transcript paths as GBWT index file" << endl + << " -v, --write-hap-gbwt FILE write input haplotypes as a GBWT with node IDs matching the output graph" << endl << " -f, --write-fasta FILE write pantranscriptome transcript sequences as fasta file" << endl << " -i, --write-info FILE write pantranscriptome transcript info table as tsv file" << endl << " -q, --out-exclude-ref exclude reference transcripts from pantranscriptome output" << endl @@ -88,6 +89,7 @@ int32_t main_rna(int32_t argc, char** argv) { bool gbwt_add_bidirectional = false; string fasta_out_filename = ""; string info_out_filename = ""; + string hap_gbwt_out_filename = ""; int32_t num_threads = 1; bool show_progress = false; @@ -110,8 +112,9 @@ int32_t main_rna(int32_t argc, char** argv) { {"remove-non-gene", no_argument, 0, 'd'}, {"do-not-sort", no_argument, 0, 'o'}, {"add-ref-paths", no_argument, 0, 'r'}, - {"add-hap-paths", no_argument, 0, 'a'}, + {"add-hap-paths", no_argument, 0, 'a'}, {"write-gbwt", required_argument, 0, 'b'}, + {"write-hap-gbwt", required_argument, 0, 'v'}, {"write-fasta", required_argument, 0, 'f'}, {"write-info", required_argument, 0, 'i'}, {"out-ref-paths", no_argument, 0, 'u'}, @@ -124,7 +127,7 @@ int32_t main_rna(int32_t argc, char** argv) { }; int32_t option_index = 0; - c = getopt_long(argc, argv, "n:m:y:s:l:zjec:k:dorab:f:i:uqgt:ph?", long_options, &option_index); + c = getopt_long(argc, argv, "n:m:y:s:l:zjec:k:dorab:v:f:i:uqgt:ph?", long_options, &option_index); /* Detect the end of the options.
*/ if (c == -1) break; @@ -188,10 +191,14 @@ int32_t main_rna(int32_t argc, char** argv) { case 'a': add_projected_transcript_paths = true; break; - + case 'b': gbwt_out_filename = optarg; break; + + case 'v': + hap_gbwt_out_filename = optarg; + break; case 'f': fasta_out_filename = optarg; @@ -486,6 +493,19 @@ int32_t main_rna(int32_t argc, char** argv) { gbwt_builder.finish(); save_gbwt(gbwt_builder.index, gbwt_out_filename); } + + // Write a haplotype GBWT with node IDs updated to match the spliced graph. + if (!hap_gbwt_out_filename.empty()) { + if (!haplotype_index.get()) { + cerr << "[vg rna] Warning: not saving updated haplotypes to " << hap_gbwt_out_filename << " because haplotypes were not provided as input" << endl; + } + else { + ofstream hap_gbwt_ostream; + hap_gbwt_ostream.open(hap_gbwt_out_filename); + + haplotype_index->serialize(hap_gbwt_ostream); + } + } // Write transcript sequences in transcriptome as fasta file. if (!fasta_out_filename.empty()) { @@ -496,7 +516,7 @@ int32_t main_rna(int32_t argc, char** argv) { transcriptome.write_transcript_sequences(&fasta_ostream, exclude_reference_transcripts); fasta_ostream.close(); - } + } // Write transcript info in transcriptome as tsv file. if (!info_out_filename.empty()) { diff --git a/src/subcommand/validate_main.cpp b/src/subcommand/validate_main.cpp index 1512aa98ec7..2308f6a2c77 100644 --- a/src/subcommand/validate_main.cpp +++ b/src/subcommand/validate_main.cpp @@ -136,7 +136,7 @@ int main_validate(int argc, char** argv) { if (!gam_only) { VG* vg_graph = dynamic_cast<VG*>(graph.get()); if (vg_graph != nullptr) { - if (!vg_graph->is_valid(true, true, check_orphans, true)) { + if (!vg_graph->is_valid(true, true, true, check_orphans)) { valid_graph = false; } } diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 7e361c84f6a..130b65f69e1 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -2,6 +2,7 @@ #include "traversal_finder.hpp" #include "integrated_snarl_finder.hpp" #include "snarl_distance_index.hpp" +#include "snarls.hpp" //#define debug @@ -379,38 +380,64 @@ static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, } } -void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set<path_handle_t>& selected_paths) { - // compute the snarls - SnarlDistanceIndex distance_index; - { - IntegratedSnarlFinder snarl_finder(*graph); - // todo: why can't I pass in 0 below -- I don't want any distances! - fill_in_distance_index(&distance_index, graph, &snarl_finder, 1); - } +void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set<path_handle_t>& selected_paths, + bool use_snarl_manager) { // only consider embedded paths that span snarl PathTraversalFinder path_trav_finder(*graph); - // do every snarl top-down. this is because it's possible (tho probably rare) for a child snarl to - be redundant after normalizing its parent. don't think the opposite (normalizing child) - causes redundant parent.. todo: can we guarantee?!
- net_handle_t root = distance_index.get_root(); - deque<net_handle_t> queue = {root}; - - while (!queue.empty()) { - net_handle_t net_handle = queue.front(); - queue.pop_front(); - if (distance_index.is_snarl(net_handle)) { - net_handle_t start_bound = distance_index.get_bound(net_handle, false, true); - net_handle_t end_bound = distance_index.get_bound(net_handle, true, false); - handle_t start_handle = distance_index.get_handle(start_bound, graph); - handle_t end_handle = distance_index.get_handle(end_bound, graph); + if (use_snarl_manager) { + // compute the snarls using the old snarl manager + IntegratedSnarlFinder finder(*graph); + SnarlManager snarl_manager(std::move(finder.find_snarls_parallel())); + + deque<const Snarl*> queue; + snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) { + queue.push_back(snarl); + }); + + while (!queue.empty()) { + const Snarl* snarl = queue.front(); + queue.pop_front(); + handle_t start_handle = graph->get_handle(snarl->start().node_id(), snarl->start().backward()); + handle_t end_handle = graph->get_handle(snarl->end().node_id(), snarl->end().backward()); merge_equivalent_traversals_in_snarl(graph, selected_paths, path_trav_finder, start_handle, end_handle); - } - if (net_handle == root || distance_index.is_snarl(net_handle) || distance_index.is_chain(net_handle)) { - distance_index.for_each_child(net_handle, [&](net_handle_t child_handle) { - queue.push_back(child_handle); - }); + const vector<const Snarl*>& children = snarl_manager.children_of(snarl); + for (const Snarl* child : children) { + queue.push_back(child); + } + } + } else { + // compute the snarls using the distance index + // this is what we want to do going forward since it uses the new api, no protobuf etc, + // but unfortunately it seems way slower on some graphs, hence the use_snarl_manager option above + SnarlDistanceIndex distance_index; + { + IntegratedSnarlFinder snarl_finder(*graph); + fill_in_distance_index(&distance_index, graph, &snarl_finder, 0); + } + + // do every snarl top-down. this is because it's possible (tho probably rare) for a child snarl to + // be redundant after normalizing its parent. don't think the opposite (normalizing child) + // causes redundant parent.. todo: can we guarantee?!
+ net_handle_t root = distance_index.get_root(); + deque<net_handle_t> queue = {root}; + + while (!queue.empty()) { + net_handle_t net_handle = queue.front(); + queue.pop_front(); + if (distance_index.is_snarl(net_handle)) { + net_handle_t start_bound = distance_index.get_bound(net_handle, false, true); + net_handle_t end_bound = distance_index.get_bound(net_handle, true, false); + handle_t start_handle = distance_index.get_handle(start_bound, graph); + handle_t end_handle = distance_index.get_handle(end_bound, graph); + merge_equivalent_traversals_in_snarl(graph, selected_paths, path_trav_finder, start_handle, end_handle); + } + if (net_handle == root || distance_index.is_snarl(net_handle) || distance_index.is_chain(net_handle)) { + distance_index.for_each_child(net_handle, [&](net_handle_t child_handle) { + queue.push_back(child_handle); + }); + } } } } diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index 646cbf46409..b35bbb3d730 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -99,6 +99,9 @@ vector> assign_child_snarls_to_traversals(const PathHandleGraph* gra /// Note: this doesn't modify the graph topology, so uncovered nodes and edges as a result of path editing /// would usually need removal with vg clip afterwards /// -void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set<path_handle_t>& selected_paths); +/// the use_snarl_manager toggles between the distance index and the snarl manager for computing snarls +/// (adding this option to (hopefully) temporarily revert to the snarl manager for performance reasons) +void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set<path_handle_t>& selected_paths, + bool use_snarl_manager=false); } diff --git a/src/unittest/chunker.cpp b/src/unittest/chunker.cpp index 91326aa1ca9..24f7d3b645a 100644 --- a/src/unittest/chunker.cpp +++ b/src/unittest/chunker.cpp @@ -186,10 +186,28 @@ TEST_CASE("basic graph chunking", "[chunk]") { VG subgraph; Region out_region; chunker.extract_subgraph(region, 1, 0, false, subgraph, out_region); + + // We include node 4 in the cycle, and also node 3 which is 1 away, so + // we include all the loops around the cycle and need to start the + // extracted path region where it enters node 3 at base 6.
REQUIRE(subgraph.node_count() == 7); REQUIRE(subgraph.edge_count() == 9); + REQUIRE(out_region.start == 6); + + } + + SECTION("Partial graph via cyclic path, 0 expansion") { + + Region region = {"z", 35, 36}; + VG subgraph; + Region out_region; + chunker.extract_subgraph(region, 0, 0, false, subgraph, out_region); + + REQUIRE(subgraph.node_count() == 1); + REQUIRE(subgraph.edge_count() == 0); REQUIRE(out_region.start == 31); + REQUIRE(out_region.end == 36); // End is inclusive } diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index 127c3c28ccb..1450595e496 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -156,7 +156,7 @@ namespace vg { REQUIRE(std::get<2>(traceback.second.back()) == -5); } } - TEST_CASE( "Nested chain with loop", "[snarl_distance]" ) { + TEST_CASE( "Nested chain with loop", "[snarl_distance][bug]" ) { VG graph; @@ -196,9 +196,9 @@ namespace vg { //get the snarls IntegratedSnarlFinder snarl_finder(graph); - SnarlDistanceIndex distance_index; - fill_in_distance_index(&distance_index, &graph, &snarl_finder); SECTION("Traversal of chain") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); net_handle_t chain1_13 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); distance_index.for_each_child(chain1_13, [&](const net_handle_t& child) { if (distance_index.is_node(child)) { @@ -206,6 +206,8 @@ namespace vg { }); } SECTION("Minimum distances are correct") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); net_handle_t node2 = distance_index.get_node_net_handle(n2->id()); net_handle_t chain1_13 = distance_index.get_parent(node2); REQUIRE(distance_index.distance_in_parent(chain1_13, distance_index.flip(node2), distance_index.flip(node2)) == 0); @@ -217,6 +219,8 @@ namespace vg { REQUIRE(distance_index.minimum_distance(n7->id(),false, 0, n8->id(), true, 0) == 1); } SECTION("Paths are correct") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); size_t traversal_i = 0; vector> actual_path; @@ -237,6 +241,8 @@ namespace vg { //REQUIRE(traversal_i == 7); } SECTION("Path that leaves lowest common ancestor") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); size_t traversal_i = 0; vector> actual_path; @@ -260,6 +266,10 @@ namespace vg { //}); //REQUIRE(traversal_i == 8); } + SECTION("Distanceless index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 0); + } } TEST_CASE( "Snarl decomposition can deal with multiple connected components", "[snarl_distance]" ) { @@ -788,9 +798,9 @@ namespace vg { //get the snarls IntegratedSnarlFinder snarl_finder(graph); - SnarlDistanceIndex distance_index; - fill_in_distance_index(&distance_index, &graph, &snarl_finder); SECTION("Traverse the root") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); net_handle_t chain1 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); net_handle_t node2 = distance_index.get_node_net_handle(n2->id()); net_handle_t chain2 = distance_index.get_parent(node2); @@ -806,6 +816,8 @@ namespace vg { REQUIRE(found == 2); } SECTION("into_which_snarl") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); 
REQUIRE((distance_index.into_which_snarl(n4->id(), false) == std::make_tuple(4, false, true) || distance_index.into_which_snarl(n4->id(), false) == std::make_tuple(5, true, true))); REQUIRE((distance_index.into_which_snarl(n5->id(), true) == std::make_tuple(4, false, true) || @@ -815,6 +827,10 @@ namespace vg { REQUIRE((distance_index.into_which_snarl(n4->id(), true) == std::make_tuple(4, true, false) || distance_index.into_which_snarl(n4->id(), true) == std::make_tuple(2, false, false))); } + SECTION("distanceless index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 0); + } } @@ -1840,7 +1856,7 @@ namespace vg { } TEST_CASE( "Snarl decomposition can handle chains with nodes in different directions", - "[snarl_distance][bug]" ) { + "[snarl_distance]" ) { // This graph will have a snarl from 1 to 8, a snarl from 2 to 7, @@ -6787,7 +6803,7 @@ namespace vg { } } //End test case - TEST_CASE("top level chain subgraph", "[snarl_distance][snarl_distance_subgraph][bug]") { + TEST_CASE("top level chain subgraph", "[snarl_distance][snarl_distance_subgraph]") { VG graph; Node* n1 = graph.create_node("GCA"); @@ -6821,8 +6837,6 @@ namespace vg { Edge* e16 = graph.create_edge(n10, n12); Edge* e17 = graph.create_edge(n11, n12); - ofstream out ("test_graph.vg"); - graph.serialize(out); IntegratedSnarlFinder snarl_finder(graph); @@ -6858,9 +6872,6 @@ namespace vg { SnarlDistanceIndex distance_index; fill_in_distance_index(&distance_index, &graph, &snarl_finder); subgraph_in_distance_range(distance_index, path, &graph, 11, 14, sub_graph, true); - for (auto& id : sub_graph) { - cerr << id << endl; - } REQUIRE(!sub_graph.count(3)); REQUIRE(!sub_graph.count(8)); @@ -7054,6 +7065,51 @@ namespace vg { } } + TEST_CASE( "Loop in chain not connected to snarl", "[snarl_distance]" ) { + + VG graph; + + Node* n1 = graph.create_node("GCAA"); + Node* n2 = graph.create_node("GCAA"); + Node* n3 = graph.create_node("AAAT"); + Node* n4 = graph.create_node("T"); + Node* n5 = graph.create_node("G"); + Node* n6 = graph.create_node("AAAA"); + Node* n7 = graph.create_node("GGAA"); + Node* n8 = graph.create_node("TTT"); + + Edge* e1 = graph.create_edge(n1, n2); + Edge* e2 = graph.create_edge(n2, n2, true, false); + Edge* e3 = graph.create_edge(n2, n3); + + Edge* e4 = graph.create_edge(n3, n4); + Edge* e5 = graph.create_edge(n3, n5); + Edge* e6 = graph.create_edge(n4, n6); + Edge* e7 = graph.create_edge(n5, n6); + Edge* e8 = graph.create_edge(n6, n7); + Edge* e9 = graph.create_edge(n6, n6, false, true); + Edge* e10 = graph.create_edge(n7, n8); + + + //get the snarls + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + + + SECTION("Traversal of chain") { + net_handle_t chain1_6 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); + distance_index.for_each_child(chain1_6, [&](const net_handle_t& child) { + assert(distance_index.canonical(distance_index.get_parent(child)) == distance_index.canonical(chain1_6)); + }); + } + SECTION("Minimum distances are correct") { + REQUIRE(distance_index.minimum_distance(n1->id(),false, 0, n1->id(), true, 0) == 30); + } + + } + + TEST_CASE("random test subgraph", "[snarl_distance][snarl_distance_subgraph]") { int64_t min = 20; int64_t max = 50; diff --git a/src/version.cpp b/src/version.cpp index ae5135e3910..72bb4292262 100644 --- a/src/version.cpp +++ b/src/version.cpp @@ -112,7 +112,17 @@ const unordered_map 
Version::codenames = { {"v1.57.0", "Franchini"}, {"v1.58.0", "Cartari"}, {"v1.59.0", "Casatico"}, - {"v1.69.0", "Bologna"} // Reserved + {"v1.60.0", "Annicco"}, + {"v1.61.0", "Plodio"}, + {"v1.62.0", "Ranzano"}, + {"v1.63.0", "Boccaleone"}, + {"v1.64.0", "Vibbiana"}, + {"v1.65.0", "Carfon"}, + {"v1.66.0", "Navetta"}, + {"v1.67.0", "Vetria"}, + {"v1.68.0", "Rimbocchi"}, + {"v1.69.0", "Bologna"}, + {"v1.70.0", "Zebedassi"} // Add more codenames here }; diff --git a/src/vg.cpp b/src/vg.cpp index 48f9ee531bc..985834b5571 100644 --- a/src/vg.cpp +++ b/src/vg.cpp @@ -623,6 +623,7 @@ void VG::destroy_edge(const handle_t& left, const handle_t& right) { } void VG::clear() { + clear_paths(); graph.mutable_node()->Clear(); graph.mutable_edge()->Clear(); clear_indexes(); diff --git a/test/Makefile b/test/Makefile index 5ae913873f1..bdf0526ace7 100644 --- a/test/Makefile +++ b/test/Makefile @@ -20,7 +20,7 @@ $(vg): cd .. && $(MAKE) bin/vg build_graph: build_graph.cpp - cd .. && . ./source_me.sh && $(MAKE) test/build_graph + cd .. && $(MAKE) test/build_graph clean: rm -f build_graph diff --git a/test/t/05_vg_find.t b/test/t/05_vg_find.t index f7c54e36eee..67a51741da1 100644 --- a/test/t/05_vg_find.t +++ b/test/t/05_vg_find.t @@ -112,7 +112,7 @@ rm -f t.xg t.vg t.x:30:35.vg t.x:10:20.vg q.x:30:35.vg q.x:10:20.vg t.bed vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg index -x x.xg x.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -v small/xy2.vcf.gz -o x.gbwt -x x.vg is $(vg find -p x -x x.xg -K 16 -H x.gbwt | cut -f 5 | sort | uniq -c | tail -n 1 | awk '{ print $1 }') 1510 "we find the expected number of kmers with haplotype frequency equal to 2" rm -f x.vg x.xg x.gbwt @@ -124,7 +124,7 @@ is $? 0 "GFA i/o for find -n consistent with converting both ways" # Find nodes that map to the provided ids vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -v small/xy2.vcf.gz -o x.gbwt -x x.vg vg prune -u -m x.mapping -g x.gbwt -e 1 x.vg > x.unfolded.vg rm -f expected.gfa diff --git a/test/t/06_vg_index.t b/test/t/06_vg_index.t index a8ac6add8dc..0e3edf78dbc 100644 --- a/test/t/06_vg_index.t +++ b/test/t/06_vg_index.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="en_US.utf8" # force ekg's favorite sort order -plan tests 54 +plan tests 45 # Single graph without haplotypes vg construct -r small/x.fa -v small/x.vcf.gz > x.vg @@ -39,9 +39,6 @@ rm -f x3.gcsa x3.gcsa.lcp # Single graph with haplotypes vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg -vg index -G x.gbwt -v small/x.vcf.gz x.vg -is $? 0 "building a GBWT index of a graph with haplotypes" - vg index -x x.xg x.vg is $? 0 "building an XG index of a graph with haplotypes" @@ -55,51 +52,25 @@ is $(vg paths -x x-ap.xg -L | wc -l) $(vg paths -v x.vg -L | wc -l) "xg index do vg index -g x.gcsa x.vg is $? 0 "building a GCSA index of a graph with haplotypes" -vg index -x x2.xg -G x2.gbwt -v small/x.vcf.gz -g x2.gcsa x.vg +vg index -x x2.xg -g x2.gcsa x.vg is $? 0 "building all indexes at once" -cmp x.xg x2.xg && cmp x.gbwt x2.gbwt && cmp x.gcsa x2.gcsa && cmp x.gcsa.lcp x2.gcsa.lcp +cmp x.xg x2.xg && cmp x.gcsa x2.gcsa && cmp x.gcsa.lcp x2.gcsa.lcp is $? 0 "the indexes are identical" -vg index -x x2-ap.xg -G x2-ap.gbwt -v small/x.vcf.gz -g x2-ap.gcsa x.vg -L +vg index -x x2-ap.xg -g x2-ap.gcsa x.vg -L is $? 
0 "building all indexes at once, while leaving alt paths in xg" -cmp x.gbwt x2-ap.gbwt && cmp x.gcsa x2-ap.gcsa && cmp x.gcsa.lcp x2-ap.gcsa.lcp +cmp x.gcsa x2-ap.gcsa && cmp x.gcsa.lcp x2-ap.gcsa.lcp is $? 0 "the indexes are identical with -L" is $(vg paths -x x2-ap.xg -L | wc -l) $(vg paths -v x.vg -L | wc -l) "xg index does contains alt paths with index -L all at once" -# Exclude a sample from the GBWT index -vg index -G empty.gbwt -v small/x.vcf.gz --exclude 1 x.vg -is $? 0 "samples can be excluded from haplotype indexing" -is $(vg gbwt -c empty.gbwt) 0 "excluded samples were not included in the GBWT index" - -# Make GBWT from GAM -vg paths -v x.vg -X -Q _alt > x-alts.gam -vg index x.vg -M x-alts.gam -G x-gam.gbwt -# Make GBWT from GAF -vg convert x.vg -G x-alts.gam > x-alts.gaf -vg index x.vg -F x-alts.gaf -G x-gaf.gbwt -cmp x-gaf.gbwt x-gam.gbwt -is $? 0 "GBWT from GAF same as from GAM" - rm -f x.vg -rm -f x.xg x-ap.xg x.gbwtx.gcsa x.gcsa.lcp -rm -f x2.xg x2.gbwt x2.gcsa x2.gcsa.lcp -rm -f x2-ap.xg x2-ap.gbwt x2-ap.gcsa x2-ap.gcsa.lcp -rm -f empty.gbwt -rm -f x-alts.gam x-alts.gaf x-gam.gbwt x-gaf.gbwt - - -# Subregion graph with haplotypes -vg construct -r small/x.fa -v small/x.vcf.gz -a --region x:100-200 > x.part.vg - -vg index -x x.part.xg -G x.part.gbwt --region x:100-200 -v small/x.vcf.gz x.part.vg 2>log.txt -is $? 0 "building GBWT index for a regional graph" - -is "$(cat log.txt | wc -c)" "0" "no warnings about missing variants produced" - -rm -f x.part.vg x.part.xg x.part.gbwt log.txt +rm -f x.xg x-ap.xg x.gcsa x.gcsa.lcp +rm -f x2.xg x2.gcsa x2.gcsa.lcp +rm -f x2-ap.xg x2-ap.gcsa x2-ap.gcsa.lcp +rm -f x-alts.gam x-alts.gaf # Multiple graphs without haplotypes @@ -129,9 +100,6 @@ vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j x.vg y.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg && vg index -G y.gbwt -v small/xy2.vcf.gz y.vg && vg gbwt -m -f -o xy.gbwt x.gbwt y.gbwt -is $? 0 "building a GBWT index of multiple graphs with haplotypes" - vg index -x xy.xg x.vg y.vg is $? 0 "building an XG index of multiple graphs with haplotypes" @@ -144,31 +112,14 @@ is $? 0 "building XG and GCSA indexes at once" vg index -x xy-alt.xg -L x.vg y.vg is $? 0 "building an XG index with alt paths" -vg index -G xy2.gbwt -v small/xy2.vcf.gz xy-alt.xg -is $? 0 "building a GBWT index from an XG index" - -cmp xy.xg xy2.xg && cmp xy.gcsa xy2.gcsa && cmp xy.gcsa.lcp xy2.gcsa.lcp && cmp xy.gbwt xy2.gbwt +cmp xy.xg xy2.xg && cmp xy.gcsa xy2.gcsa && cmp xy.gcsa.lcp xy2.gcsa.lcp is $? 0 "the indexes are identical" rm -f x.vg y.vg -rm -f x.gbwt y.gbwt -rm -f xy.xg xy.gbwt xy.gcsa xy.gcsa.lcp -rm -f xy2.xg xy2.gbwt xy2.gcsa xy2.gcsa.lcp +rm -f xy.xg xy.gcsa xy.gcsa.lcp +rm -f xy2.xg xy2.gcsa xy2.gcsa.lcp rm -f xy-alt.xg - -# GBWT construction options -vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null - -vg index -G x_ref.gbwt -T x.vg -is $? 0 "GBWT can be built for paths" - -rm -f x_ref.gbwt - -# We do not test GBWT construction parameters (-B, -u, -n) because they matter only for large inputs. -# We do not test chromosome-length path generation (-P, -o) for the same reason. - - # Other tests vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg vg index -x x.xg x.vg bogus123.vg 2>/dev/null diff --git a/test/t/07_vg_map.t b/test/t/07_vg_map.t index 08a99edc398..a7c1e41c9a8 100644 --- a/test/t/07_vg_map.t +++ b/test/t/07_vg_map.t @@ -152,7 +152,8 @@ is $? 
1 "error on vg map -f (paired, RHS)" # Now do the GBWT vg construct -a -r small/x.fa -v small/x.vcf.gz >x.vg -vg index -x x.xg -g x.gcsa -v small/x.vcf.gz --gbwt-name x.gbwt -k 16 x.vg +vg index -x x.xg -g x.gcsa -k 16 x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg # This read is all ref which matches no haplotype in x.vcf.gz and visits some unused nodes is "$(vg map -x x.xg -g x.gcsa --gbwt-name x.gbwt --hap-exp 1 --full-l-bonus 0 -f reads/x.unvisited.fq -j | jq -r '.score')" "36" "mapping a read that touches unused nodes gets the base score" diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 36729a7a0ba..e83fa8de659 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -11,7 +11,8 @@ plan tests 26 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg -vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg +vg index -x x.xg x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg # List path/thread names from various input formats is "$(vg paths --list -v x2.vg)" "x" "path listing works from vg" @@ -90,10 +91,10 @@ diff original.fa norm_x4.fa is $? 0 "path normalizer doesnt alter path sequences" # note: x3 is x4 in reverse, so we key on that -grep x3 norm_x2.gfa | awk '{print $3}' > x4.path -grep x3 norm_x2.gfa | awk '{print $3}' >> x4.path -grep x3 norm_x2.gfa | awk '{print $3}'> x4.norm.path -grep x5 norm_x2.gfa | awk '{print $3}' >> x4.norm.path +grep x3 norm_x4.gfa | awk '{print $3}' > x4.path +grep x3 norm_x4.gfa | awk '{print $3}' >> x4.path +grep x3 norm_x4.gfa | awk '{print $3}'> x4.norm.path +grep x5 norm_x4.gfa | awk '{print $3}' >> x4.norm.path diff x4.path x4.norm.path is $? 0 "path normalizere correctly snapped all equivalent paths to x4" diff --git a/test/t/13_vg_sim.t b/test/t/13_vg_sim.t index 0481a82e463..ecc0799eeda 100644 --- a/test/t/13_vg_sim.t +++ b/test/t/13_vg_sim.t @@ -11,7 +11,7 @@ plan tests 36 vg construct -r small/x.fa -v small/x.vcf.gz >x.vg vg construct -r small/x.fa -v small/x.vcf.gz -a >x2.vg vg index -x x.xg x.vg -vg index -G x.gbwt -v small/x.vcf.gz x2.vg +vg gbwt -o x.gbwt -v small/x.vcf.gz -x x2.vg is $(vg sim -l 100 -n 100 -x x.xg | wc -l) 100 \ "vg sim creates the correct number of reads" @@ -58,7 +58,7 @@ rm -f x.vg x2.vg x.xg x.gbwt n.vg n.fa n.xg vg construct -r small/xy.fa -v small/x.vcf.gz -a >xy.vg vg index -x xy.xg xy.vg -vg index -G xy.gbwt -v small/x.vcf.gz xy.vg +vg gbwt -o xy.gbwt -v small/x.vcf.gz -x xy.vg vg sim -s 12345 -n 1000 -l 2 -e 0.1 -x xy.xg -g xy.gbwt --sample-name 1 --any-path >/dev/null is $? 
"0" "Sample simulation works along with --any-path" diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index 6a457f4eb9f..b97cee3fb08 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -89,7 +89,7 @@ is "${REF_COUNT_V}" "${REF_COUNT_A}" "Same number of reference calls with -a as # Output snarl traversals into a GBWT then genotype that vg call HGSVC_alts.xg -k HGSVC_alts.pack -s HG00514 -T | gzip > HGSVC_travs.gaf.gz -vg index HGSVC_alts.xg -F HGSVC_travs.gaf.gz -G HGSVC_travs.gbwt +vg gbwt -o HGSVC_travs.gbwt -x HGSVC_alts.xg -A HGSVC_travs.gaf.gz vg call HGSVC_alts.xg -k HGSVC_alts.pack -g HGSVC_travs.gbwt -s HG00514 > HGSVC_travs.vcf vg call HGSVC_alts.xg -k HGSVC_alts.pack -s HG00514 > HGSVC_direct.vcf # extract the called genotypes diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 03c7f8f3a87..6dc6af6841d 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -90,7 +90,9 @@ printf "P\ty\t1+,3+,5+,6+,8+,9+,11+,12+,9+,10+,12+,14+,15+\t8M,1M,1M,3M,1M,19M,1 vg view -Fv cyclic_tiny.gfa > cyclic_tiny.vg vg index cyclic_tiny.vg -x cyclic_tiny.xg vg find -x cyclic_tiny.xg -n 10 -n 11 -n 12 -n 13 -n 14 -n 15 -c 1 > cycle.vg -vg index cycle.vg -x cycle.xg +# TODO: Make deconstruct see through subpaths to the base path +vg view cycle.vg | sed 's/\([xy]\)\[[-0-9]*\]/\1/g' >cycle-asfullpaths.gfa +vg index cycle-asfullpaths.gfa -x cycle.xg vg deconstruct cycle.xg -p y -e -t 1 > cycle_decon.vcf is $(grep -v "#" cycle_decon.vcf | wc -l) 1 "cyclic reference deconstruction has correct number of variants" grep -v "#" cycle_decon.vcf | grep 20 | awk '{print $1 "\t" $2 "\t" $4 "\t" $5 "\t" $10}' > cycle_decon.tsv @@ -121,7 +123,8 @@ is "$?" 0 "deconstructing vg graph gives same output as xg graph" rm -f tiny_names.gfa tiny_names.vg tiny_names.xg tiny_names_decon.vcf tiny_names_decon_vg.vcf vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg -vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg +vg index -x x.xg x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg vg deconstruct x.xg -g x.gbwt | bgzip > x.decon.vcf.gz tabix -f -p vcf x.decon.vcf.gz cat small/x.fa | bcftools consensus small/x.vcf.gz -s 1 -H 1 > small.s1.h1.fa diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t index 20c8466ea88..48e52ffdb93 100644 --- a/test/t/30_vg_chunk.t +++ b/test/t/30_vg_chunk.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 32 +plan tests 35 # Construct a graph with alt paths so we can make a GBWT and a GBZ vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz -a >x.vg @@ -66,8 +66,8 @@ is $(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq .node | grep id | wc -l # Check that traces work on a GBWT is $(vg chunk -x x.xg -G x.gbwt -r 1:1 -c 2 -T | vg view - -j | jq .node | grep id | wc -l) 5 "id chunker traces correct chunk size" -is "$(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name != "x[0]")' | wc -l)" 0 "chunker extracts no threads from an empty gPBWT" -is "$(vg chunk -x x.xg -G x.haps.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name != "x[0]")' | wc -l)" 2 "chunker extracts 2 local threads from a gBWT with 2 locally distinct threads in it" +is "$(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name | startswith("x[0") | not)' | wc -l)" 0 "chunker extracts no threads from an empty gPBWT" +is "$(vg chunk -x x.xg -G x.haps.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name | startswith("x[0") | not)' | wc -l)" 2 "chunker extracts 2 local 
threads from a gBWT with 2 locally distinct threads in it" is "$(vg chunk -x x.xg -G x.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -r '.path[] | select(.name == "thread_0") | .mapping | length')" 3 "chunker can extract a partial haplotype from a GBWT" is "$(vg chunk -x x.gbz -r 1:1 -c 2 -T | vg view - -j | jq -r '.path[] | select(.name == "thread_0") | .mapping | length')" 3 "chunker can extract a partial haplotype from a GBZ" is "$(vg chunk -x x.gbz -r 1:1 -c 2 -T --no-embedded-haplotypes | vg view - -j | jq -r '.path[] | select(.name == "thread_0") | .mapping | length')" "" "chunker doesn't see haplotypes in the GBZ if asked not to" @@ -126,5 +126,15 @@ is "$?" 0 "components finds subgraphs" rm -f xy.vg x.vg y.vg x_nodes.txt y_nodes.txt convert path_chunk_x.vg convert path_chunk_y.vg pc_x_nodes.txt pc_y_nodes.txt x_paths.txt pc_x_paths.txt components_chunk_0.vg components_chunk_1.vg comp_0_nodes.txt comp_1_nodes.txt comp_nodes.txt nodes.txt x.gam y.gam xy.gam path_chunk_x.gam path_chunk_y.gam +vg gbwt --gbz-format --graph-name graph.gbz --gfa-input graphs/gfa_with_reference.gfa +vg chunk -x graph.gbz -p sample1#1#chr1#0:1-2 -c 1 >part.vg 2>log.txt +grep "out_of_range" log.txt +is "$?" "1" "chunking on a haplotype path does not produce an out of range error" +grep "not found in" log.txt +is "$?" "1" "chunking on a haplotype path does not produce a path not found error" +is "$(vg stats -z part.vg | grep nodes | cut -f2)" "4" "chunking on a haplotype produces the correct size graph" + +rm -f graph.gbz part.vg log.txt + diff --git a/test/t/33_vg_mpmap.t b/test/t/33_vg_mpmap.t index 16af8c4cef9..e569b2b9ed6 100644 --- a/test/t/33_vg_mpmap.t +++ b/test/t/33_vg_mpmap.t @@ -11,7 +11,8 @@ plan tests 21 # Exercise the GBWT # Index a couple of nearly identical contigs vg construct -m 1000 -a -r small/xy.fa -v small/xy2.vcf.gz >xy2.vg -vg index -x xy2.xg -g xy2.gcsa -v small/xy2.vcf.gz --gbwt-name xy2.gbwt -k 16 xy2.vg +vg index -x xy2.xg -g xy2.gcsa -k 16 xy2.vg +vg gbwt -v small/xy2.vcf.gz -o xy2.gbwt -x xy2.vg # We turn off the background model calibration with -B and ignore it with -P 1 diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 6c11196fe7a..a5849bc47a5 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 159 +plan tests 155 # Build vg graphs for two chromosomes @@ -21,10 +21,6 @@ vg index -x xy-alt.xg -L x.vg y.vg # Single chromosome: haplotypes vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz is $? 0 "chromosome X GBWT with vg gbwt" -vg index -G x2.gbwt -v small/xy2.vcf.gz x.vg -is $? 0 "chromosome X GBWT with vg index" -cmp x.gbwt x2.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" vg gbwt -x x.vg -o parse --parse-only -v small/xy2.vcf.gz is $? 0 "chromosome X VCF parse" ../deps/gbwt/bin/build_gbwt -p -r parse_x > /dev/null 2> /dev/null @@ -33,46 +29,38 @@ cmp x.gbwt parse_x.gbwt is $? 
0 "identical construction results with vg gbwt and from VCF parse" # Single chromosome: metadata for haplotypes -is $(vg gbwt -c x.gbwt) 2 "chromosome X: 2 threads" +is $(vg gbwt -c x.gbwt) 2 "chromosome X: 2 paths" is $(vg gbwt -C x.gbwt) 1 "chromosome X: 1 contig" is $(vg gbwt -H x.gbwt) 2 "chromosome X: 2 haplotypes" is $(vg gbwt -S x.gbwt) 1 "chromosome X: 1 sample" -is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 thread names" +is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 path names" is $(vg gbwt -C -L x.gbwt | wc -l) 1 "chromosome X: 1 contig name" is $(vg gbwt -S -L x.gbwt | wc -l) 1 "chromosome X: 1 sample name" -rm -f x.gbwt x2.gbwt parse_x.gbwt +rm -f x.gbwt parse_x.gbwt rm -f parse_x parse_x_0_1 # Single chromosome: paths vg gbwt -E -o x.ref.gbwt -x x.vg is $? 0 "chromosome X reference GBWT with vg gbwt" -vg index -G x2.ref.gbwt -T x.vg -is $? 0 "chromosome X reference GBWT with vg index" -cmp x.ref.gbwt x2.ref.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" -is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 thread" +is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 path" -rm -f x.ref.gbwt x2.ref.gbwt +rm -f x.ref.gbwt # Single chromosome: alignments -vg paths -v x.vg -X -Q _alt > x.alts.gam +vg paths -x x.vg -X -Q _alt > x.alts.gam vg convert -G x.alts.gam x.vg > x.alts.gaf -vg gbwt -A -o x.alts.gaf.gbwt -x x.vg x.alts.gaf +vg gbwt -A --num-jobs 1 -o x.alts.gaf.gbwt -x x.vg x.alts.gaf is $? 0 "chromosome X GAF with vg gbwt" -vg index -F x.alts.gaf -G x2.alts.gaf.gbwt x.vg -is $? 0 "chromosome X GAF with vg index" -cmp x.alts.gaf.gbwt x2.alts.gaf.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" -vg gbwt -A --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam +vg gbwt -A --num-jobs 1 --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam is $? 0 "chromosome X GAM with vg gbwt" cmp x.alts.gaf.gbwt x.alts.gaf.gbwt is $? 0 "identical construction results from GAF and GAM" rm -f x.alts.gam x.alts.gaf -rm -f x.alts.gaf.gbwt x2.alts.gaf.gbwt x.alts.gam.gbwt +rm -f x.alts.gaf.gbwt x.alts.gam.gbwt # Graph region: haplotypes @@ -80,12 +68,8 @@ vg construct -r small/x.fa -v small/x.vcf.gz -a --region x:100-200 > x.part.vg vg gbwt -x x.part.vg -o x.part.gbwt --vcf-region x:100-200 -v small/x.vcf.gz 2> log.txt is $? 0 "chromosome X subgraph GBWT with vg gbwt" is "$(cat log.txt | wc -c)" 0 "no warnings about missing variants" -vg index -G x2.part.gbwt --region x:100-200 -v small/x.vcf.gz x.part.vg 2> log.txt -is $? 0 "chromosome X subgraph GBWT with vg index" -cmp x.part.gbwt x2.part.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" -rm -f x.part.vg x.part.gbwt x2.part.gbwt log.txt +rm -f x.part.vg x.part.gbwt log.txt # Multiple chromosomes: haplotypes @@ -115,7 +99,7 @@ vg gbwt -x xy-alt.xg -o xy.1000gp.gbwt --preset 1000gp -v small/xy2.vcf.gz is $? 
0 "construction preset: 1000gp" # Multiple chromosomes: metadata for haplotypes -is $(vg gbwt -c xy.merge.gbwt) 4 "multiple chromosomes: 4 threads" +is $(vg gbwt -c xy.merge.gbwt) 4 "multiple chromosomes: 4 paths" is $(vg gbwt -C xy.merge.gbwt) 2 "multiple chromosomes: 2 contigs" is $(vg gbwt -H xy.merge.gbwt) 2 "multiple chromosomes: 2 haplotypes" is $(vg gbwt -S xy.merge.gbwt) 1 "multiple chromosomes: 1 sample" @@ -124,19 +108,34 @@ rm -f x.gbwt y.gbwt xy.merge.gbwt xy.fast.gbwt xy.parallel.gbwt xy.direct.gbwt x rm -f xy.1000gp.gbwt +# Multiple chromosomes: alignments +vg paths -x xy-alt.xg -X -Q _alt > xy.alts.gam +vg convert -G xy.alts.gam xy.xg > xy.alts.gaf +vg gbwt -A --num-jobs 1 -o xy.alts.gaf.gbwt -x xy.xg xy.alts.gaf +is $? 0 "multi-chromosome GAF with vg gbwt" +vg gbwt -A --num-jobs 1 --gam-format -o xy.alts.gam.gbwt -x xy.xg xy.alts.gam +is $? 0 "multi-chromosome GAM with vg gbwt" +cmp xy.alts.gaf.gbwt xy.alts.gaf.gbwt +is $? 0 "identical construction results from GAF and GAM" + +vg gbwt -A --num-jobs 2 -o multi.gbwt -x xy.xg xy.alts.gaf +is $? 0 "multi-chromosome GAF with vg gbwt using multiple jobs" +is $(vg gbwt -c xy.alts.gaf.gbwt) 58 "single job: 58 paths" +is $(vg gbwt -c multi.gbwt) 58 "multiple jobs: 58 paths" + +rm -f xy.alts.gam xy.alts.gaf +rm -f xy.alts.gaf.gbwt xy.alts.gam.gbwt multi.gbwt + + # Multiple chromosomes: paths as contigs vg gbwt -E -o xy.contigs.gbwt -x xy.xg is $? 0 "paths as contigs with vg gbwt" -vg index -G xy2.contigs.gbwt -T xy.xg -is $? 0 "paths as contigs with vg index" -cmp xy.contigs.gbwt xy2.contigs.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" -is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 threads" +is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 paths" is $(vg gbwt -C xy.contigs.gbwt) 2 "paths as contigs: 2 contigs" is $(vg gbwt -H xy.contigs.gbwt) 1 "paths as contigs: 1 haplotype" is $(vg gbwt -S xy.contigs.gbwt) 1 "paths as contigs: 1 sample" -rm -f xy.contigs.gbwt xy2.contigs.gbwt +rm -f xy.contigs.gbwt # Build an r-index @@ -194,35 +193,35 @@ rm -f x.gbwt empty.gbwt x2.gbwt vg gbwt -x xy-alt.xg -o xy.gbwt -v small/xy2.vcf.gz vg gbwt -E -o xy.ref.gbwt -x xy.xg vg gbwt -m -o xy.both.gbwt xy.gbwt xy.ref.gbwt -is $(vg gbwt -c xy.both.gbwt) 6 "haplotypes and paths: 6 threads" +is $(vg gbwt -c xy.both.gbwt) 6 "haplotypes and paths: 6 paths" # Remove the reference sample that GBWTs use for paths vg gbwt -R _gbwt_ref -o xy.removed.gbwt xy.both.gbwt is $? 0 "samples can be removed from a GBWT index" -is $(vg gbwt -c xy.removed.gbwt) 4 "haplotypes only: 4 threads" +is $(vg gbwt -c xy.removed.gbwt) 4 "haplotypes only: 4 paths" rm -f xy.gbwt xy.ref.gbwt xy.both.gbwt xy.removed.gbwt # Build a three-sample GBWT from a simple GFA vg gbwt -o all.gbwt -G graphs/three_samples.gfa -is $(vg gbwt -c all.gbwt) 12 "all samples: 12 threads" +is $(vg gbwt -c all.gbwt) 12 "all samples: 12 paths" is $(vg gbwt -H all.gbwt) 6 "all samples: 6 haplotypes" # Remove samples 1 and 3 vg gbwt -R sample1 -R sample3 -o removed.gbwt all.gbwt is $? 0 "multiple samples can be removed from a GBWT index" -is $(vg gbwt -c removed.gbwt) 4 "sample 2: 4 threads" +is $(vg gbwt -c removed.gbwt) 4 "sample 2: 4 paths" is $(vg gbwt -H removed.gbwt) 2 "sample 2: 2 haplotypes" rm -f all.gbwt removed.gbwt -# Extract threads from GBWT +# Extract paths from GBWT vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz vg gbwt -e x.extract x.gbwt -is $? 
0 "threads can be extracted from GBWT" -is $(cat x.extract | wc -c) 121 "correct size for the thread file" +is $? 0 "paths can be extracted from GBWT" +is $(cat x.extract | wc -c) 121 "correct size for the paths file" rm -f x.gbwt x.extract @@ -271,7 +270,7 @@ rm -f extracted.gbwt extracted2.gbwt extracted2.gg vg gbwt -P -n 16 -x xy.xg -g xy.cover.gg -o xy.cover.gbwt is $? 0 "Path cover GBWTGraph construction" is $(md5sum xy.cover.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.cover.gbwt) 32 "path cover: 32 threads" +is $(vg gbwt -c xy.cover.gbwt) 32 "path cover: 32 paths" is $(vg gbwt -C xy.cover.gbwt) 2 "path cover: 2 contigs" is $(vg gbwt -H xy.cover.gbwt) 16 "path cover: 16 haplotypes" is $(vg gbwt -S xy.cover.gbwt) 16 "path cover: 16 samples" @@ -282,10 +281,10 @@ rm -f xy.cover.gg xy.cover.gbwt vg gbwt -P -n 16 -x xy.xg -g xy.cover.gg -o xy.cover.gbwt --pass-paths is $? 0 "Path cover GBWTGraph construction" is $(md5sum xy.cover.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.cover.gbwt) 34 "path cover w/ paths: 34 threads" -is $(vg gbwt -C xy.cover.gbwt) 2 "path cover w/ paths: 2 contigs" -is $(vg gbwt -H xy.cover.gbwt) 17 "path cover w/ paths: 17 haplotypes" -is $(vg gbwt -S xy.cover.gbwt) 17 "path cover w/ paths: 17 samples" +is $(vg gbwt -c xy.cover.gbwt) 34 "path cover w/ reference paths: 34 paths" +is $(vg gbwt -C xy.cover.gbwt) 2 "path cover w/ reference paths: 2 contigs" +is $(vg gbwt -H xy.cover.gbwt) 17 "path cover w/ reference paths: 17 haplotypes" +is $(vg gbwt -S xy.cover.gbwt) 17 "path cover w/ reference paths: 17 samples" rm -f xy.cover.gg xy.cover.gbwt @@ -293,7 +292,7 @@ rm -f xy.cover.gg xy.cover.gbwt vg gbwt -x xy-alt.xg -g xy.local.gg -l -n 16 -o xy.local.gbwt -v small/xy2.vcf.gz is $? 0 "Local haplotypes GBWTGraph construction" is $(md5sum xy.local.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.local.gbwt) 32 "local haplotypes: 32 threads" +is $(vg gbwt -c xy.local.gbwt) 32 "local haplotypes: 32 paths" is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes: 2 contigs" is $(vg gbwt -H xy.local.gbwt) 16 "local haplotypes: 16 haplotypes" is $(vg gbwt -S xy.local.gbwt) 16 "local haplotypes: 16 samples" @@ -314,10 +313,10 @@ rm -f xy.local.gg xy.local.gbwt xy.local.gbz vg gbwt -x xy-alt.xg -g xy.local.gg -l -n 16 -o xy.local.gbwt -v small/xy2.vcf.gz --pass-paths is $? 0 "Local haplotypes GBWTGraph construction" is $(md5sum xy.local.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.local.gbwt) 34 "local haplotypes w/ paths: 34 threads" -is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs" -is $(vg gbwt -H xy.local.gbwt) 17 "local haplotypes w/ paths: 17 haplotypes" -is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ paths: 17 samples" +is $(vg gbwt -c xy.local.gbwt) 34 "local haplotypes w/ reference paths: 34 paths" +is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes w/ reference paths: 2 contigs" +is $(vg gbwt -H xy.local.gbwt) 17 "local haplotypes w/ reference paths: 17 haplotypes" +is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ reference paths: 17 samples" rm -f xy.local.gg xy.local.gbwt @@ -325,11 +324,11 @@ rm -f xy.local.gg xy.local.gbwt vg gbwt -G haplotype-sampling/micb-kir3dl1.gfa -g large.gbz --gbz-format vg gbwt -Z large.gbz -l -n 16 --pass-paths -o large.local.gbwt is $? 
0 "Local haplotypes with reference paths from a larger GBZ" -is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ paths: 36 threads" -is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs" -is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ paths: 18 haplotypes" -is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ paths: 18 samples" -is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ paths: reference_samples set" +is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ reference paths: 36 paths" +is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ reference paths: 2 contigs" +is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 haplotypes" +is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 samples" +is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ reference paths: reference_samples set" rm -f large.gbz large.local.gbwt @@ -339,7 +338,7 @@ vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz vg gbwt -a -n 16 -x xy.xg -g augmented.gg -o augmented.gbwt x.gbwt is $? 0 "Augmented GBWTGraph construction" is $(md5sum augmented.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly" -is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 threads" +is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 paths" is $(vg gbwt -C augmented.gbwt) 2 "augmented: 2 contigs" is $(vg gbwt -H augmented.gbwt) 18 "augmented: 18 haplotypes" is $(vg gbwt -S augmented.gbwt) 17 "augmented: 17 samples" @@ -355,7 +354,7 @@ rm -f x.vg y.vg x.xg xy.xg xy-alt.xg vg gbwt -o gfa.gbwt -G graphs/components_walks.gfa is $? 0 "GBWT construction from GFA" is $(md5sum gfa.gbwt | cut -f 1 -d\ ) 44c27c37c7af6911c26aea2a41008460 "GBWT was serialized correctly" -is $(vg gbwt -c gfa.gbwt) 4 "gfa: 4 threads" +is $(vg gbwt -c gfa.gbwt) 4 "gfa: 4 paths" is $(vg gbwt -C gfa.gbwt) 2 "gfa: 2 contigs" is $(vg gbwt -H gfa.gbwt) 2 "gfa: 2 haplotypes" is $(vg gbwt -S gfa.gbwt) 1 "gfa: 1 sample" @@ -376,7 +375,7 @@ is $(md5sum gfa2.gbz | cut -f 1 -d\ ) ab241a3f79a781a367b701cb8888bf01 "GBZ was # Build GBWT and GBWTGraph from GFA with node chopping vg gbwt -o chopping.gbwt -g chopping.gg --translation chopping.trans --max-node 2 -G graphs/chopping_walks.gfa is $? 0 "GBWT+GBWTGraph construction from GFA with chopping" -is $(vg gbwt -c chopping.gbwt) 3 "chopping: 3 threads" +is $(vg gbwt -c chopping.gbwt) 3 "chopping: 3 paths" is $(vg gbwt -C chopping.gbwt) 1 "chopping: 1 contig" is $(vg gbwt -H chopping.gbwt) 3 "chopping: 3 haplotypes" is $(vg gbwt -S chopping.gbwt) 2 "chopping: 2 samples" @@ -391,7 +390,7 @@ is $(wc -l < from_gbz.trans) 8 "from GBZ: 8 translations" # Build GBWT and GBWTGraph from GFA with both paths and walks vg gbwt -o ref_paths.gbwt -g ref_paths.gg --translation ref_paths.trans -G graphs/components_paths_walks.gfa is $? 
0 "GBWT+GBWTGraph construction from GFA with reference paths" -is $(vg gbwt -c ref_paths.gbwt) 6 "ref paths: 6 threads" +is $(vg gbwt -c ref_paths.gbwt) 6 "ref paths: 6 paths" is $(vg gbwt -C ref_paths.gbwt) 2 "ref paths: 2 contigs" is $(vg gbwt -H ref_paths.gbwt) 3 "ref paths: 3 haplotypes" is $(vg gbwt -S ref_paths.gbwt) 2 "ref paths: 2 samples" diff --git a/test/t/38_vg_prune.t b/test/t/38_vg_prune.t index 3cd5db9cde0..1aa7533c260 100644 --- a/test/t/38_vg_prune.t +++ b/test/t/38_vg_prune.t @@ -10,7 +10,7 @@ plan tests 21 # Build a graph with one path and two threads vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -o x.gbwt -v small/xy2.vcf.gz -x x.vg # Basic pruning: 5 components, 51 nodes, 51 edges vg prune -e 1 x.vg > y.vg @@ -54,8 +54,8 @@ rm -f x.vg x.gbwt vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j -m xy.mapping x.vg y.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg -vg index -G y.gbwt -v small/xy2.vcf.gz y.vg +vg gbwt -o x.gbwt -v small/xy2.vcf.gz -x x.vg +vg gbwt -o y.gbwt -v small/xy2.vcf.gz -x y.vg # Prune a single-chromosome graph using multi-chromosome GBWT vg gbwt -m -o xy.gbwt x.gbwt y.gbwt diff --git a/test/t/46_vg_minimizer.t b/test/t/46_vg_minimizer.t index 06f9133f09d..e4090b54c9d 100644 --- a/test/t/46_vg_minimizer.t +++ b/test/t/46_vg_minimizer.t @@ -62,8 +62,10 @@ rm -f x.vg x.xg x.gbwt x.snarls x.dist x.mi x.gg x.gbz vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j x.vg y.vg -vg index -x x.xg -G x.gbwt -v small/xy2.vcf.gz x.vg -vg index -x y.xg -G y.gbwt -v small/xy2.vcf.gz y.vg +vg index -x x.xg x.vg +vg gbwt -o x.gbwt -x x.vg -v small/xy2.vcf.gz +vg index -x y.xg y.vg +vg gbwt -o y.gbwt -x y.vg -v small/xy2.vcf.gz # Appending to the index vg minimizer --no-dist -t 1 -o x.mi -g x.gbwt x.xg diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 4526dac20aa..c48ddbcf553 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0 # Should we show stdout and stderr from tests? If so, set to "-s". SHOW_OPT="" # What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@c9bd6414f935e6095574a41a34addbb8d87b41a6" +TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@45782c7ba5a372e7c3587ac1c63f4895176fc828" # What toil should we install? # Could be something like "toil[aws,mesos]==3.20.0" # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]" @@ -252,7 +252,6 @@ then if [ "${LOCAL_BUILD}" == "1" ] then # Just build vg here - . ./source_me.sh make -j ${NUM_CORES} if [ "$?" -ne 0 ]