From b093eeace4832ecabc85d43b70f84d495a674988 Mon Sep 17 00:00:00 2001 From: kojix2 <2xijok@gmail.com> Date: Thu, 12 Oct 2023 11:44:56 +0900 Subject: [PATCH 01/54] Update elfutils submodule URL --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 6b6337be74d..f1fb6d32eef 100644 --- a/.gitmodules +++ b/.gitmodules @@ -63,7 +63,7 @@ url = https://github.com/adamnovak/backward-cpp.git [submodule "deps/elfutils"] path = deps/elfutils - url = git://sourceware.org/git/elfutils.git + url = https://sourceware.org/git/elfutils.git [submodule "deps/structures"] path = deps/structures url = https://github.com/vgteam/structures.git From 9a6206e4667167257ec918802e89675e5aaebd5d Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Tue, 13 Aug 2024 14:24:09 +0300 Subject: [PATCH 02/54] Better error handling in vg gbwt --- src/subcommand/gbwt_main.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index db1175b426e..b22f0c0ce8e 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -999,9 +999,11 @@ void validate_gbwt_config(GBWTConfig& config) { if (!config.to_remove.empty()) { if (config.build == GBWTConfig::build_gbz) { std::cerr << "error: [vg gbwt] the GBWT extracted from GBZ cannot have paths modified" << std::endl; + std::exit(EXIT_FAILURE); } if (config.build == GBWTConfig::build_gbwtgraph) { std::cerr << "error: [vg gbwt] the GBWT loaded with a GBWTGraph cannot have paths modified" << std::endl; + std::exit(EXIT_FAILURE); } if (!(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none) || !has_gbwt_output) { std::cerr << "error: [vg gbwt] removing a sample requires one input GBWT and output GBWT" << std::endl; From 25716504b69c7b357f5a61ad4934aba695244d62 Mon Sep 17 00:00:00 2001 From: Xian Date: Wed, 11 Sep 2024 17:58:34 +0200 Subject: [PATCH 03/54] Add unit test for a reverse loop in a chain without a 
snarl followed by a snarl and the bdsg change to fix it --- deps/libbdsg | 2 +- src/unittest/snarl_distance_index.cpp | 54 +++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index 757896a678e..c8a10cb4cb9 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 757896a678eb7af7215147ef38cdb9315775dfe0 +Subproject commit c8a10cb4cb96c74093d1dc2883f2f4379a622b4e diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index 15ef5388edc..ca30acb6511 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -1782,7 +1782,7 @@ namespace vg { } TEST_CASE( "Snarl decomposition can handle chains with nodes in different directions", - "[snarl_distance][bug]" ) { + "[snarl_distance]" ) { // This graph will have a snarl from 1 to 8, a snarl from 2 to 7, @@ -6678,7 +6678,7 @@ namespace vg { } } //End test case - TEST_CASE("top level chain subgraph", "[snarl_distance][snarl_distance_subgraph][bug]") { + TEST_CASE("top level chain subgraph", "[snarl_distance][snarl_distance_subgraph]") { VG graph; Node* n1 = graph.create_node("GCA"); @@ -6712,8 +6712,6 @@ namespace vg { Edge* e16 = graph.create_edge(n10, n12); Edge* e17 = graph.create_edge(n11, n12); - ofstream out ("test_graph.vg"); - graph.serialize(out); IntegratedSnarlFinder snarl_finder(graph); @@ -6749,9 +6747,6 @@ namespace vg { SnarlDistanceIndex distance_index; fill_in_distance_index(&distance_index, &graph, &snarl_finder); subgraph_in_distance_range(distance_index, path, &graph, 11, 14, sub_graph, true); - for (auto& id : sub_graph) { - cerr << id << endl; - } REQUIRE(!sub_graph.count(3)); REQUIRE(!sub_graph.count(8)); @@ -6876,6 +6871,51 @@ namespace vg { }//end test case + TEST_CASE( "Loop in chain not connected to snarl", "[snarl_distance]" ) { + + VG graph; + + Node* n1 = graph.create_node("GCAA"); + Node* n2 = graph.create_node("GCAA"); + Node* n3 = 
graph.create_node("AAAT"); + Node* n4 = graph.create_node("T"); + Node* n5 = graph.create_node("G"); + Node* n6 = graph.create_node("AAAA"); + Node* n7 = graph.create_node("GGAA"); + Node* n8 = graph.create_node("TTT"); + + Edge* e1 = graph.create_edge(n1, n2); + Edge* e2 = graph.create_edge(n2, n2, true, false); + Edge* e3 = graph.create_edge(n2, n3); + + Edge* e4 = graph.create_edge(n3, n4); + Edge* e5 = graph.create_edge(n3, n5); + Edge* e6 = graph.create_edge(n4, n6); + Edge* e7 = graph.create_edge(n5, n6); + Edge* e8 = graph.create_edge(n6, n7); + Edge* e9 = graph.create_edge(n6, n6, false, true); + Edge* e10 = graph.create_edge(n7, n8); + + + //get the snarls + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + + + SECTION("Traversal of chain") { + net_handle_t chain1_6 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); + distance_index.for_each_child(chain1_6, [&](const net_handle_t& child) { + assert(distance_index.get_parent(child) == chain1_6); + }); + } + SECTION("Minimum distances are correct") { + REQUIRE(distance_index.minimum_distance(n1->id(),false, 0, n1->id(), true, 0) == 30); + } + + } + + TEST_CASE("random test subgraph", "[snarl_distance][snarl_distance_subgraph]") { int64_t min = 20; int64_t max = 50; From 7657aa38eed73b676639d8ac44e0d150598d5369 Mon Sep 17 00:00:00 2001 From: Xian Date: Sat, 1 Jul 2023 18:30:37 +0200 Subject: [PATCH 04/54] Topologically-ish sort children of a snarl and make their ranks the order of the sort --- src/snarl_distance_index.cpp | 198 ++++++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 5 deletions(-) diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index c0a950838bc..f4338f58e76 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -789,14 +789,202 @@ void populate_snarl_index( return curr_index; }; + //TODO: Copying the list + 
vector> all_children = temp_snarl_record.children; + + /* + * Do a topological sort of the children and re-assign ranks based on the sort + * TODO: Snarls aren't guaranteed to be DAGs, so ideally this will be a sort + * that minimizes back edges and the number of times a node is traversed backwards + * For now though, just do a topological sort and don't take any loops or reversing edges + */ + if (!temp_snarl_record.is_root_snarl) { + + //This will hold the new order of the children. Each value is an index into all_children, which + //matches the ranks(-2) of the children + vector topological_sort_order; + topological_sort_order.reserve(all_children.size()); + + // This holds everything in the topological order, to check which nodes (and therefore edges) + // have already been added + // Unlike the topological order, this stores the orientation as well. + // Each node is only added once to the topological order, but the reverse orientation + // may still be traversed to ensure that all nodes are found + unordered_set> visited_nodes; + visited_nodes.reserve(all_children.size()); + + //All nodes that have no incoming edges + vector> source_nodes; + + /* Add all sources. This will start out as the start node and any tips or nodes that + are only reachable from the end node + */ + //unordered_set> children_seen_from_start; + //vector> dfs_stack_from_start + + //// Look for tips and loops from the end node that never reach the start node + //vector> dfs_stack_from_end; + //dfs_stack_from_end.emplace_back(std::numeric_limits::max(), false); //To indicate end node + //while (dfs_stack_from_end.size() != 0) { + // // Go through all nodes from the end and search for anything that is a tip or that loops without + // // reaching anything seen on from the start + //} + + //Add max() to indicate that we start at the start node, since the start node doesn't actually have a + //rank. 
This gets added last so it is traversed first + source_nodes.emplace_back(std::numeric_limits::max(), false); + + //We'll be done sorting when everything is in the sorted vector + while (!source_nodes.empty()) { + + //Pick a child with no incoming edges + pair current_child_index = source_nodes.back(); + source_nodes.pop_back(); + + //Mark it as being visited + assert(visited_nodes.count(current_child_index) == 0); + visited_nodes.emplace(current_child_index); + + //Get the graph handle for that child, pointing out from the end of the chain + handle_t current_graph_handle; + if (current_child_index.first == std::numeric_limits::max()) { + //If the current child is the start bound, then get the start node pointing in + current_graph_handle = graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev); + } else { + pair current_index = all_children[current_child_index.first]; + if (current_index.first == SnarlDistanceIndex::TEMP_NODE) { + //If the current child is a node, then get the node pointing in the correct direction + current_graph_handle = graph->get_handle(current_index.second, current_child_index.second); + } else if (current_child_index.second) { + //If the current child is a chain, and we're traversing the chain backwards + current_graph_handle = graph->get_handle(temp_index.temp_chain_records[current_index.second].start_node_id, + !temp_index.temp_chain_records[current_index.second].start_node_rev); + } else { + //Otherwise, the current child is a chain and we're traversing the chain forwards + current_graph_handle = graph->get_handle(temp_index.temp_chain_records[current_index.second].end_node_id, + temp_index.temp_chain_records[current_index.second].end_node_rev); + } + } + + //Add everything reachable from the start boundary node that has no other incoming edges + graph->follow_edges(current_graph_handle, false, [&](const handle_t next_handle) { +#ifdef debug_distance_indexing + cerr << "Following forward edges from " << 
graph->get_id(current_graph_handle) << " to " << graph->get_id(next_handle) << endl; +#endif + if (graph->get_id(next_handle) == temp_snarl_record.start_node_id || + graph->get_id(next_handle) == temp_snarl_record.end_node_id) { + //If this is trying to leave the snarl, skip it + return true; + } + //Check the next_handle going in the other direction, to see if it could be a new source node. + //If it reaches anything unseen, then it can't be a source node - /*Now go through each of the children and add distances from that child to everything reachable from it + //Get the index of next_handle + pair next_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle))); + size_t next_rank = next_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(next_index.second-temp_index.min_node_id).rank_in_parent + : temp_index.temp_chain_records[next_index.second].rank_in_parent; + assert(all_children[next_rank-2] == next_index); + bool next_rev = next_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.temp_chain_records[next_index.second].is_trivial + ? graph->get_is_reverse(next_handle) + : graph->get_id(next_handle) == temp_index.temp_chain_records[next_index.second].end_node_id; + if (visited_nodes.count(make_pair(next_rank, next_rev)) != 0) { + //If this is a loop, just skip it + return true; + } + + //Get the handle from the child represented by next_handle going the other way + handle_t reverse_handle = next_index.first == SnarlDistanceIndex::TEMP_NODE ? + graph->get_handle(next_index.second, !next_rev) : + (next_rev ? graph->get_handle(temp_index.temp_chain_records[next_index.second].end_node_id, + temp_index.temp_chain_records[next_index.second].end_node_rev) + : graph->get_handle(temp_index.temp_chain_records[next_index.second].start_node_id, + !temp_index.temp_chain_records[next_index.second].start_node_rev)); + + //Does this have no unseen incoming edges? 
Check as we go through incoming edges + bool is_source = true; + + //Does this have no unseen incoming edges but including nodes we've seen in the other direction? + //TODO: Actually do this + graph->follow_edges(reverse_handle, false, [&](const handle_t incoming_handle) { +#ifdef debug_distance_indexing + cerr << "Getting backwards edge to " << graph->get_id(incoming_handle) << endl; +#endif + if (graph->get_id(incoming_handle) == temp_snarl_record.start_node_id || + graph->get_id(incoming_handle) == temp_snarl_record.end_node_id) { + //If this is trying to leave the snarl + return true; + } + //The index of the snarl's child that next_handle represents + pair incoming_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(incoming_handle))); + size_t incoming_rank = incoming_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(incoming_index.second-temp_index.min_node_id).rank_in_parent + : temp_index.temp_chain_records[incoming_index.second].rank_in_parent; + + bool incoming_rev = incoming_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.temp_chain_records[incoming_index.second].is_trivial + ? 
graph->get_is_reverse(incoming_handle) + : graph->get_id(incoming_handle) == temp_index.temp_chain_records[incoming_index.second].end_node_id; + //subtract 2 to get the index from the rank + assert(incoming_rank >= 2); + incoming_rank-=2; + + //If we haven't seen the incoming node before, then this isn't a source so we break out of + //the loop and keep going + if (visited_nodes.count(std::make_pair(incoming_rank, !incoming_rev)) == 0) { + is_source = false; + } + //Keep going + return true; + }); + if (is_source) { + //If this is a new source node, then add it as a source node + + //subtract 2 to get the index from the rank + assert(next_rank >= 2); + next_rank-=2; + source_nodes.emplace_back(next_rank, next_rev); + } + return true; + }); + if (current_child_index.first != std::numeric_limits::max() && + visited_nodes.count(make_pair(current_child_index.first, !current_child_index.second)) == 0) { + //If this node wasn't already added in the other direction, add it to the topological sort + topological_sort_order.emplace_back(current_child_index.first); + } + } + + //TODO: Do this properly + // For now, we only really want a topological ordering of DAGs, and I'm going to ignore tips + // So if anything is only reachable from the end node, then add it in an arbitrary order + vector check_ranks (all_children.size(), false); + for (size_t x : topological_sort_order) { + check_ranks[x] = true; + } + //If anything wasn't in the topological order, add it now + for (size_t i = 0 ; i < check_ranks.size() ; i++) { + if (!check_ranks[i]) { + topological_sort_order.emplace_back(i); + } + } + assert(topological_sort_order.size() == all_children.size()); + + + //We've finished doing to topological sort, so update every child's rank to be the new order + for (size_t new_rank = 0 ; new_rank < topological_sort_order.size() ; new_rank++) { + size_t old_rank = topological_sort_order[new_rank]; + if (all_children[old_rank].first == SnarlDistanceIndex::TEMP_NODE) { + 
temp_index.temp_node_records.at(all_children[old_rank].second-temp_index.min_node_id).rank_in_parent = new_rank+2; + } else { + temp_index.temp_chain_records[all_children[old_rank].second].rank_in_parent = new_rank+2; + } + } + } + + /* + * Now go through each of the children and add distances from that child to everything reachable from it * Start a dijkstra traversal from each node side in the snarl and record all distances */ - //Add the start and end nodes to the list of children so that we include them in the traversal - //TODO: Copying the list - vector> all_children = temp_snarl_record.children; //Reserve enough space to store all possible distances temp_snarl_record.distances.reserve( (temp_snarl_record.node_count > size_limit || size_limit == 0) @@ -807,8 +995,8 @@ void populate_snarl_index( temp_index.use_oversized_snarls = true; } + //Add the start and end nodes to the list of children so that we include them in the traversal if (!temp_snarl_record.is_root_snarl) { - all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.start_node_id); all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.end_node_id); } From 26387cc341304e53336295a76737a67d6f0ef7e0 Mon Sep 17 00:00:00 2001 From: Xian Date: Sat, 1 Jul 2023 20:07:27 +0200 Subject: [PATCH 05/54] Topologically sort from the start of a snarl relative to the top-level chain --- src/snarl_distance_index.cpp | 84 +++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index f4338f58e76..395d0740a69 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -767,7 +767,8 @@ void populate_snarl_index( /*Helper function to find the ancestor of a node that is a child of this snarl */ - auto get_ancestor_of_node = [&](pair curr_index) { + auto get_ancestor_of_node = [&](pair curr_index, + pair ancestor_snarl_index) { //This is a child that isn't a node, 
so it must be a chain if (curr_index.second == temp_snarl_record.start_node_id || @@ -777,7 +778,7 @@ void populate_snarl_index( //Otherwise, walk up until we hit the current snarl pair parent_index = temp_index.temp_node_records.at(curr_index.second-temp_index.min_node_id).parent; - while (parent_index != snarl_index) { + while (parent_index != ancestor_snarl_index) { curr_index=parent_index; parent_index = parent_index.first == SnarlDistanceIndex::TEMP_SNARL ? temp_index.temp_snarl_records.at(parent_index.second).parent : temp_index.temp_chain_records.at(parent_index.second).parent; @@ -800,6 +801,74 @@ void populate_snarl_index( */ if (!temp_snarl_record.is_root_snarl) { + //Is this snarl reversed relative to the top-level chain? + bool is_reversed = false; + // Walk up the snarl tree and if anything is reversed (or a chain is only reachable backwards in its parent) + // then flip is_reversed + // Since we don't have distances in snarl ancestors yet, walk out the fronts of chains and see if + // we hit the snarl start or end + pair current_index = snarl_index; + while (current_index.first != SnarlDistanceIndex::TEMP_ROOT) { + + //Get the parent of the current index + pair parent_index = + current_index.first == SnarlDistanceIndex::TEMP_SNARL ? 
temp_index.temp_snarl_records.at(current_index.second).parent + : temp_index.temp_chain_records.at(current_index.second).parent; + if (parent_index.first == SnarlDistanceIndex::TEMP_SNARL) { + //If the parent is a snarl, then walk out the front of the chain and see if it reaches the start of the ancestor snarl + vector to_check; + unordered_set seen; + to_check.emplace_back(graph->get_handle(temp_index.temp_chain_records[current_index.second].start_node_id, + !temp_index.temp_chain_records[current_index.second].start_node_rev)); + seen.emplace(to_check.back()); + bool reaches_start = false; + while (!to_check.empty()) { + handle_t current_handle = to_check.back(); + to_check.pop_back(); + graph->follow_edges(current_handle, false, [&](const handle_t next_handle) { + if (seen.count(next_handle) == 0) { + if (graph->get_id(next_handle) == temp_index.temp_snarl_records[parent_index.second].start_node_id) { + //If this reached the start node, then we consider the chain to be oriented forward + // so we can stop + reaches_start = true; + //Stop iterating + return false; + } else if (graph->get_id(next_handle) != temp_index.temp_snarl_records[parent_index.second].end_node_id) { + //If this isn't leaving the snarl, then continue traversing + //We need to jump to the end of the current chain + + //First, find the temp_chain_record for the chain we just entered + pair next_index = + get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), parent_index); + + to_check.emplace_back( next_index.first == SnarlDistanceIndex::TEMP_NODE + ? next_handle : + (graph->get_id(next_handle) == temp_index.temp_chain_records[next_index.second].start_node_id + ? 
graph->get_handle(temp_index.temp_chain_records[next_index.second].end_node_id, + temp_index.temp_chain_records[next_index.second].end_node_rev) + : graph->get_handle(temp_index.temp_chain_records[next_index.second].start_node_id, + !temp_index.temp_chain_records[next_index.second].start_node_rev))); + + } + seen.emplace(next_handle); + } + return true; + }); + if (!reaches_start) { + //If we couldn't reach the start of the parent from the start of the child, then assume the child + //was reversed + is_reversed = !is_reversed; + } + } + } + current_index=parent_index; + } + + //Where do we start the topological sort? The start or end bound + handle_t topological_sort_start = is_reversed ? graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev) + : graph->get_handle(temp_snarl_record.end_node_id,!temp_snarl_record.end_node_rev); + + //This will hold the new order of the children. Each value is an index into all_children, which //matches the ranks(-2) of the children vector topological_sort_order; @@ -849,7 +918,7 @@ void populate_snarl_index( handle_t current_graph_handle; if (current_child_index.first == std::numeric_limits::max()) { //If the current child is the start bound, then get the start node pointing in - current_graph_handle = graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev); + current_graph_handle = topological_sort_start; } else { pair current_index = all_children[current_child_index.first]; if (current_index.first == SnarlDistanceIndex::TEMP_NODE) { @@ -880,7 +949,8 @@ void populate_snarl_index( //If it reaches anything unseen, then it can't be a source node //Get the index of next_handle - pair next_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle))); + pair next_index = + get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), snarl_index); size_t next_rank = next_index.first == SnarlDistanceIndex::TEMP_NODE 
? temp_index.temp_node_records.at(next_index.second-temp_index.min_node_id).rank_in_parent : temp_index.temp_chain_records[next_index.second].rank_in_parent; @@ -916,7 +986,8 @@ void populate_snarl_index( return true; } //The index of the snarl's child that next_handle represents - pair incoming_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(incoming_handle))); + pair incoming_index = + get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(incoming_handle)), snarl_index); size_t incoming_rank = incoming_index.first == SnarlDistanceIndex::TEMP_NODE ? temp_index.temp_node_records.at(incoming_index.second-temp_index.min_node_id).rank_in_parent : temp_index.temp_chain_records[incoming_index.second].rank_in_parent; @@ -1144,7 +1215,8 @@ void populate_snarl_index( auto& node_record = temp_index.temp_node_records.at(graph->get_id(next_handle)-temp_index.min_node_id); //The index of the snarl's child that next_handle represents - pair next_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle))); + pair next_index = + get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), snarl_index); bool next_is_tip = start_index.first == SnarlDistanceIndex::TEMP_NODE ? 
temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).is_tip From c5ff4208eb7b03e0f063d3a163c06aadd8a12ed8 Mon Sep 17 00:00:00 2001 From: Xian Date: Sat, 1 Jul 2023 22:15:06 +0200 Subject: [PATCH 06/54] Fix orientation of topological sort --- src/snarl_distance_index.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 395d0740a69..4a0f95e8786 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -863,10 +863,15 @@ void populate_snarl_index( } current_index=parent_index; } +#ifdef debug_distance_indexing + if (is_reversed) { + cerr << "\tsnarl is reversed relative to the top-level chain" << endl; + } +#endif //Where do we start the topological sort? The start or end bound - handle_t topological_sort_start = is_reversed ? graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev) - : graph->get_handle(temp_snarl_record.end_node_id,!temp_snarl_record.end_node_rev); + handle_t topological_sort_start = is_reversed ? graph->get_handle(temp_snarl_record.end_node_id,!temp_snarl_record.end_node_rev) + : graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev); //This will hold the new order of the children. 
Each value is an index into all_children, which From f335c9ca544849c1a51b0edc98906cb6317b59c7 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 12 Sep 2024 09:13:36 +0200 Subject: [PATCH 07/54] Fix unit test --- src/unittest/snarl_distance_index.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index ca30acb6511..fc7b808bd35 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -6906,7 +6906,7 @@ namespace vg { SECTION("Traversal of chain") { net_handle_t chain1_6 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); distance_index.for_each_child(chain1_6, [&](const net_handle_t& child) { - assert(distance_index.get_parent(child) == chain1_6); + assert(distance_index.canonical(distance_index.get_parent(child)) == distance_index.canonical(chain1_6)); }); } SECTION("Minimum distances are correct") { From eb2ba80a4f76660804b3b9e7d7d6eb97b10e2b3e Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 12 Sep 2024 09:43:59 +0200 Subject: [PATCH 08/54] Silence warnings for oversized snarls --- src/snarl_distance_index.cpp | 4 ++-- src/snarl_distance_index.hpp | 2 +- src/subcommand/index_main.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 4a0f95e8786..d60cd780356 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -20,13 +20,13 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po get_id(pos2), get_is_rev(pos2), get_offset(pos2)); } -void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit) { +void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool silence_warnings) { 
distance_index->set_snarl_size_limit(size_limit); //Build the temporary distance index from the graph SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit); - if (temp_index.use_oversized_snarls) { + if (!silence_warnings && temp_index.use_oversized_snarls) { cerr << "warning: distance index uses oversized snarls, which may make mapping slow" << endl; cerr << "\ttry increasing --snarl-limit when building the distance index" << endl; } diff --git a/src/snarl_distance_index.hpp b/src/snarl_distance_index.hpp index 33f3c6c8490..13e8777301c 100644 --- a/src/snarl_distance_index.hpp +++ b/src/snarl_distance_index.hpp @@ -22,7 +22,7 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po //Fill in the index //size_limit is a limit on the number of nodes in a snarl, after which the index won't store pairwise distances -void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000); +void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000, bool silence_warnings=true); //Fill in the temporary snarl record with distances void populate_snarl_index(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 3ffb69dc70a..26a761162f2 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -701,7 +701,7 @@ int main_index(int argc, char** argv) { SnarlDistanceIndex distance_index; //Fill it in - fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit); + fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit, false); // Save it distance_index.serialize(dist_name); } else { From 22039a119a4d7dc21e47b29ae3365d6310397459 Mon Sep 17 00:00:00 2001 From: 
Xian Date: Thu, 12 Sep 2024 11:52:14 +0200 Subject: [PATCH 09/54] Update bdsg to fix distanceless distance indexing --- deps/libbdsg | 2 +- src/unittest/snarl_distance_index.cpp | 26 +++++++++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index c8a10cb4cb9..33e2fc213eb 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit c8a10cb4cb96c74093d1dc2883f2f4379a622b4e +Subproject commit 33e2fc213eb492a0db38a30ecc1b3b4e3146e769 diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index fc7b808bd35..5067b31040c 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -150,7 +150,7 @@ namespace vg { REQUIRE(std::get<2>(traceback.second.back()) == -5); } } - TEST_CASE( "Nested chain with loop", "[snarl_distance]" ) { + TEST_CASE( "Nested chain with loop", "[snarl_distance][bug]" ) { VG graph; @@ -190,9 +190,9 @@ namespace vg { //get the snarls IntegratedSnarlFinder snarl_finder(graph); - SnarlDistanceIndex distance_index; - fill_in_distance_index(&distance_index, &graph, &snarl_finder); SECTION("Traversal of chain") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); net_handle_t chain1_13 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); distance_index.for_each_child(chain1_13, [&](const net_handle_t& child) { if (distance_index.is_node(child)) { @@ -200,6 +200,8 @@ namespace vg { }); } SECTION("Minimum distances are correct") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); net_handle_t node2 = distance_index.get_node_net_handle(n2->id()); net_handle_t chain1_13 = distance_index.get_parent(node2); REQUIRE(distance_index.distance_in_parent(chain1_13, distance_index.flip(node2), distance_index.flip(node2)) == 0); @@ -211,6 +213,8 @@ namespace vg { 
REQUIRE(distance_index.minimum_distance(n7->id(),false, 0, n8->id(), true, 0) == 1); } SECTION("Paths are correct") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); size_t traversal_i = 0; vector> actual_path; @@ -231,6 +235,8 @@ namespace vg { //REQUIRE(traversal_i == 7); } SECTION("Path that leaves lowest common ancestor") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); size_t traversal_i = 0; vector> actual_path; @@ -254,6 +260,10 @@ namespace vg { //}); //REQUIRE(traversal_i == 8); } + SECTION("Distanceless index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 0); + } } TEST_CASE( "Snarl decomposition can deal with multiple connected components", "[snarl_distance]" ) { @@ -730,9 +740,9 @@ namespace vg { //get the snarls IntegratedSnarlFinder snarl_finder(graph); - SnarlDistanceIndex distance_index; - fill_in_distance_index(&distance_index, &graph, &snarl_finder); SECTION("Traverse the root") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); net_handle_t chain1 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); net_handle_t node2 = distance_index.get_node_net_handle(n2->id()); net_handle_t chain2 = distance_index.get_parent(node2); @@ -748,6 +758,8 @@ namespace vg { REQUIRE(found == 2); } SECTION("into_which_snarl") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); REQUIRE((distance_index.into_which_snarl(n4->id(), false) == std::make_tuple(4, false, true) || distance_index.into_which_snarl(n4->id(), false) == std::make_tuple(5, true, true))); REQUIRE((distance_index.into_which_snarl(n5->id(), true) == std::make_tuple(4, false, true) || @@ -757,6 +769,10 @@ namespace vg { REQUIRE((distance_index.into_which_snarl(n4->id(), true) == std::make_tuple(4, true, false) 
|| distance_index.into_which_snarl(n4->id(), true) == std::make_tuple(2, false, false))); } + SECTION("distanceless index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 0); + } } From 59f3f246b5ffc7374973f38e6d69241a1a831f65 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 17 Sep 2024 11:22:01 -0400 Subject: [PATCH 10/54] fix typo in paths test --- test/t/11_vg_paths.t | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 36729a7a0ba..e21a8fe2c09 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -90,10 +90,10 @@ diff original.fa norm_x4.fa is $? 0 "path normalizer doesnt alter path sequences" # note: x3 is x4 in reverse, so we key on that -grep x3 norm_x2.gfa | awk '{print $3}' > x4.path -grep x3 norm_x2.gfa | awk '{print $3}' >> x4.path -grep x3 norm_x2.gfa | awk '{print $3}'> x4.norm.path -grep x5 norm_x2.gfa | awk '{print $3}' >> x4.norm.path +grep x3 norm_x4.gfa | awk '{print $3}' > x4.path +grep x3 norm_x4.gfa | awk '{print $3}' >> x4.path +grep x3 norm_x4.gfa | awk '{print $3}'> x4.norm.path +grep x5 norm_x4.gfa | awk '{print $3}' >> x4.norm.path diff x4.path x4.norm.path is $? 
0 "path normalizere correctly snapped all equivalent paths to x4" From c78dce76ed83832f2e6d37e7b98d65992163e612 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 17 Sep 2024 11:22:23 -0400 Subject: [PATCH 11/54] set snarl distance to 0 in path normalize --- src/traversal_clusters.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 7e361c84f6a..d96a82f3b98 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -384,8 +384,7 @@ void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const u SnarlDistanceIndex distance_index; { IntegratedSnarlFinder snarl_finder(*graph); - // todo: why can't I pass in 0 below -- I don't want any dinstances! - fill_in_distance_index(&distance_index, graph, &snarl_finder, 1); + fill_in_distance_index(&distance_index, graph, &snarl_finder, 0); } // only consider embedded paths that span snarl From e49db317e080763e29bdd651166293d71ca59ee8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 19 Sep 2024 10:42:38 -0400 Subject: [PATCH 12/54] Remove no-longer-needed kff CMake patching The upstream problem we're working around was fixed in https://github.com/Kmer-File-Format/kff-cpp-api/commit/2a5ede6a11bde054f5304e95b69252e9cdfb7e0b, and manually editing the CMakeLists.txt upsets git when we go to change submodule versions. --- Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index c0becd6407a..67c1e556f0e 100644 --- a/Makefile +++ b/Makefile @@ -613,12 +613,11 @@ endif $(INC_DIR)/kff_io.hpp: $(LIB_DIR)/libkff.a -# We need to drop the hardcoderd CMAKE_CXX_FLAGS. See $(LIB_DIR)/libkff.a: $(KFF_DIR)/kff_io.cpp $(KFF_DIR)/kff_io.hpp.in ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cd $(KFF_DIR) && sed -i.bak '/set(CMAKE_CXX_FLAGS/d' CMakeLists.txt && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. 
&& AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +. ./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cd $(KFF_DIR) && sed -i.bak '/set(CMAKE_CXX_FLAGS/d' CMakeLists.txt && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +. ./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/BooPHF.h: $(BBHASH_DIR)/BooPHF.h From c4e40f5f8dcf5f5b22ecfb24fb62b8a4be5a4170 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 23 Sep 2024 14:35:34 -0400 Subject: [PATCH 13/54] Add more codenames --- src/version.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/version.cpp b/src/version.cpp index 984627fb6d8..a488bd89447 100644 --- a/src/version.cpp +++ b/src/version.cpp @@ -103,7 +103,17 @@ const unordered_map Version::codenames = { {"v1.57.0", "Franchini"}, {"v1.58.0", "Cartari"}, {"v1.59.0", "Casatico"}, - {"v1.69.0", "Bologna"} // Reserved + {"v1.60.0", "Annicco"}, + {"v1.61.0", "Plodio"}, + {"v1.62.0", "Ranzano"}, + {"v1.63.0", "Boccaleone"}, + {"v1.64.0", "Vibbiana"}, + {"v1.65.0", "Carfon"}, + {"v1.66.0", "Navetta"}, + {"v1.67.0", "Vetria"}, + {"v1.68.0", "Rimbocchi"}, + {"v1.69.0", "Bologna"}, + {"v1.70.0", "Zebedassi"} // Add more codenames here }; From 17fd1b39d818f0b9ba4c9d88576e8b702d21b50e Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 26 Sep 2024 11:29:04 -0400 Subject: [PATCH 14/54] Add error 
checking around writing kmer files This might help with problems like https://github.com/vgteam/vg/issues/4404#issuecomment-2373287057 by letting us tell whether the kmer files were actually accepted by the filesystem or not. --- src/kmer.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/kmer.cpp b/src/kmer.cpp index 52d63713096..188e3f01ca6 100644 --- a/src/kmer.cpp +++ b/src/kmer.cpp @@ -348,7 +348,15 @@ string write_gcsa_kmers_to_tmpfile(const HandleGraph& graph, int kmer_size, size temp_file::remove(tmpfile); throw ex; } + if (!out) { + std::cerr << "error[write_gcsa_kmers_to_tmpfile]: I/O error while writing kmers to " << tmpfile << std::endl; + exit(1); + } out.close(); + if (!out) { + std::cerr << "error[write_gcsa_kmers_to_tmpfile]: I/O error while closing kmer file " << tmpfile << std::endl; + exit(1); + } return tmpfile; } From 015e7ee460e69cb8459edda55484ba05a3e639df Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 3 Oct 2024 11:33:49 -0400 Subject: [PATCH 15/54] Pass $DEBIAN_FRONTEND through make get-deps This lets you stop `make get-deps` from demanding your timezone. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 67c1e556f0e..e94642c0ca3 100644 --- a/Makefile +++ b/Makefile @@ -521,7 +521,7 @@ static-docker: static scripts/* # We have system-level deps to install # We want the One True Place for them to be in the Dockerfile. 
get-deps: - sudo apt-get install -qq -y --no-upgrade $(shell cat Dockerfile | sed -n '/^###DEPS_BEGIN###/,$${p;/^###DEPS_END###/q}' | grep -v '^ *#' | grep -v "^RUN" | tr '\n' ' ' | tr -d '\\') + sudo DEBIAN_FRONTEND=$(DEBIAN_FRONTEND) apt-get install -qq -y --no-upgrade $(shell cat Dockerfile | sed -n '/^###DEPS_BEGIN###/,$${p;/^###DEPS_END###/q}' | grep -v '^ *#' | grep -v "^RUN" | tr '\n' ' ' | tr -d '\\') # And we have submodule deps to build deps: $(DEPS) From 638ec669a64814d682de51d3a917c0b72af3a0df Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 10 Oct 2024 16:28:29 -0400 Subject: [PATCH 16/54] fix validate bug on protobuf graphs --- src/subcommand/validate_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/validate_main.cpp b/src/subcommand/validate_main.cpp index 9fe64ca05c9..531f5a287ae 100644 --- a/src/subcommand/validate_main.cpp +++ b/src/subcommand/validate_main.cpp @@ -125,7 +125,7 @@ int main_validate(int argc, char** argv) { if (!gam_only) { VG* vg_graph = dynamic_cast(graph.get()); if (vg_graph != nullptr) { - if (!vg_graph->is_valid(true, true, check_orphans, true)) { + if (!vg_graph->is_valid(true, true, true, check_orphans)) { valid_graph = false; } } From 51e512ded330c6887555830713043ab20ad930fe Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 21 Oct 2024 13:49:34 -0400 Subject: [PATCH 17/54] revert from distance index to snarl manager in path normalizer --- src/subcommand/paths_main.cpp | 2 +- src/traversal_clusters.cpp | 80 +++++++++++++++++++++++------------ src/traversal_clusters.hpp | 5 ++- 3 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index fb668d7edd4..9a4e868a06a 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -610,7 +610,7 @@ int main_paths(int argc, char** argv) { for_each_selected_path([&](const path_handle_t& path_handle) { selected_paths.insert(path_handle); }); - 
merge_equivalent_traversals_in_graph(mutable_graph, selected_paths); + merge_equivalent_traversals_in_graph(mutable_graph, selected_paths, true); } if (!to_destroy.empty()) { mutable_graph->destroy_paths(to_destroy); diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index d96a82f3b98..130b65f69e1 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -2,6 +2,7 @@ #include "traversal_finder.hpp" #include "integrated_snarl_finder.hpp" #include "snarl_distance_index.hpp" +#include "snarls.hpp" //#define debug @@ -379,37 +380,64 @@ static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, } } -void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set& selected_paths) { - // compute the snarls - SnarlDistanceIndex distance_index; - { - IntegratedSnarlFinder snarl_finder(*graph); - fill_in_distance_index(&distance_index, graph, &snarl_finder, 0); - } +void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set& selected_paths, + bool use_snarl_manager) { // only consider embedded paths that span snarl PathTraversalFinder path_trav_finder(*graph); - // do every snarl top-down. this is because it's possible (tho probably rare) for a child snarl to - // be redundant after normalizing its parent. don't think the opposite (normalizing child) - // causes redundant parent.. todo: can we guarantee?! 
- net_handle_t root = distance_index.get_root(); - deque queue = {root}; - - while (!queue.empty()) { - net_handle_t net_handle = queue.front(); - queue.pop_front(); - if (distance_index.is_snarl(net_handle)) { - net_handle_t start_bound = distance_index.get_bound(net_handle, false, true); - net_handle_t end_bound = distance_index.get_bound(net_handle, true, false); - handle_t start_handle = distance_index.get_handle(start_bound, graph); - handle_t end_handle = distance_index.get_handle(end_bound, graph); + if (use_snarl_manager) { + // compute the snarls using the old snarl manager + IntegratedSnarlFinder finder(*graph); + SnarlManager snarl_manager(std::move(finder.find_snarls_parallel())); + + deque queue; + snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) { + queue.push_back(snarl); + }); + + while (!queue.empty()) { + const Snarl* snarl = queue.front(); + queue.pop_front(); + handle_t start_handle = graph->get_handle(snarl->start().node_id(), snarl->start().backward()); + handle_t end_handle = graph->get_handle(snarl->end().node_id(), snarl->end().backward()); merge_equivalent_traversals_in_snarl(graph, selected_paths, path_trav_finder, start_handle, end_handle); - } - if (net_handle == root || distance_index.is_snarl(net_handle) || distance_index.is_chain(net_handle)) { - distance_index.for_each_child(net_handle, [&](net_handle_t child_handle) { - queue.push_back(child_handle); - }); + const vector& children = snarl_manager.children_of(snarl); + for (const Snarl* child : children) { + queue.push_back(child); + } + } + } else { + // compute the snarls using the distance index + // this is what we want to do going forward since it uses the new api, no protobuf etc, + // but unfortunately it seems way slower on some graphs, hence + SnarlDistanceIndex distance_index; + { + IntegratedSnarlFinder snarl_finder(*graph); + fill_in_distance_index(&distance_index, graph, &snarl_finder, 0); + } + + // do every snarl top-down. 
this is because it's possible (tho probably rare) for a child snarl to + // be redundant after normalizing its parent. don't think the opposite (normalizing child) + // causes redundant parent.. todo: can we guarantee?! + net_handle_t root = distance_index.get_root(); + deque queue = {root}; + + while (!queue.empty()) { + net_handle_t net_handle = queue.front(); + queue.pop_front(); + if (distance_index.is_snarl(net_handle)) { + net_handle_t start_bound = distance_index.get_bound(net_handle, false, true); + net_handle_t end_bound = distance_index.get_bound(net_handle, true, false); + handle_t start_handle = distance_index.get_handle(start_bound, graph); + handle_t end_handle = distance_index.get_handle(end_bound, graph); + merge_equivalent_traversals_in_snarl(graph, selected_paths, path_trav_finder, start_handle, end_handle); + } + if (net_handle == root || distance_index.is_snarl(net_handle) || distance_index.is_chain(net_handle)) { + distance_index.for_each_child(net_handle, [&](net_handle_t child_handle) { + queue.push_back(child_handle); + }); + } } } } diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index 646cbf46409..b35bbb3d730 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -99,6 +99,9 @@ vector> assign_child_snarls_to_traversals(const PathHandleGraph* gra /// Note: this doesn't modify the graph toplogy, so uncovered nodes and edges as a result of path editing /// would usually need removale with vg clip afterwards /// -void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set& selected_paths); +/// the use_snarl_manager toggles between distnace index and snarl manager for computing snarls +/// (adding this option to (hopefully) temporarily revert to the snarl manager for performance reasons) +void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set& selected_paths, + bool use_snarl_manager=false); } From 
a335429503055975e1be8d38b416797fe7649c2e Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 16:49:30 -0400 Subject: [PATCH 18/54] Make padded window size parity match anchor parity for orientation independence --- src/surjector.cpp | 48 ++++++++++++++++++++++++++++++++++++++++------- src/surjector.hpp | 4 ++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index be0a2e067d5..9c510bde9aa 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -4038,7 +4038,7 @@ using namespace std; for (int i = 0; i < path_chunks.size(); ++i) { auto& chunk = path_chunks[i]; // Mark anchors that are themselves suspicious as not to be kept. - if (chunk.first.first == path_chunks.front().first.first && chunk.first.second == path_chunks.back().first.second + if (chunk.first.first == path_chunks.front().first.first && chunk.first.second == path_chunks.back().first.second // TODO: shouldn't this be || if we want to match *either* tail? && (anchor_lengths[i] <= max_tail_anchor_prune || chunk.first.second - chunk.first.first <= max_tail_anchor_prune)) { #ifdef debug_anchored_surject cerr << "anchor " << i << " (read interval " << (chunk.first.first - sequence.begin()) << " : " << (chunk.first.second - sequence.begin()) << ") pruned for being a short tail" << endl; @@ -4050,19 +4050,53 @@ using namespace std; if ((anchor_lengths[i] <= max_low_complexity_anchor_prune || chunk.first.second - chunk.first.first <= max_low_complexity_anchor_prune)) { SeqComplexity<6> chunk_complexity(chunk.first.first, chunk.first.second); if (chunk.first.second - chunk.first.first < pad_suspicious_anchors_to_length) { - auto read_context_begin = max(sequence.begin(), chunk.first.first - (pad_suspicious_anchors_to_length - (chunk.first.second - chunk.first.first)) / 2); - auto read_context_end = min(sequence.end(), read_context_begin + pad_suspicious_anchors_to_length); - if (read_context_end == sequence.end()) { - // try to ensure enough bases 
if we're near the end of the read - read_context_begin = max(sequence.begin(), read_context_end - pad_suspicious_anchors_to_length); + // We need to fetch out a sequence at least pad_suspicious_anchors_to_length bp long (unless the whole sequence is shorter) and analyze that. + + // There's no way to symmetrically (and thus + // independently of orientation) get an even-length + // window around an odd-length window, or visa versa. + // So express everything as per-side padding, and round it up. + size_t chunk_length = chunk.first.second - chunk.first.first; + size_t padding_per_side = (pad_suspicious_anchors_to_length - chunk_length + 1) / 2; + + // Pad separately on each side to avoid read bounds + size_t left_padding = padding_per_side; + size_t right_padding = padding_per_side; + + // Shift the padded window right if we hit the start + size_t start_offset = chunk.first.first - sequence.begin(); + if (start_offset < left_padding) { + right_padding += (left_padding - start_offset); + left_padding = start_offset; } + + // Shift the padded window left if we hit the end + size_t remaining_until_end = sequence.end() - chunk.first.second; + if (remaining_until_end < right_padding) { + left_padding += (right_padding - remaining_until_end); + right_padding = remaining_until_end; + } + + // If we hit the start again, clip since the whole window doesn't fit. + left_padding = min(left_padding, start_offset); + + // TODO: Is there a more closed-form way to budge the padding? 
+ + // Now expand the iterator range without ever constructing pre-begin iterators + auto read_context_begin = chunk.first.first - left_padding; + auto read_context_end = chunk.first.second + right_padding; + +#ifdef debug_anchored_surject + std::cerr << "For read interval " << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << " length " << (chunk.first.second - chunk.first.first) << " use context " << (read_context_begin - sequence.begin()) << ":" << (read_context_end - sequence.begin()) << " length " << (read_context_end - read_context_begin) << std::endl; +#endif + SeqComplexity<6> context_complexity(read_context_begin, read_context_end); // TODO: repetitive for (int order = 1, max_order = 6; order <= max_order; ++order) { //cerr << "padded anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]), seq " << string(read_context_begin, read_context_end) << ", order " << order << " with p-value " << context_complexity.p_value(order) << ", repetitive fraction " << chunk_complexity.repetitiveness(order) << endl; if (context_complexity.p_value(order) < low_complexity_p_value) { #ifdef debug_anchored_surject - cerr << "anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]) pruned being for having context with low complexity at order " << order << ", p-value " << context_complexity.p_value(order) << " and anchor repetitive fraction " << chunk_complexity.repetitiveness(order) << endl; + cerr << "anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]) pruned for having context with low complexity at order " << order << ", p-value " << context_complexity.p_value(order) << " and anchor repetitive fraction " << chunk_complexity.repetitiveness(order) << endl; #endif // the sequences is repetitive at this order keep[i] = false; 
diff --git a/src/surjector.hpp b/src/surjector.hpp index a582df64128..84bacead647 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -132,6 +132,10 @@ using namespace std; double low_complexity_p_value = .0075; int64_t max_low_complexity_anchor_prune = 40; int64_t max_low_complexity_anchor_trim = 65; + /// When examining anchors for suspiciousness, try and make them at + /// least this long. To ensure orientation symmetry, we will make + /// anchors with the oppsite parity (even if this is odd, or odd if + /// this is even) 1bp longer. int64_t pad_suspicious_anchors_to_length = 12; // A function for computing band padding From cabfe9fc66f31fed06eae727b04c3cb4ea16b61b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 16:50:01 -0400 Subject: [PATCH 19/54] Remove extra debugging --- src/surjector.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index 9c510bde9aa..0c2ff88df30 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -4086,10 +4086,6 @@ using namespace std; auto read_context_begin = chunk.first.first - left_padding; auto read_context_end = chunk.first.second + right_padding; -#ifdef debug_anchored_surject - std::cerr << "For read interval " << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << " length " << (chunk.first.second - chunk.first.first) << " use context " << (read_context_begin - sequence.begin()) << ":" << (read_context_end - sequence.begin()) << " length " << (read_context_end - read_context_begin) << std::endl; -#endif - SeqComplexity<6> context_complexity(read_context_begin, read_context_end); // TODO: repetitive for (int order = 1, max_order = 6; order <= max_order; ++order) { From 68fc4284eaf31de6fcbb62f3aacfa26354e79112 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 17:05:54 -0400 Subject: [PATCH 20/54] Change short tail pruning to only require being one tail and not both --- src/surjector.cpp | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index 0c2ff88df30..954adb0b279 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -4038,8 +4038,8 @@ using namespace std; for (int i = 0; i < path_chunks.size(); ++i) { auto& chunk = path_chunks[i]; // Mark anchors that are themselves suspicious as not to be kept. - if (chunk.first.first == path_chunks.front().first.first && chunk.first.second == path_chunks.back().first.second // TODO: shouldn't this be || if we want to match *either* tail? - && (anchor_lengths[i] <= max_tail_anchor_prune || chunk.first.second - chunk.first.first <= max_tail_anchor_prune)) { + if ((chunk.first.first == path_chunks.front().first.first || chunk.first.second == path_chunks.back().first.second) // Is at either tail + && (anchor_lengths[i] <= max_tail_anchor_prune || chunk.first.second - chunk.first.first <= max_tail_anchor_prune)) { // And is too short #ifdef debug_anchored_surject cerr << "anchor " << i << " (read interval " << (chunk.first.first - sequence.begin()) << " : " << (chunk.first.second - sequence.begin()) << ") pruned for being a short tail" << endl; #endif From 0c826dc84bed3697294a5fda31102b9cd9520a5f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 17:15:54 -0400 Subject: [PATCH 21/54] Add a test for surjection orientation independence --- test/surject/opposite_strands.gaf | 2 ++ test/surject/opposite_strands.gfa | 43 +++++++++++++++++++++++++++++++ test/t/15_vg_surject.t | 4 ++- 3 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 test/surject/opposite_strands.gaf create mode 100644 test/surject/opposite_strands.gfa diff --git a/test/surject/opposite_strands.gaf b/test/surject/opposite_strands.gaf new file mode 100644 index 00000000000..73e07638fe5 --- /dev/null +++ b/test/surject/opposite_strands.gaf @@ -0,0 +1,2 @@ +forward 35 0 35 + >56629809>56629810>56629811>56629814>56629815>56629819>56629820>56629822 132 97 132 35 35 0 cs:Z::35 
+reverse 35 0 35 + <56629822<56629820<56629819<56629815<56629814<56629811<56629810<56629809 132 0 35 35 35 0 cs:Z::35 diff --git a/test/surject/opposite_strands.gfa b/test/surject/opposite_strands.gfa new file mode 100644 index 00000000000..8177180d24c --- /dev/null +++ b/test/surject/opposite_strands.gfa @@ -0,0 +1,43 @@ +H VN:Z:1.1 RS:Z:GRCh38 CHM13 +S 56629807 G +S 56629808 T +S 56629809 TTTCTGATTATAAATATTGCACATGTATTGATTACATAAATCCATATACTATAAAACTGATATTTAAGAGAATAAAAGTCCCAACCTCAGAATTAACTACTG +S 56629810 C +S 56629811 A +S 56629812 A +S 56629813 C +S 56629814 CCCCC +S 56629815 C +S 56629816 G +S 56629817 T +S 56629818 T +S 56629819 T +S 56629820 TTTTTTTTTTT +S 56629821 T +S 56629822 GATGGAGTCT +S 56629823 C +S 56629824 G +W CHM13 0 chr8 81560724 81560860 >56629807>56629809>56629810>56629813>56629814>56629817>56629818>56629819>56629820>56629821>56629822>56629824 +W GRCh38 0 chr8 35183939 35184073 >56629807>56629809>56629810>56629811>56629814>56629815>56629819>56629820>56629822>56629823 +L 56629807 + 56629809 + 0M +L 56629808 + 56629809 + 0M +L 56629809 + 56629812 - 0M +L 56629809 + 56629810 + 0M +L 56629810 + 56629813 + 0M +L 56629810 + 56629811 + 0M +L 56629811 + 56629814 + 0M +L 56629812 - 56629813 + 0M +L 56629813 + 56629814 + 0M +L 56629814 + 56629817 + 0M +L 56629814 + 56629815 + 0M +L 56629815 + 56629819 + 0M +L 56629815 + 56629816 - 0M +L 56629816 - 56629819 + 0M +L 56629817 + 56629818 + 0M +L 56629818 + 56629819 + 0M +L 56629819 + 56629820 + 0M +L 56629820 + 56629822 + 0M +L 56629820 + 56629821 + 0M +L 56629821 + 56629822 + 0M +L 56629822 + 56629824 + 0M +L 56629822 + 56629823 + 0M diff --git a/test/t/15_vg_surject.t b/test/t/15_vg_surject.t index 6e0ddff37e2..3704ec823c2 100644 --- a/test/t/15_vg_surject.t +++ b/test/t/15_vg_surject.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 47 +plan tests 48 vg construct -r small/x.fa >j.vg vg index -x j.xg j.vg @@ -198,3 +198,5 @@ is "$(vg surject -x x.xg -M -m -s -i -t 1 
mapped.gamp | grep -v '@' | wc -l)" 80 rm x.vg x.pathdup.vg x.xg x.gcsa x.gcsa.lcp x.gam mapped.gam mapped.gamp +is "$(vg surject -p CHM13#0#chr8 -x surject/opposite_strands.gfa --prune-low-cplx --sam-output --gaf-input surject/opposite_strands.gaf | grep -v "^@" | cut -f3-12 | sort | uniq | wc -l)" "1" "vg surject low compelxity pruning gets the same alignment regardless of read orientation" + From b0304a29c177d6dae88ac7c92e4198e1111f8b85 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 25 Oct 2024 16:34:46 -0400 Subject: [PATCH 22/54] Add tests for chunk on haplotype paths --- test/t/30_vg_chunk.t | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t index 20c8466ea88..3eccc903a2e 100644 --- a/test/t/30_vg_chunk.t +++ b/test/t/30_vg_chunk.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 32 +plan tests 33 # Construct a graph with alt paths so we can make a GBWT and a GBZ vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz -a >x.vg @@ -126,5 +126,14 @@ is "$?" 0 "components finds subgraphs" rm -f xy.vg x.vg y.vg x_nodes.txt y_nodes.txt convert path_chunk_x.vg convert path_chunk_y.vg pc_x_nodes.txt pc_y_nodes.txt x_paths.txt pc_x_paths.txt components_chunk_0.vg components_chunk_1.vg comp_0_nodes.txt comp_1_nodes.txt comp_nodes.txt nodes.txt x.gam y.gam xy.gam path_chunk_x.gam path_chunk_y.gam +vg gbwt --gbz-format --graph-name graph.gbz --gfa-input graphs/gfa_with_reference.gfa +vg chunk -x graph.gbz -p sample1#1#chr1#0:1-2 -c 1 >/dev/null 2>log.txt +grep "out_of_range" log.txt +is "$?" "1" "chunking on a haplotype path does not produce an out of range error" +grep "not found in" log.txt +is "$?" 
"1" "chunking on a haplotype path does not produce a path not found error" + +rm -f graph.gbz log.txt + From a04b6bcace475e23aefaf6a272643219c843f29c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 25 Oct 2024 17:59:19 -0400 Subject: [PATCH 23/54] Allow vg chunk to reference-index particular hapolotype paths requested --- deps/libbdsg | 2 +- src/subcommand/chunk_main.cpp | 78 ++++++++++++++++++++--------------- test/t/30_vg_chunk.t | 7 ++-- 3 files changed, 49 insertions(+), 38 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index 33e2fc213eb..fd5c186d04c 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 33e2fc213eb492a0db38a30ecc1b3b4e3146e769 +Subproject commit fd5c186d04c9e00c088b4b3c98f687b82e3e6e15 diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index 22f6e02651e..57397ed6713 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -349,6 +349,36 @@ int main_chunk(int argc, char** argv) { bool chunk_gam = !gam_files.empty() && gam_split_size == 0; bool chunk_graph = gam_and_graph || (!chunk_gam && gam_split_size == 0); + // parse the regions into a list before loading the graph, if we're + // specifying regions by path name. 
+ vector regions; + if (!region_strings.empty()) { + for (auto& region_string : region_strings) { + Region region; + parse_region(region_string, region); + regions.push_back(region); + } + } + if (!path_list_file.empty()) { + ifstream pr_stream(path_list_file.c_str()); + if (!pr_stream) { + cerr << "error:[vg chunk] unable to open path regions: " << path_list_file << endl; + return 1; + } + while (pr_stream) { + string buf; + std::getline(pr_stream, buf); + if (!buf.empty()) { + Region region; + parse_region(buf, region); + regions.push_back(region); + } + } + } + if (!in_bed_file.empty()) { + parse_bed_regions(in_bed_file, regions); + } + // Load the snarls unique_ptr snarl_manager; if (!snarl_filename.empty()) { @@ -377,9 +407,17 @@ int main_chunk(int argc, char** argv) { return 1; } in.close(); + + // To support the regions we were asked for, we might need to ensure + // the paths they are on are actually indexed for reference style + // offset lookups. + std::unordered_set ensure_indexed; + for (auto& region : regions) { + ensure_indexed.insert(region.seq); + } path_handle_graph = vg::io::VPKG::load_one(xg_file); - graph = overlay_helper.apply(path_handle_graph.get()); + graph = overlay_helper.apply(path_handle_graph.get(), ensure_indexed); in.close(); } @@ -463,35 +501,7 @@ int main_chunk(int argc, char** argv) { // (instead of an index) unordered_map node_to_component; - // parse the regions into a list - vector regions; - if (!region_strings.empty()) { - for (auto& region_string : region_strings) { - Region region; - parse_region(region_string, region); - regions.push_back(region); - } - } - else if (!path_list_file.empty()) { - ifstream pr_stream(path_list_file.c_str()); - if (!pr_stream) { - cerr << "error:[vg chunk] unable to open path regions: " << path_list_file << endl; - return 1; - } - while (pr_stream) { - string buf; - std::getline(pr_stream, buf); - if (!buf.empty()) { - Region region; - parse_region(buf, region); - regions.push_back(region); - } 
- } - } - else if (!in_bed_file.empty()) { - parse_bed_regions(in_bed_file, regions); - } - else if (id_range) { + if (id_range) { if (n_chunks) { // Determine the ranges from the source graph itself. // how many nodes per range? @@ -556,9 +566,9 @@ int main_chunk(int argc, char** argv) { delete range_stream; } } - else if (graph != nullptr && (!components || path_components)) { - // every path - graph->for_each_path_handle([&](path_handle_t path_handle) { + if (graph != nullptr && path_components) { + // every reference or generic path (guaranteed to be reference indexed) + graph->for_each_path_matching({PathSense::REFERENCE, PathSense::GENERIC}, {}, {}, [&](path_handle_t path_handle) { Region region; region.seq = graph->get_path_name(path_handle); if (!Paths::is_alt(region.seq)) { @@ -596,7 +606,7 @@ int main_chunk(int argc, char** argv) { if (!id_range) { for (auto& region : regions) { if (!graph->has_path(region.seq)) { - cerr << "error[vg chunk]: input path " << region.seq << " not found in xg index" << endl; + cerr << "error[vg chunk]: input path " << region.seq << " not found in graph" << endl; return 1; } region.start = max((int64_t)0, region.start); diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t index 3eccc903a2e..142de98066a 100644 --- a/test/t/30_vg_chunk.t +++ b/test/t/30_vg_chunk.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 33 +plan tests 35 # Construct a graph with alt paths so we can make a GBWT and a GBZ vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz -a >x.vg @@ -127,13 +127,14 @@ is "$?" 
0 "components finds subgraphs" rm -f xy.vg x.vg y.vg x_nodes.txt y_nodes.txt convert path_chunk_x.vg convert path_chunk_y.vg pc_x_nodes.txt pc_y_nodes.txt x_paths.txt pc_x_paths.txt components_chunk_0.vg components_chunk_1.vg comp_0_nodes.txt comp_1_nodes.txt comp_nodes.txt nodes.txt x.gam y.gam xy.gam path_chunk_x.gam path_chunk_y.gam vg gbwt --gbz-format --graph-name graph.gbz --gfa-input graphs/gfa_with_reference.gfa -vg chunk -x graph.gbz -p sample1#1#chr1#0:1-2 -c 1 >/dev/null 2>log.txt +vg chunk -x graph.gbz -p sample1#1#chr1#0:1-2 -c 1 >part.vg 2>log.txt grep "out_of_range" log.txt is "$?" "1" "chunking on a haplotype path does not produce an out of range error" grep "not found in" log.txt is "$?" "1" "chunking on a haplotype path does not produce a path not found error" +is "$(vg stats -z part.vg | grep nodes | cut -f2)" "4" "chunking on a haplotype produces the correct size graph" -rm -f graph.gbz log.txt +rm -f graph.gbz part.vg log.txt From 9f2cd0bcaf3c8278d11670cdb3e26855e17ae510 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 10:11:21 -0400 Subject: [PATCH 24/54] Add -o to vg paths to look at paths through a reference overlay --- src/subcommand/paths_main.cpp | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index fb668d7edd4..cda4be4256b 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -18,6 +18,7 @@ #include "../gbwt_helper.hpp" #include "../traversal_clusters.hpp" #include "../io/save_handle_graph.hpp" +#include #include #include #include @@ -57,6 +58,8 @@ void help_paths(char** argv) { << " -G, --generic-paths select the generic, non-reference, non-haplotype paths" << endl << " -R, --reference-paths select the reference paths" << endl << " -H, --haplotype-paths select the haplotype paths paths" << endl + << " configuration:" << endl + << " -o, --overlay apply a ReferencePathOverlayHelper to 
the graph" << endl << " -t, --threads N number of threads to use [all available]. applies only to snarl finding within -n" << endl; } @@ -122,6 +125,7 @@ int main_paths(int argc, char** argv) { bool coverage = false; const size_t coverage_bins = 10; bool normalize_paths = false; + bool overlay = false; int c; optind = 2; // force optind past command positional argument @@ -151,6 +155,7 @@ int main_paths(int argc, char** argv) { {"reference-paths", no_argument, 0, 'R'}, {"haplotype-paths", no_argument, 0, 'H'}, {"coverage", no_argument, 0, 'c'}, + {"overlay", no_argument, 0, 'o'}, // Hidden options for backward compatibility. {"threads", no_argument, 0, 'T'}, @@ -160,7 +165,7 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drnaGRHp:ct:", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:drnaGRHp:coTq:t:", long_options, &option_index); // Detect the end of the options. @@ -276,6 +281,10 @@ int main_paths(int argc, char** argv) { output_formats++; break; + case 'o': + overlay = true; + break; + case 'T': std::cerr << "warning: [vg paths] option --threads is obsolete and unnecessary" << std::endl; break; @@ -375,11 +384,23 @@ int main_paths(int argc, char** argv) { // Load whatever indexes we were given // Note: during handlifiction, distinction between -v and -x options disappeared. - unique_ptr graph; + unique_ptr path_handle_graph; if (!graph_file.empty()) { // Load the graph - graph = vg::io::VPKG::load_one(graph_file); + path_handle_graph = vg::io::VPKG::load_one(graph_file); + } + bdsg::ReferencePathOverlayHelper overlay_helper; + PathHandleGraph* graph = nullptr; + if (path_handle_graph) { + if (overlay) { + // Try to apply the overlay if the user wanted it and this isn't + // already a graph implementing position lookups. 
+ graph = overlay_helper.apply(path_handle_graph.get()); + } else { + graph = path_handle_graph.get(); + } } + unique_ptr gbwt_index; if (!gbwt_file.empty()) { // We want a gbwt @@ -418,7 +439,7 @@ int main_paths(int argc, char** argv) { if (extract_as_gam || extract_as_gaf) { // Open up a GAM/GAF output stream aln_emitter = vg::io::get_non_hts_alignment_emitter("-", extract_as_gaf ? "GAF" : "GAM", {}, get_thread_count(), - graph.get()); + graph); } else if (extract_as_vg) { // Open up a VG Graph chunk output stream graph_emitter = unique_ptr>(new vg::io::ProtobufEmitter(cout)); @@ -589,7 +610,7 @@ int main_paths(int argc, char** argv) { }; if (drop_paths || retain_paths || normalize_paths) { - MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph.get()); + MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph); if (!mutable_graph) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; exit(1); From b4f5c3ee9009152941a1dccc3ba45a60cae122a8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 11:00:51 -0400 Subject: [PATCH 25/54] Update error and bump libbdsg Update vg deconstruct error message to not be readable as about missing _alt_ paths, and use a libbdsg where the reference path overlay indexes all non-hidden paths (including e.g. haplotype paths from GFA). 
--- deps/libbdsg | 2 +- src/deconstructor.cpp | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index fd5c186d04c..88181dbc808 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit fd5c186d04c9e00c088b4b3c98f687b82e3e6e15 +Subproject commit 88181dbc80835e93de681e84e245c891e250f77c diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index aa052db955a..0f15b8139d7 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1168,9 +1168,10 @@ string Deconstructor::get_vcf_header() { } if (sample_to_haps.empty()) { - cerr << "Error [vg deconstruct]: No paths found for alt alleles in the graph. Note that " - << "exhaustive path-free traversal finding is no longer supported, and vg deconstruct " - << "now only works on embedded paths and GBWT threads." << endl; + cerr << "Error [vg deconstruct]: No paths other than selected reference(s) found in the graph, " + << "so no alt alleles can be generated. Note that exhaustive path-free traversal finding " + << "is no longer supported, and vg deconstruct now only works on embedded paths and GBWT " + << "threads." << endl; exit(1); } From 151ae72d23fe65f00c29868d2b9dac59c3835581 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 11:13:55 -0400 Subject: [PATCH 26/54] Bump macos version for tests --- .github/workflows/testmac.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/testmac.yml b/.github/workflows/testmac.yml index 11d656e7a05..802d656ccfb 100644 --- a/.github/workflows/testmac.yml +++ b/.github/workflows/testmac.yml @@ -15,7 +15,7 @@ on: jobs: testmac: name: Test on Mac - runs-on: macos-12 + runs-on: macos-15 steps: - name: Use cache @@ -26,14 +26,14 @@ jobs: lib include bin - key: ${{ runner.os }}-12-${{ github.ref }} + key: ${{ runner.os }}-15-${{ github.ref }} # Restore keys are a "list", but really only a multiline string is # accepted. Also we match by prefix. 
And the most recent cache is # used, not the most specific. # See: https://docs.github.com/en/actions/guides/caching-dependencies-to-speed-up-workflows#matching-a-cache-key restore-keys: | - ${{ runner.os }}-12-${{ github.base_ref }} - ${{ runner.os }}-12 + ${{ runner.os }}-15-${{ github.base_ref }} + ${{ runner.os }}-15 - name: Checkout code without submodules uses: actions/checkout@v2 From 23364decdc3832b70cceee02de5fb1869ebed346 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 11:51:51 -0400 Subject: [PATCH 27/54] Remove source_me.sh which makes system samtools use our dylibs --- Dockerfile | 7 +-- Makefile | 139 +++++++++++++++++++++---------------------- README.md | 4 +- configure.py | 44 -------------- deps/libbdsg | 2 +- scripts/setup-server | 2 +- source_me.sh | 23 ------- test/Makefile | 2 +- vgci/vgci.sh | 1 - 9 files changed, 77 insertions(+), 147 deletions(-) delete mode 100644 configure.py delete mode 100755 source_me.sh diff --git a/Dockerfile b/Dockerfile index ff1433eccfc..cf4279d6045 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,7 +48,6 @@ RUN apt-get -qq -y update && apt-get -qq -y upgrade && apt-get -qq -y install \ ###DEPS_END### # Prepare to build submodule dependencies -COPY source_me.sh /vg/source_me.sh COPY deps /vg/deps # To increase portability of the docker image, when building for amd64, set the # target CPU architecture to Nehalem (2008) rather than auto-detecting the @@ -59,17 +58,17 @@ RUN if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then sed -i s/m RUN find . -name CMakeCache.txt | xargs rm -f # Build the dependencies COPY Makefile /vg/Makefile -RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" CFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? 
THREADS : $(nproc))) deps +RUN CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" CFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) deps # Bring in the sources, which we need in order to build. COPY src /vg/src # Build all the object files for vg, but don't link. # Also pass the arch here -RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) objs +RUN CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) objs # Do the final build and link, knowing the version. Trim down the resulting binary but make sure to include enough debug info for profiling. -RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) static && strip -d bin/vg +RUN CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) static && strip -d bin/vg # Ship the scripts COPY scripts /vg/scripts diff --git a/Makefile b/Makefile index 67c1e556f0e..ff91be32fa5 100644 --- a/Makefile +++ b/Makefile @@ -494,12 +494,12 @@ $(LIB_DIR)/libvg.$(SHARED_SUFFIX): $(LIBVG_SHARED_DEPS) # Each test set can have its own binary, and not link everything static $(UNITTEST_EXE): $(UNITTEST_BIN_DIR)/%: $(UNITTEST_OBJ_DIR)/%.o $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) - . 
./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $@ $< $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $@ $< $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) # For a normal dynamic build we remove the static build marker $(BIN_DIR)/$(EXE): $(LIB_DIR)/libvg.a $(EXE_DEPS) -rm -f $(LIB_DIR)/vg_is_static - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) # We keep a file that we touch on the last static build. # If the vg linkables are newer than the last static build, we do a build $(LIB_DIR)/vg_is_static: $(OBJ_DIR)/main.o $(LIB_DIR)/libvg.a $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(DEPS) $(LINK_DEPS) @@ -527,10 +527,9 @@ get-deps: deps: $(DEPS) test: $(BIN_DIR)/$(EXE) $(LIB_DIR)/libvg.a test/build_graph $(BIN_DIR)/shuf $(BIN_DIR)/vcf2tsv $(FASTAHACK_DIR)/fastahack $(BIN_DIR)/rapper - . ./source_me.sh && cd test && prove -v t + cd test && prove -v t # Hide the compiler configuration from the doc tests, so that the ones that - # build code can't pick up libraries out of the bg build itself. Definitely - # don't source source_me.sh! 
+ # build code can't pick up libraries out of the vg build itself. # But still supply vg on the PATH. Hope it knows where its own libraries are. CFLAGS= CXXFLAGS= CPPFLAGS= LDFLAGS= INCLUDE_FLAGS= LIBRARY_PATH= LD_LIBRARY_PATH= DYLD_LIBRARY_PATH= DYLD_FALLBACK_LIBRARY_PATH= LD_INCLUDE_PATH= CC= CXX= CXX_STANDARD= PATH=$(CWD)/bin:$(PATH) doc/test-docs.sh @@ -557,67 +556,67 @@ else endif test/build_graph: test/build_graph.cpp $(LIB_DIR)/libvg.a $(SRC_DIR)/vg.hpp - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o test/build_graph test/build_graph.cpp $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o test/build_graph test/build_graph.cpp $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(FILTER) # TODO: The normal and debug jemalloc builds can't safely be run at the same time. $(LIB_DIR)/libjemalloc.a: $(JEMALLOC_DIR)/src/*.c - +. ./source_me.sh && rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ + +rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ $(LIB_DIR)/libjemalloc_debug.a: $(JEMALLOC_DIR)/src/*.c - +. 
./source_me.sh && rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --enable-debug --enable-fill --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/libjemalloc_debug.a && cp -r include/* $(CWD)/$(INC_DIR)/ + +rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --enable-debug --enable-fill --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/libjemalloc_debug.a && cp -r include/* $(CWD)/$(INC_DIR)/ # Use fake patterns to tell Make that this rule generates all these files when run once. # Here % should always match "lib" which is a common substring. # See https://stackoverflow.com/a/19822767 $(LIB_DIR)/%sdsl.a $(LIB_DIR)/%divsufsort.a $(LIB_DIR)/%divsufsort64.a : $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) else - +. ./source_me.sh && cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) endif $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.cpp $(SSW_DIR)/*.h - +. 
./source_me.sh && cd $(SSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(INC_DIR) + +cd $(SSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(INC_DIR) # We need to hide -Xpreprocessor -fopenmp from Snappy, at least on Mac, because # it will drop the -Xpreprocessor and keep the -fopenmp and upset Clang. $(LIB_DIR)/libsnappy.a: $(SNAPPY_DIR)/*.cc $(SNAPPY_DIR)/*.h - +. ./source_me.sh && cd $(SNAPPY_DIR) && ./autogen.sh && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" ./configure --prefix=$(CWD) $(FILTER) && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" $(MAKE) libsnappy.la $(FILTER) && cp .libs/libsnappy.a $(CWD)/lib/ && cp snappy-c.h snappy-sinksource.h snappy-stubs-public.h snappy.h $(CWD)/include/ + +cd $(SNAPPY_DIR) && ./autogen.sh && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" ./configure --prefix=$(CWD) $(FILTER) && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" $(MAKE) libsnappy.la $(FILTER) && cp .libs/libsnappy.a $(CWD)/lib/ && cp snappy-c.h snappy-sinksource.h snappy-stubs-public.h snappy.h $(CWD)/include/ $(INC_DIR)/gcsa/gcsa.h: $(LIB_DIR)/libgcsa2.a $(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/include/gcsa/*.h) ifeq ($(shell uname -s),Darwin) - +. 
./source_me.sh && cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) + +cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) + +cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/gbwt/dynamic_gbwt.h: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libgbwt.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GBWT_DIR)/src/*.cpp) $(wildcard $(GBWT_DIR)/include/gbwt/*.h) ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) else - +. 
./source_me.sh && cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/gbwtgraph/gbwtgraph.h: $(LIB_DIR)/libgbwtgraph.a $(LIB_DIR)/libgbwtgraph.a: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(LIB_DIR)/libhandlegraph.a $(wildcard $(GBWTGRAPH_DIR)/src/*.cpp) $(wildcard $(GBWTGRAPH_DIR)/include/gbwtgraph/*.h) ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/kff_io.hpp: $(LIB_DIR)/libkff.a $(LIB_DIR)/libkff.a: $(KFF_DIR)/kff_io.cpp $(KFF_DIR)/kff_io.hpp.in ifeq ($(shell uname -s),Darwin) - +. 
./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/BooPHF.h: $(BBHASH_DIR)/BooPHF.h @@ -627,17 +626,17 @@ $(INC_DIR)/progress_bar.hpp: $(PROGRESS_BAR_DIR)/progress_bar.hpp +cp $(PROGRESS_BAR_DIR)/progress_bar.hpp $(CWD)/$(INC_DIR) $(OBJ_DIR)/progress_bar.o: $(PROGRESS_BAR_DIR)/progress_bar.cpp $(PROGRESS_BAR_DIR)/*.hpp - +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(SHARED_OBJ_DIR)/progress_bar.o: $(PROGRESS_BAR_DIR)/progress_bar.cpp $(PROGRESS_BAR_DIR)/*.hpp - +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(INC_DIR)/Fasta.h: $(FASTAHACK_DIR)/Fasta.h - +. ./source_me.sh && cd $(FASTAHACK_DIR) && cp Fasta.h $(CWD)/$(INC_DIR) + +cd $(FASTAHACK_DIR) && cp Fasta.h $(CWD)/$(INC_DIR) $(OBJ_DIR)/Fasta.o: $(FASTAHACK_DIR)/Fasta.cpp $(INC_DIR)/Fasta.h - +. 
./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(FILTER) + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(FILTER) $(SHARED_OBJ_DIR)/Fasta.o: $(FASTAHACK_DIR)/Fasta.cpp $(INC_DIR)/Fasta.h - +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) # We have this target to clean up the old Protobuf we used to have. # We can remove it after we no longer care about building properly on a dirty @@ -663,10 +662,10 @@ $(LIB_DIR)/cleaned_old_elfutils: $(LIB_DIR)/libvgio.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/pkgconfig/htslib.pc $(LIB_DIR)/cleaned_old_protobuf_v003 $(LIBVGIO_DIR)/CMakeLists.txt $(LIBVGIO_DIR)/src/*.cpp $(LIBVGIO_DIR)/include/vg/io/*.hpp $(LIBVGIO_DIR)/deps/vg.proto +rm -f $(CWD)/$(INC_DIR)/vg.pb.h $(CWD)/$(INC_DIR)/vg/vg.pb.h +rm -Rf $(CWD)/$(INC_DIR)/vg/io/ - +. ./source_me.sh && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. 
$(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install + +export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. $(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install $(LIB_DIR)/libhandlegraph.a: $(LIBHANDLEGRAPH_DIR)/src/include/handlegraph/*.hpp $(LIBHANDLEGRAPH_DIR)/src/*.cpp - +. ./source_me.sh && cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. && $(MAKE) $(FILTER) && $(MAKE) install # On Linux, libdeflate builds a .so. @@ -682,7 +681,7 @@ ifeq ($(shell uname -s),Darwin) endif $(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDEFLATE_DIR)/lib/*/*.h $(LIBDEFLATE_DIR)/lib/*.c $(LIBDEFLATE_DIR)/lib/*/*.c - +. ./source_me.sh && cd $(LIBDEFLATE_DIR) && V=1 LDFLAGS="$(LDFLAGS) $(LD_RENAMEABLE_FLAGS)" $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR) + +cd $(LIBDEFLATE_DIR) && V=1 LDFLAGS="$(LDFLAGS) $(LD_RENAMEABLE_FLAGS)" $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR) # We build htslib after libdeflate so it can use libdeflate. 
# We need to do some wizardry to get it to pick up the right build and target system types on modern autotools. @@ -695,13 +694,13 @@ $(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDE # a system path, in case another htslib is installed on the system. Some HTSlib # headers look for the current HTSlib with <>. $(LIB_DIR)/libhts%a $(LIB_DIR)/pkgconfig/htslib%pc $(LIB_DIR)/libhts%$(SHARED_SUFFIX): $(LIB_DIR)/libdeflate.a $(LIB_DIR)/libdeflate.$(SHARED_SUFFIX) $(HTSLIB_DIR)/*.c $(HTSLIB_DIR)/*.h $(HTSLIB_DIR)/htslib/*.h $(HTSLIB_DIR)/cram/*.c $(HTSLIB_DIR)/cram/*.h - +. ./source_me.sh && cd $(HTSLIB_DIR) && rm -Rf $(CWD)/$(INC_DIR)/htslib $(CWD)/$(LIB_DIR)/libhts* && autoreconf -i && autoheader && autoconf || true - +. ./source_me.sh && cd $(HTSLIB_DIR) && (./configure -n 2>&1 || true) | grep "build system type" | rev | cut -f1 -d' ' | rev >systype.txt - +. ./source_me.sh && cd $(HTSLIB_DIR) && CFLAGS="-I$(CWD)/$(HTSLIB_DIR) -isystem $(CWD)/$(HTSLIB_DIR) -I$(CWD)/$(INC_DIR) $(CFLAGS)" LDFLAGS="$(LDFLAGS) -L$(CWD)/$(LIB_DIR) $(LD_UTIL_RPATH_FLAGS)" ./configure --with-libdeflate --disable-s3 --disable-gcs --disable-libcurl --disable-plugins --prefix=$(CWD) --host=$$(cat systype.txt) $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(HTSLIB_DIR) && rm -Rf $(CWD)/$(INC_DIR)/htslib $(CWD)/$(LIB_DIR)/libhts* && autoreconf -i && autoheader && autoconf || true + +cd $(HTSLIB_DIR) && (./configure -n 2>&1 || true) | grep "build system type" | rev | cut -f1 -d' ' | rev >systype.txt + +cd $(HTSLIB_DIR) && CFLAGS="-I$(CWD)/$(HTSLIB_DIR) -isystem $(CWD)/$(HTSLIB_DIR) -I$(CWD)/$(INC_DIR) $(CFLAGS)" LDFLAGS="$(LDFLAGS) -L$(CWD)/$(LIB_DIR) $(LD_UTIL_RPATH_FLAGS)" ./configure --with-libdeflate --disable-s3 --disable-gcs --disable-libcurl --disable-plugins --prefix=$(CWD) --host=$$(cat systype.txt) $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && $(MAKE) install # Build and install tabixpp for vcflib. 
$(LIB_DIR)/libtabixpp.a: $(LIB_DIR)/libhts.a $(TABIXPP_DIR)/*.cpp $(TABIXPP_DIR)/*.hpp - +. ./source_me.sh && cd $(TABIXPP_DIR) && rm -f tabix.o libtabixpp.a && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" INCLUDES="-I$(CWD)/$(INC_DIR)" HTS_HEADERS="" $(MAKE) tabix.o $(FILTER) && ar rcs libtabixpp.a tabix.o + +cd $(TABIXPP_DIR) && rm -f tabix.o libtabixpp.a && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" INCLUDES="-I$(CWD)/$(INC_DIR)" HTS_HEADERS="" $(MAKE) tabix.o $(FILTER) && ar rcs libtabixpp.a tabix.o +cp $(TABIXPP_DIR)/libtabixpp.a $(LIB_DIR) && cp $(TABIXPP_DIR)/tabix.hpp $(INC_DIR) +echo "Name: tabixpp" > $(LIB_DIR)/pkgconfig/tabixpp.pc +echo "Description: Self-packaged tabixpp" >> $(LIB_DIR)/pkgconfig/tabixpp.pc @@ -717,7 +716,7 @@ $(LIB_DIR)/libtabixpp.a: $(LIB_DIR)/libhts.a $(TABIXPP_DIR)/*.cpp $(TABIXPP_DIR) # We need to use /usr first for CMake search or Ubuntu 22.04 will decide pybind11 is installed in / when actually it is only fully installed in /usr. $(LIB_DIR)/libvcflib.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libtabixpp.a $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.hpp $(VCFLIB_DIR)/contrib/*/*.cpp $(VCFLIB_DIR)/contrib/*/*.h +rm -f $(VCFLIB_DIR)/contrib/WFA2-lib/VERSION - +. ./source_me.sh && cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build . + +cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. 
&& cmake --build . +cp $(VCFLIB_DIR)/contrib/filevercmp/*.h* $(INC_DIR) +cp $(VCFLIB_DIR)/contrib/fastahack/*.h* $(INC_DIR) +cp $(VCFLIB_DIR)/contrib/smithwaterman/*.h* $(INC_DIR) @@ -731,10 +730,10 @@ $(BIN_DIR)/vcf2tsv: $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.h $(LIB_DIR)/lib +cp $(VCFLIB_DIR)/build/vcf2tsv $(BIN_DIR) $(FASTAHACK_DIR)/fastahack: $(FASTAHACK_DIR)/*.c $(FASTAHACK_DIR)/*.h $(FASTAHACK_DIR)/*.cpp - +. ./source_me.sh && cd $(FASTAHACK_DIR) && $(MAKE) $(FILTER) + +cd $(FASTAHACK_DIR) && $(MAKE) $(FILTER) $(LIB_DIR)/libgssw.a: $(GSSW_DIR)/src/gssw.c $(GSSW_DIR)/src/gssw.h - +. ./source_me.sh && cd $(GSSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libgssw.a $(CWD)/$(LIB_DIR)/ && cp src/gssw.h $(CWD)/$(INC_DIR)/ + +cd $(GSSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libgssw.a $(CWD)/$(LIB_DIR)/ && cp src/gssw.h $(CWD)/$(INC_DIR)/ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc +cd $(DEP_DIR)/lru_cache && cp *.h* $(CWD)/$(INC_DIR)/ @@ -749,26 +748,26 @@ $(INC_DIR)/dynamic/dynamic.hpp: $(DYNAMIC_DIR)/include/dynamic/*.hpp $(DYNAMIC_D +mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/dynamic/* $(INC_DIR)/dynamic/ $(INC_DIR)/sparsehash/sparse_hash_map: $(wildcard $(SPARSEHASH_DIR)/**/*.cc) $(wildcard $(SPARSEHASH_DIR)/**/*.h) - +. ./source_me.sh && cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install $(INC_DIR)/sparsepp/spp.h: $(wildcard $(SPARSEHASH_DIR)/sparsepp/*.h) +cp -r $(SPARSEPP_DIR)/sparsepp $(INC_DIR)/ #$(INC_DIR)/Variant.h $(LIB_DIR)/libvcfh.a: $(DEP_DIR)/libVCFH/*.cpp $(DEP_DIR)/libVCFH/*.hpp - +. 
./source_me.sh && cd $(DEP_DIR)/libVCFH && $(MAKE) $(FILTER) && cp libvcfh.a $(CWD)/$(LIB_DIR)/ && cp vcfheader.hpp $(CWD)/$(INC_DIR)/ + +cd $(DEP_DIR)/libVCFH && $(MAKE) $(FILTER) && cp libvcfh.a $(CWD)/$(LIB_DIR)/ && cp vcfheader.hpp $(CWD)/$(INC_DIR)/ $(LIB_DIR)/libsonlib.a: $(CWD)/$(DEP_DIR)/sonLib/C/inc/*.h $(CWD)/$(DEP_DIR)/sonLib/C/impl/*.c - +. ./source_me.sh && cd $(DEP_DIR)/sonLib && $(MAKE) clean && kyotoTycoonLib="" CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/sonLib.a $(CWD)/$(LIB_DIR)/libsonlib.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib + +cd $(DEP_DIR)/sonLib && $(MAKE) clean && kyotoTycoonLib="" CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/sonLib.a $(CWD)/$(LIB_DIR)/libsonlib.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib $(LIB_DIR)/libpinchesandcacti.a: $(LIB_DIR)/libsonlib.a $(CWD)/$(DEP_DIR)/pinchesAndCacti/inc/*.h $(CWD)/$(DEP_DIR)/pinchesAndCacti/impl/*.c - +. ./source_me.sh && cd $(DEP_DIR)/pinchesAndCacti && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cd $(CWD)/$(DEP_DIR)/sonLib && cp lib/stPinchesAndCacti.a $(CWD)/$(LIB_DIR)/libpinchesandcacti.a && cp lib/3EdgeConnected.a $(CWD)/$(LIB_DIR)/lib3edgeconnected.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib + +cd $(DEP_DIR)/pinchesAndCacti && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cd $(CWD)/$(DEP_DIR)/sonLib && cp lib/stPinchesAndCacti.a $(CWD)/$(LIB_DIR)/libpinchesandcacti.a && cp lib/3EdgeConnected.a $(CWD)/$(LIB_DIR)/lib3edgeconnected.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib # When building raptor we need to make sure to pre-generate and fix up the lexer # We also need to clear out its cmake stuff in case it found a wrong Bison and cached it. 
$(LIB_DIR)/libraptor2.a: $(RAPTOR_DIR)/src/* $(wildcard $(RAPTOR_DIR)/build/*) which bison - +. ./source_me.sh && cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" cmake .. && rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) + +cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" cmake .. && rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) +touch $(LIB_DIR)/libraptor2.a # We need rapper from Raptor for the tests @@ -782,7 +781,7 @@ $(INC_DIR)/raptor2/raptor2.h: $(LIB_DIR)/libraptor2.a $(RAPTOR_DIR)/build/* +touch $(INC_DIR)/raptor2/raptor2.h $(LIB_DIR)/libstructures.a: $(STRUCTURES_DIR)/src/include/structures/*.hpp $(STRUCTURES_DIR)/src/*.cpp $(STRUCTURES_DIR)/Makefile - +. ./source_me.sh && cd $(STRUCTURES_DIR) && $(MAKE) clean && $(MAKE) lib/libstructures.a $(FILTER) && cp lib/libstructures.a $(CWD)/$(LIB_DIR)/ && cp -r src/include/structures $(CWD)/$(INC_DIR)/ + +cd $(STRUCTURES_DIR) && $(MAKE) clean && $(MAKE) lib/libstructures.a $(FILTER) && cp lib/libstructures.a $(CWD)/$(LIB_DIR)/ && cp -r src/include/structures $(CWD)/$(INC_DIR)/ $(INC_DIR)/sha1.hpp: $(SHA1_DIR)/sha1.hpp +cp $(SHA1_DIR)/*.h* $(CWD)/$(INC_DIR)/ @@ -812,15 +811,15 @@ $(LIB_DIR)/libdwfl.a: $(LIB_DIR)/libelf.a # running on. 
$(LIB_DIR)/libelf.a: $(ELFUTILS_DIR)/libebl/*.c $(ELFUTILS_DIR)/libebl/*.h $(ELFUTILS_DIR)/libdw/*.c $(ELFUTILS_DIR)/libdw/*.h $(ELFUTILS_DIR)/libelf/*.c $(ELFUTILS_DIR)/libelf/*.h $(ELFUTILS_DIR)/src/*.c $(ELFUTILS_DIR)/src/*.h $(LIB_DIR)/cleaned_old_elfutils +cd $(CWD)/$(INC_DIR)/ && rm -Rf elfutils gelf.h libelf.h dwarf.h libdwflP.h libdwfl.h libebl.h libelf.h - +. ./source_me.sh && cd $(ELFUTILS_DIR) && autoreconf -i -f && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" ./configure --enable-maintainer-mode --disable-libdebuginfod --disable-debuginfod --prefix=$(CWD) $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libelf.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libebl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libebl.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libdwfl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwfl.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libdwelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwelf.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/lib && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libeu.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libcpu && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libcpu.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/backends && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libebl_backends.a $(FILTER) - +. 
./source_me.sh && cd $(ELFUTILS_DIR)/libdw && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libdw.a known-dwarf.h $(FILTER) + +cd $(ELFUTILS_DIR) && autoreconf -i -f && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" ./configure --enable-maintainer-mode --disable-libdebuginfod --disable-debuginfod --prefix=$(CWD) $(FILTER) + +cd $(ELFUTILS_DIR)/libelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libelf.a $(FILTER) + +cd $(ELFUTILS_DIR)/libebl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libebl.a $(FILTER) + +cd $(ELFUTILS_DIR)/libdwfl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwfl.a $(FILTER) + +cd $(ELFUTILS_DIR)/libdwelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwelf.a $(FILTER) + +cd $(ELFUTILS_DIR)/lib && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libeu.a $(FILTER) + +cd $(ELFUTILS_DIR)/libcpu && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libcpu.a $(FILTER) + +cd $(ELFUTILS_DIR)/backends && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libebl_backends.a $(FILTER) + +cd $(ELFUTILS_DIR)/libdw && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libdw.a known-dwarf.h $(FILTER) +cd $(ELFUTILS_DIR) && mkdir -p $(CWD)/$(INC_DIR)/elfutils && cp libdw/known-dwarf.h libdw/libdw.h libebl/libebl.h libelf/elf-knowledge.h version.h libdwfl/libdwfl.h libdwelf/libdwelf.h $(CWD)/$(INC_DIR)/elfutils && cp libelf/gelf.h libelf/libelf.h libdw/dwarf.h $(CWD)/$(INC_DIR) && cp libebl/libebl.a libdw/libdw.a libdwfl/libdwfl.a libdwelf/libdwelf.a libelf/libelf.a $(CWD)/$(LIB_DIR)/ $(OBJ_DIR)/sha1.o: $(SHA1_DIR)/sha1.cpp $(SHA1_DIR)/sha1.hpp @@ -829,30 +828,30 @@ $(SHARED_OBJ_DIR)/sha1.o: $(SHA1_DIR)/sha1.cpp $(SHA1_DIR)/sha1.hpp +$(CXX) $(INCLUDE_FLAGS) 
$(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) $(LIB_DIR)/libfml.a: $(FERMI_DIR)/*.h $(FERMI_DIR)/*.c - . ./source_me.sh && cd $(FERMI_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp *.h $(CWD)/$(INC_DIR)/ && cp libfml.a $(CWD)/$(LIB_DIR)/ + cd $(FERMI_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp *.h $(CWD)/$(INC_DIR)/ && cp libfml.a $(CWD)/$(LIB_DIR)/ # We don't need to hack the build to point at our htslib because sublinearLS gets its htslib from the include flags we set # But we do need to hack out the return type error to work around https://github.com/yoheirosen/sublinear-Li-Stephens/issues/6 # TODO: This probably means actually calling some things in the library is unsafe! $(LIB_DIR)/libsublinearLS.a: $(LINLS_DIR)/src/*.cpp $(LINLS_DIR)/src/*.hpp $(LIB_DIR)/libhts.a - . ./source_me.sh && cd $(LINLS_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(filter-out -Werror=return-type,$(CFLAGS))" CXXFLAGS="-fPIC $(filter-out -Werror=return-type,$(CXXFLAGS))" INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/ + cd $(LINLS_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(filter-out -Werror=return-type,$(CFLAGS))" CXXFLAGS="-fPIC $(filter-out -Werror=return-type,$(CXXFLAGS))" INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/ $(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/Makefile $(LIBBDSG_DIR)/bdsg/src/*.cpp $(LIBBDSG_DIR)/bdsg/include/bdsg/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/internal/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/overlays/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/sparsepp/spp.h 
$(INC_DIR)/dynamic/dynamic.hpp $(INC_DIR)/mio/mmap.hpp - +. ./source_me.sh && rm -Rf $(CWD)/$(INC_DIR)/bdsg && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) CXXFLAGS="$(INCLUDE_FLAGS) -fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && cp -r bdsg/include/* $(CWD)/$(INC_DIR) + +rm -Rf $(CWD)/$(INC_DIR)/bdsg && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) CXXFLAGS="$(INCLUDE_FLAGS) -fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && cp -r bdsg/include/* $(CWD)/$(INC_DIR) $(INC_DIR)/mio/mmap.hpp: $(MIO_DIR)/include/mio/* - +. ./source_me.sh && cp -r $(MIO_DIR)/include/mio $(CWD)/$(INC_DIR)/ + +cp -r $(MIO_DIR)/include/mio $(CWD)/$(INC_DIR)/ # It would be better to copy the atomic_queue directory rather than its contents, but to avoid re-writing mmmultimap... $(INC_DIR)/atomic_queue.h: $(ATOMIC_QUEUE_DIR)/include/* - +. ./source_me.sh && cp -r $(ATOMIC_QUEUE_DIR)/include/atomic_queue/* $(CWD)/$(INC_DIR)/ + +cp -r $(ATOMIC_QUEUE_DIR)/include/atomic_queue/* $(CWD)/$(INC_DIR)/ $(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/mmmultimap.hpp: $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h - +. ./source_me.sh && cp $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(CWD)/$(INC_DIR)/ + +cp $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(CWD)/$(INC_DIR)/ $(INC_DIR)/ips4o.hpp: $(IPS4O_DIR)/ips4o.hpp $(IPS4O_DIR)/ips4o/* - +. ./source_me.sh && cp -r $(IPS4O_DIR)/ips4o* $(CWD)/$(INC_DIR)/ + +cp -r $(IPS4O_DIR)/ips4o* $(CWD)/$(INC_DIR)/ # The xg repo has a cmake build system based all around external projects, and # we need it to use our installed versions of everything instead. 
@@ -860,7 +859,7 @@ $(INC_DIR)/ips4o.hpp: $(IPS4O_DIR)/ips4o.hpp $(IPS4O_DIR)/ips4o/* $(LIB_DIR)/libxg.a: $(XG_DIR)/src/*.hpp $(XG_DIR)/src/*.cpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/ips4o.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h +rm -f $@ +cp -r $(XG_DIR)/src/*.hpp $(CWD)/$(INC_DIR) - +. ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -DNO_GFAKLUGE -c -o $(XG_DIR)/xg.o $(XG_DIR)/src/xg.cpp $(FILTER) + +$(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -DNO_GFAKLUGE -c -o $(XG_DIR)/xg.o $(XG_DIR)/src/xg.cpp $(FILTER) +ar rs $@ $(XG_DIR)/xg.o # Auto-git-versioning @@ -922,42 +921,42 @@ $(OBJ_DIR)/version.o: $(SRC_DIR)/version.cpp $(SRC_DIR)/version.hpp $(SRC_DIR)/v # Use static pattern rules so the dependency files will not be ignored if the output exists # See $(OBJ) $(OBJ_DIR)/main.o: $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cpp $(OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(SHARED_OBJ): $(SHARED_OBJ_DIR)/%.o : $(SRC_DIR)/%.cpp $(SHARED_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) @touch $@ $(ALGORITHMS_OBJ): $(ALGORITHMS_OBJ_DIR)/%.o : $(ALGORITHMS_SRC_DIR)/%.cpp $(ALGORITHMS_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(ALGORITHMS_SHARED_OBJ): $(ALGORITHMS_SHARED_OBJ_DIR)/%.o : $(ALGORITHMS_SRC_DIR)/%.cpp $(ALGORITHMS_SHARED_OBJ_DIR)/%.d $(DEPS) - . 
./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) @touch $@ $(IO_OBJ): $(IO_OBJ_DIR)/%.o : $(IO_SRC_DIR)/%.cpp $(IO_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(IO_SHARED_OBJ): $(IO_SHARED_OBJ_DIR)/%.o : $(IO_SRC_DIR)/%.cpp $(IO_SHARED_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) @touch $@ $(SUBCOMMAND_OBJ): $(SUBCOMMAND_OBJ_DIR)/%.o : $(SUBCOMMAND_SRC_DIR)/%.cpp $(SUBCOMMAND_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(UNITTEST_OBJ): $(UNITTEST_OBJ_DIR)/%.o : $(UNITTEST_SRC_DIR)/%.cpp $(UNITTEST_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(UNITTEST_SUPPORT_OBJ): $(UNITTEST_SUPPORT_OBJ_DIR)/%.o : $(UNITTEST_SUPPORT_SRC_DIR)/%.cpp $(UNITTEST_SUPPORT_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ # Config objects get individual rules $(CONFIG_OBJ_DIR)/allocator_config_jemalloc.o: $(CONFIG_SRC_DIR)/allocator_config_jemalloc.cpp $(CONFIG_OBJ_DIR)/allocator_config_jemalloc.d $(DEPS) $(LIB_DIR)/libjemalloc.a - . 
./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(CONFIG_OBJ_DIR)/allocator_config_jemalloc_debug.o: $(CONFIG_SRC_DIR)/allocator_config_jemalloc_debug.cpp $(CONFIG_OBJ_DIR)/allocator_config_jemalloc_debug.d $(DEPS) $(LIB_DIR)/libjemalloc_debug.a - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(CONFIG_OBJ_DIR)/allocator_config_system.o: $(CONFIG_SRC_DIR)/allocator_config_system.cpp $(CONFIG_OBJ_DIR)/allocator_config_system.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ # Use a fake rule to build .d files, so we don't complain if they don't exist. diff --git a/README.md b/README.md index 200ecff311c..b80954cadff 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ Note that a 64-bit OS is required. Ubuntu 20.04 should work. #### Linux: Build -When you are ready, build with `. ./source_me.sh && make`. You can use `make -j16` to run 16 build threads at a time, which greatly accelerates the process. If you have more CPU cores, you can use higher numbers. +When you are ready, build with `make`. You can use `make -j16` to run 16 build threads at a time, which greatly accelerates the process. If you have more CPU cores, you can use higher numbers. Note that vg can take anywhere from 10 minutes to more than an hour to compile depending on your machine and the number of threads used. @@ -161,7 +161,7 @@ Homebrew provides another package management solution for OSX, and may be prefer With dependencies installed, VG can now be built: - . 
./source_me.sh && make + make As with Linux, you can add `-j16` or other numbers at the end to run multiple build tasks at once, if your computer can handle them. diff --git a/configure.py b/configure.py deleted file mode 100644 index 1290b07696d..00000000000 --- a/configure.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys -import shutil - - -def make_source_me(): - var_dict = { - "LIBRARY_PATH":"lib", - "LD_LIBRARY_PATH":"lib", - "LD_INCLUDE_PATH":"include", - "C_INCLUDE_PATH":"include", - "CPLUS_INCLUDE_PATH":"include", - "INCLUDE_PATH":"include", - "PATH":"bin" } - - my_wd = os.getcwd() - out_file = "source_me.sh" - - with open(out_file, "w") as ofi: - for i in var_dict: - o_line = "export" + " " + i + "=" + my_wd + "/" + var_dict[i] + ":$" + i + "\n" - ofi.write(o_line) - -def set_compiler(): - compiler_dict = { - "CC": "gcc", - "CXX": "g++" - } - - out_file = "source_me.sh" - with open(out_file, "a") as ofi: - for i in compiler_dict: - o_line = "export" + " " + i + "=" + "$(which " + compiler_dict[i] + ")\n" - ofi.write(o_line) - -def check_deps(): - return - -## pulled these out with this line: for i in `cat source_me.sh | cut -f 2 -d " " | cut -f 1 -d "="`; do echo "\"\$$i\":\"\","; done -if __name__ == "__main__": - make_source_me() - set_compiler() -## "$CC":"", - #"$CXX":"", diff --git a/deps/libbdsg b/deps/libbdsg index 88181dbc808..e98cda26c25 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 88181dbc80835e93de681e84e245c891e250f77c +Subproject commit e98cda26c2522bee80292705494dca8dcffa1c5c diff --git a/scripts/setup-server b/scripts/setup-server index e3965b61ca9..794f12d1917 100644 --- a/scripts/setup-server +++ b/scripts/setup-server @@ -36,7 +36,7 @@ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 100 --slave git clone --recursive https://github.com/vgteam/vg.git # build vg -cd vg && source ./source_me.sh && make -j 32 static && sudo cp bin/vg /usr/local/bin/ +cd vg && make -j 32 static && sudo 
cp bin/vg /usr/local/bin/ sudo cp scripts/vg_sim_pos_compare.py /usr/local/bin/ cd ~ diff --git a/source_me.sh b/source_me.sh deleted file mode 100755 index 0bb948c9280..00000000000 --- a/source_me.sh +++ /dev/null @@ -1,23 +0,0 @@ -export LIBRARY_PATH=`pwd`/lib:$LIBRARY_PATH -export LD_LIBRARY_PATH=`pwd`/lib:$LD_LIBRARY_PATH -export DYLD_LIBRARY_PATH=`pwd`/lib:$DYLD_LIBRARY_PATH -export LD_INCLUDE_PATH=`pwd`/include:$LD_INCLUDE_PATH -# Setting include directories via C_INCLUDE_PATH/CPLUS_INCLUDE_PATH will -# automatically get them demoted to the end of the search list even if a -I -# option is passed to try and bump them up earlier, before other -I options. -# We leave the Makefile in charge of finding all the include directories. -export CFLAGS="-I $(pwd)/include ${CFLAGS}" -export CXXFLAGS="-I $(pwd)/include -I$(pwd)/include/dynamic ${CXXFLAGS}" -export PATH=`pwd`/bin:`pwd`/scripts:"$PATH" -export CC=$(which gcc) -export CXX=$(which g++) - -# -# disable until file arguments work as in normal bash :( -# -# add bash autocompletion -#if test -n "$BASH_VERSION" -#then -# -# . ./autocomp.bash -#fi diff --git a/test/Makefile b/test/Makefile index 5ae913873f1..bdf0526ace7 100644 --- a/test/Makefile +++ b/test/Makefile @@ -20,7 +20,7 @@ $(vg): cd .. && $(MAKE) bin/vg build_graph: build_graph.cpp - cd .. && . ./source_me.sh && $(MAKE) test/build_graph + cd .. && $(MAKE) test/build_graph clean: rm -f build_graph diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 4526dac20aa..9a5e53b08f3 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -252,7 +252,6 @@ then if [ "${LOCAL_BUILD}" == "1" ] then # Just build vg here - . ./source_me.sh make -j ${NUM_CORES} if [ "$?" 
-ne 0 ] From dd7157a7d2da845b4e2af991d91bcf8c40682e50 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 11:59:09 -0400 Subject: [PATCH 28/54] Avoid breaking any commands that still want to source source_me.sh --- source_me.sh | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 source_me.sh diff --git a/source_me.sh b/source_me.sh new file mode 100644 index 00000000000..d650cc14afe --- /dev/null +++ b/source_me.sh @@ -0,0 +1,4 @@ +# We used to have a script here to set up all the include and library search +# paths for the vg build. But now the Makefile knows how to do it all for the +# build, and the vg binary knows where to look for its dynamic libraries. +echo 1>&2 "Sourcing source_me.sh is no longer necessary" From 570844ceb311078160dc9d0e810e5a2bfed1376c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 12:46:59 -0400 Subject: [PATCH 29/54] Pass CC and CXX through to SDSL build --- Makefile | 5 ++--- deps/sdsl-lite | 2 +- doc/wiki | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index ff91be32fa5..0e3afd9e14e 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,6 @@ LIB_DIR:=lib # INC_DIR must be a relative path INC_DIR:=include CWD:=$(shell pwd) -CXX ?= g++ PKG_CONFIG ?= pkg-config SFX := @@ -570,9 +569,9 @@ $(LIB_DIR)/libjemalloc_debug.a: $(JEMALLOC_DIR)/src/*.c # See https://stackoverflow.com/a/19822767 $(LIB_DIR)/%sdsl.a $(LIB_DIR)/%divsufsort.a $(LIB_DIR)/%divsufsort64.a : $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp ifeq ($(shell uname -s),Darwin) - +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" CC=$(CC) CXX=$(CXX) ./install.sh $(CWD) $(FILTER) else - +cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && 
BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" CC=$(CC) CXX=$(CXX) ./install.sh $(CWD) $(FILTER) endif $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.cpp $(SSW_DIR)/*.h diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 863d0118cf3..86fa3534c1b 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 863d0118cf303f9c9c55576e0f6b2f70ecd9689a +Subproject commit 86fa3534c1bf02b5468bbe58e2e0b8f6ae2d6fa4 diff --git a/doc/wiki b/doc/wiki index f70ea363837..f28a1e56005 160000 --- a/doc/wiki +++ b/doc/wiki @@ -1 +1 @@ -Subproject commit f70ea36383784fc731454c1b09014f65ad4f74d5 +Subproject commit f28a1e56005c729cf5c2dad6a251447bedba2949 From 66b81ed8202d1db767e6513935bed1cb5ab0a3c0 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 12:52:12 -0400 Subject: [PATCH 30/54] Propagate CC and CXX to all build commands --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 0e3afd9e14e..047e845be3f 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,9 @@ PKG_CONFIG ?= pkg-config SFX := EXE:=vg$(SFX) +# Expose compiler we want to use to all build commands as an environment variable. 
+export CC CXX + all: $(BIN_DIR)/$(EXE) # Magic dependencies (see ) @@ -569,9 +572,9 @@ $(LIB_DIR)/libjemalloc_debug.a: $(JEMALLOC_DIR)/src/*.c # See https://stackoverflow.com/a/19822767 $(LIB_DIR)/%sdsl.a $(LIB_DIR)/%divsufsort.a $(LIB_DIR)/%divsufsort64.a : $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp ifeq ($(shell uname -s),Darwin) - +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" CC=$(CC) CXX=$(CXX) ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) else - +cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" CC=$(CC) CXX=$(CXX) ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) endif $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.cpp $(SSW_DIR)/*.h From c82ae3fbb1af426725ef43c019cd8d9ef64c9f70 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 12:55:19 -0400 Subject: [PATCH 31/54] Pass CXX and CC in CMake compiler format to directly invoked CMakes --- Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 047e845be3f..6b939290cc3 100644 --- a/Makefile +++ b/Makefile @@ -616,9 +616,9 @@ $(INC_DIR)/kff_io.hpp: $(LIB_DIR)/libkff.a $(LIB_DIR)/libkff.a: $(KFF_DIR)/kff_io.cpp $(KFF_DIR)/kff_io.hpp.in ifeq ($(shell uname -s),Darwin) - +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. 
&& AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) else - +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/BooPHF.h: $(BBHASH_DIR)/BooPHF.h @@ -664,10 +664,10 @@ $(LIB_DIR)/cleaned_old_elfutils: $(LIB_DIR)/libvgio.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/pkgconfig/htslib.pc $(LIB_DIR)/cleaned_old_protobuf_v003 $(LIBVGIO_DIR)/CMakeLists.txt $(LIBVGIO_DIR)/src/*.cpp $(LIBVGIO_DIR)/include/vg/io/*.hpp $(LIBVGIO_DIR)/deps/vg.proto +rm -f $(CWD)/$(INC_DIR)/vg.pb.h $(CWD)/$(INC_DIR)/vg/vg.pb.h +rm -Rf $(CWD)/$(INC_DIR)/vg/io/ - +export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. 
$(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install + +export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. $(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install $(LIB_DIR)/libhandlegraph.a: $(LIBHANDLEGRAPH_DIR)/src/include/handlegraph/*.hpp $(LIBHANDLEGRAPH_DIR)/src/*.cpp - +cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. && $(MAKE) $(FILTER) && $(MAKE) install # On Linux, libdeflate builds a .so. @@ -718,7 +718,7 @@ $(LIB_DIR)/libtabixpp.a: $(LIB_DIR)/libhts.a $(TABIXPP_DIR)/*.cpp $(TABIXPP_DIR) # We need to use /usr first for CMake search or Ubuntu 22.04 will decide pybind11 is installed in / when actually it is only fully installed in /usr. 
$(LIB_DIR)/libvcflib.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libtabixpp.a $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.hpp $(VCFLIB_DIR)/contrib/*/*.cpp $(VCFLIB_DIR)/contrib/*/*.h +rm -f $(VCFLIB_DIR)/contrib/WFA2-lib/VERSION - +cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build . + +cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build . +cp $(VCFLIB_DIR)/contrib/filevercmp/*.h* $(INC_DIR) +cp $(VCFLIB_DIR)/contrib/fastahack/*.h* $(INC_DIR) +cp $(VCFLIB_DIR)/contrib/smithwaterman/*.h* $(INC_DIR) @@ -744,7 +744,7 @@ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc $(INC_DIR)/dynamic/dynamic.hpp: $(DYNAMIC_DIR)/include/dynamic/*.hpp $(DYNAMIC_DIR)/include/dynamic/*/*.hpp +rm -Rf $(INC_DIR)/dynamic.hpp $(INC_DIR)/dynamic # annoyingly doesn't have an install option on the cmake, so we manually move their external dependency headers - +cd $(CWD)/$(DYNAMIC_DIR) && rm -Rf build && mkdir -p build && cd build && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && cmake -DCMAKE_VERBOSE_MAKEFILE=ON .. 
&& make && cp -r $(CWD)/$(DYNAMIC_DIR)/deps/hopscotch_map/include/* $(CWD)/$(INC_DIR)/ + +cd $(CWD)/$(DYNAMIC_DIR) && rm -Rf build && mkdir -p build && cd build && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE=ON .. && make && cp -r $(CWD)/$(DYNAMIC_DIR)/deps/hopscotch_map/include/* $(CWD)/$(INC_DIR)/ # Do the copy of the main file last so we can tell if this recipe failed and redo it. # Otherwise we get dynamic.hpp without its deps +mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/dynamic/* $(INC_DIR)/dynamic/ @@ -769,7 +769,7 @@ $(LIB_DIR)/libpinchesandcacti.a: $(LIB_DIR)/libsonlib.a $(CWD)/$(DEP_DIR)/pinche # We also need to clear out its cmake stuff in case it found a wrong Bison and cached it. $(LIB_DIR)/libraptor2.a: $(RAPTOR_DIR)/src/* $(wildcard $(RAPTOR_DIR)/build/*) which bison - +cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" cmake .. && rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) + +cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" .. 
&& rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) +touch $(LIB_DIR)/libraptor2.a # We need rapper from Raptor for the tests From 0c75c1f4e86330e355627fe9bb4c7d0639fdf23b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 14:01:21 -0400 Subject: [PATCH 32/54] Drop removed files from CI build script --- .gitlab-ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 65d745c2e2b..62ac9ac0517 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -86,8 +86,6 @@ local-build-test-job: script: - THREADS=8 - nvm version - - python3 ./configure.py - - source ./source_me.sh - make get-deps - make -j${THREADS} - echo Testing From 89beb2f6a7557e17221da457b0d9ab0d4f8c8296 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 16:01:51 -0400 Subject: [PATCH 33/54] Expose vg to tube map for test --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 62ac9ac0517..3fbdc26907a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -94,6 +94,8 @@ local-build-test-job: - make test - make static -j${THREADS} # Also test as a backend for the tube map + # Tube map expects vg on PATH + - export PATH="$(pwd)/bin:${PATH}" - git clone https://github.com/vgteam/sequenceTubeMap.git - cd sequenceTubeMap # Tube map expects local IPv6 but Kubernetes won't let us have it From 2e16f837fbb8bdd2741dfbc5b841fe829a0502f6 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Mon, 28 Oct 2024 13:50:45 -0700 Subject: [PATCH 34/54] add option to output haplotypes afte vg rna --- src/subcommand/rna_main.cpp | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp index 0e59220fae4..515c2112e16 
100644 --- a/src/subcommand/rna_main.cpp +++ b/src/subcommand/rna_main.cpp @@ -54,6 +54,7 @@ void help_rna(char** argv) { << "\nOutput options:" << endl << " -b, --write-gbwt FILE write pantranscriptome transcript paths as GBWT index file" << endl + << " -v, --write-hap-gbwt FILE write input haplotypes as a GBWT with node IDs matching the output graph" << endl << " -f, --write-fasta FILE write pantranscriptome transcript sequences as fasta file" << endl << " -i, --write-info FILE write pantranscriptome transcript info table as tsv file" << endl << " -q, --out-exclude-ref exclude reference transcripts from pantranscriptome output" << endl @@ -88,6 +89,7 @@ int32_t main_rna(int32_t argc, char** argv) { bool gbwt_add_bidirectional = false; string fasta_out_filename = ""; string info_out_filename = ""; + string hap_gbwt_out_filename = ""; int32_t num_threads = 1; bool show_progress = false; @@ -110,8 +112,9 @@ int32_t main_rna(int32_t argc, char** argv) { {"remove-non-gene", no_argument, 0, 'd'}, {"do-not-sort", no_argument, 0, 'o'}, {"add-ref-paths", no_argument, 0, 'r'}, - {"add-hap-paths", no_argument, 0, 'a'}, + {"add-hap-paths", no_argument, 0, 'a'}, {"write-gbwt", required_argument, 0, 'b'}, + {"write-hap-gbwt", required_argument, 0, 'v'}, {"write-fasta", required_argument, 0, 'f'}, {"write-info", required_argument, 0, 'i'}, {"out-ref-paths", no_argument, 0, 'u'}, @@ -124,7 +127,7 @@ int32_t main_rna(int32_t argc, char** argv) { }; int32_t option_index = 0; - c = getopt_long(argc, argv, "n:m:y:s:l:zjec:k:dorab:f:i:uqgt:ph?", long_options, &option_index); + c = getopt_long(argc, argv, "n:m:y:s:l:zjec:k:dorab:v:f:i:uqgt:ph?", long_options, &option_index); /* Detect the end of the options. 
*/ if (c == -1) @@ -188,10 +191,14 @@ int32_t main_rna(int32_t argc, char** argv) { case 'a': add_projected_transcript_paths = true; break; - + case 'b': gbwt_out_filename = optarg; break; + + case 'v': + hap_gbwt_out_filename = optarg; + break; case 'f': fasta_out_filename = optarg; @@ -486,6 +493,19 @@ int32_t main_rna(int32_t argc, char** argv) { gbwt_builder.finish(); save_gbwt(gbwt_builder.index, gbwt_out_filename); } + + // Write a haplotype GBWT with node IDs updated to match the spliced graph. + if (!hap_gbwt_out_filename.empty()) { + if (!haplotype_index.get()) { + cerr << "[vg rna] Warning: not saving updated haplotypes to " << hap_gbwt_out_filename << " because haplotypes were not provided as input" << endl; + } + else { + ofstream hap_gbwt_ostream; + hap_gbwt_ostream.open(hap_gbwt_out_filename); + + haplotype_index->serialize(hap_gbwt_ostream); + } + } // Write transcript sequences in transcriptome as fasta file. if (!fasta_out_filename.empty()) { @@ -496,7 +516,7 @@ int32_t main_rna(int32_t argc, char** argv) { transcriptome.write_transcript_sequences(&fasta_ostream, exclude_reference_transcripts); fasta_ostream.close(); - } + } // Write transcript info in transcriptome as tsv file. 
if (!info_out_filename.empty()) { From 4e6c34d0b290b3fd5d8849fb297f7b96bc1fa9a8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 30 Oct 2024 10:35:55 -0400 Subject: [PATCH 35/54] Use sublinear-Li-Stephens that can use CXX --- deps/sublinear-Li-Stephens | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/sublinear-Li-Stephens b/deps/sublinear-Li-Stephens index 241001025d7..ef54e081fae 160000 --- a/deps/sublinear-Li-Stephens +++ b/deps/sublinear-Li-Stephens @@ -1 +1 @@ -Subproject commit 241001025d7cf516c0c9577f94e858e76560269f +Subproject commit ef54e081faebb3fef7ff54cf5d4f8068836951f7 From a57b473efb79f6986b03eb5d1db394df38309c3f Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 20:59:09 -0700 Subject: [PATCH 36/54] Start getting rid of the old path/thread terminology --- src/subcommand/gbwt_main.cpp | 75 ++++++++++++++++++------------------ test/t/37_vg_gbwt.t | 62 ++++++++++++++--------------- 2 files changed, 69 insertions(+), 68 deletions(-) diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index b22f0c0ce8e..af88d10f2e9 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -37,7 +37,7 @@ struct GBWTConfig { build_mode build = build_none; merge_mode merge = merge_none; path_cover_mode path_cover = path_cover_none; - bool metadata_mode = false, thread_mode = false; + bool metadata_mode = false, path_mode = false; // Input GBWT construction. HaplotypeIndexer haplotype_indexer; @@ -55,8 +55,8 @@ struct GBWTConfig { // Other parameters and flags. 
bool show_progress = false; - bool count_threads = false; - bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, thread_names = false, tags = false; + bool count_paths = false; + bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, path_names = false, tags = false; bool include_named_paths = false; size_t num_paths = default_num_paths(), context_length = default_context_length(); bool num_paths_set = false; @@ -70,7 +70,7 @@ struct GBWTConfig { // File names. std::string gbwt_output; // Output GBWT. - std::string thread_output; // Threads in SDSL format. + std::string path_output; // Paths in SDSL format. std::string graph_output; // Output GBWTGraph. std::string segment_translation; // Segment to node translation output. std::string r_index_name; // Output r-index. @@ -163,7 +163,7 @@ void step_4_path_cover(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& con void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_6_r_index(GBWTHandler& gbwts, GBWTConfig& config); void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config); -void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config); +void step_8_paths(GBWTHandler& gbwts, GBWTConfig& config); void report_time_memory(const std::string& what, double start_time, const GBWTConfig& config); void print_metadata(std::ostream& out, const GBWTHandler& gbwts); @@ -233,9 +233,9 @@ int main_gbwt(int argc, char** argv) { step_7_metadata(gbwts, config); } - // Thread options. - if (config.thread_mode) { - step_8_threads(gbwts, config); + // Path options. 
+ if (config.path_mode) { + step_8_paths(gbwts, config); } return 0; @@ -260,7 +260,7 @@ void help_gbwt(char** argv) { std::cerr << " --id-interval N store path ids at one out of N positions (default " << gbwt::DynamicGBWT::SAMPLE_INTERVAL << ")" << std::endl; std::cerr << std::endl; std::cerr << "Multithreading:" << std::endl; - std::cerr << " --num-jobs N use at most N parallel build jobs (for -v, -G, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; + std::cerr << " --num-jobs N use at most N parallel build jobs (for -v, -G, -A, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; std::cerr << " --num-threads N use N parallel search threads (for -b and -r; default " << omp_get_max_threads() << ")" << std::endl; std::cerr << std::endl; std::cerr << "Step 1: GBWT construction (requires -o and one of { -v, -G, -Z, -E, A }):" << std::endl; @@ -330,12 +330,12 @@ void help_gbwt(char** argv) { std::cerr << " -H, --haplotypes print the number of haplotypes" << std::endl; std::cerr << " -S, --samples print the number of samples" << std::endl; std::cerr << " -L, --list-names list contig/sample names (use with -C or -S)" << std::endl; - std::cerr << " -T, --thread-names list thread names" << std::endl; + std::cerr << " -T, --thread-names list path names" << std::endl; // FIXME change to --path-names; make --thread-names a deprecated alias std::cerr << " --tags list GBWT tags" << std::endl; std::cerr << std::endl; - std::cerr << "Step 8: Threads (one input GBWT):" << std::endl; - std::cerr << " -c, --count-threads print the number of threads" << std::endl; - std::cerr << " -e, --extract FILE extract threads in SDSL format to FILE" << std::endl; + std::cerr << "Step 8: Paths (one input GBWT):" << std::endl; + std::cerr << " -c, --count-threads print the number of paths" << std::endl; // FIXME also here, change to --count-paths + std::cerr << " -e, --extract FILE extract paths in SDSL format to FILE" << std::endl; std::cerr << 
std::endl; } @@ -519,11 +519,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { { "haplotypes", no_argument, 0, 'H' }, { "samples", no_argument, 0, 'S' }, { "list-names", no_argument, 0, 'L' }, - { "thread-names", no_argument, 0, 'T' }, + { "thread-names", no_argument, 0, 'T' }, // FIXME { "tags", no_argument, 0, OPT_TAGS }, - // Threads - { "count-threads", no_argument, 0, 'c' }, + // Paths + { "count-threads", no_argument, 0, 'c' }, // FIXME { "extract", required_argument, 0, 'e' }, { "help", no_argument, 0, 'h' }, @@ -864,7 +864,7 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.metadata_mode = true; break; case 'T': - config.thread_names = true; + config.path_names = true; config.metadata_mode = true; break; case OPT_TAGS: @@ -872,14 +872,14 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.metadata_mode = true; break; - // Threads + // Paths case 'c': - config.count_threads = true; - config.thread_mode = true; + config.count_paths = true; + config.path_mode = true; break; case 'e': - config.thread_output = optarg; - config.thread_mode = true; + config.path_output = optarg; + config.path_mode = true; break; case 'h': @@ -1077,9 +1077,9 @@ void validate_gbwt_config(GBWTConfig& config) { } } - if (config.thread_mode) { + if (config.path_mode) { if (!one_input_gbwt) { - std::cerr << "error: [vg gbwt] thread operations require one input GBWT" << std::endl; + std::cerr << "error: [vg gbwt] path operations require one input GBWT" << std::endl; std::exit(EXIT_FAILURE); } } @@ -1373,6 +1373,7 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co if (config.show_progress) { std::cerr << "Input type: " << (config.gam_format ? "GAM" : "GAF") << std::endl; } + // FIXME: Parallelize this. std::unique_ptr temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? 
"GAM" : "GAF")); gbwts.use(*temp); } @@ -1458,7 +1459,7 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { gbwts.use_dynamic(); if (!(gbwts.dynamic.hasMetadata() && gbwts.dynamic.metadata.hasPathNames() && gbwts.dynamic.metadata.hasSampleNames())) { - std::cerr << "error: [vg gbwt] the index does not contain metadata with thread and sample names" << std::endl; + std::cerr << "error: [vg gbwt] the index does not contain metadata with path and sample names" << std::endl; std::exit(EXIT_FAILURE); } @@ -1471,11 +1472,11 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { } std::vector path_ids = gbwts.dynamic.metadata.removeSample(sample_id); if (path_ids.empty()) { - std::cerr << "warning: [vg gbwt] no threads associated with sample " << sample_name << std::endl; + std::cerr << "warning: [vg gbwt] no paths associated with sample " << sample_name << std::endl; continue; } if (config.show_progress) { - std::cerr << "Removing " << path_ids.size() << " threads for sample " << sample_name << std::endl; + std::cerr << "Removing " << path_ids.size() << " paths for sample " << sample_name << std::endl; } gbwts.dynamic.remove(path_ids); } @@ -1666,7 +1667,7 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { } } - if (config.thread_names) { + if (config.path_names) { auto& metadata = get_metadata(); if (metadata.hasPathNames()) { // Precompute some metadata @@ -1676,7 +1677,7 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { std::cout << gbwtgraph::compose_path_name(gbwts.compressed, i, sense) << std::endl; } } else { - std::cerr << "error: [vg gbwt] the metadata does not contain thread names" << std::endl; + std::cerr << "error: [vg gbwt] the metadata does not contain path names" << std::endl; } } @@ -1692,19 +1693,19 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { //---------------------------------------------------------------------------- -void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) 
{ - // Extract threads in SDSL format. - if (!config.thread_output.empty()) { +void step_8_paths(GBWTHandler& gbwts, GBWTConfig& config) { + // Extract paths in SDSL format. + if (!config.path_output.empty()) { double start = gbwt::readTimer(); if (config.show_progress) { - std::cerr << "Extracting threads to " << config.thread_output << std::endl; + std::cerr << "Extracting paths to " << config.path_output << std::endl; } gbwts.use_compressed(); if (config.show_progress) { std::cerr << "Starting the extraction" << std::endl; } gbwt::size_type node_width = gbwt::bit_length(gbwts.compressed.sigma() - 1); - gbwt::text_buffer_type out(config.thread_output, std::ios::out, gbwt::MEGABYTE, node_width); + gbwt::text_buffer_type out(config.path_output, std::ios::out, gbwt::MEGABYTE, node_width); for (gbwt::size_type id = 0; id < gbwts.compressed.sequences(); id += 2) { // Ignore reverse complements. gbwt::vector_type sequence = gbwts.compressed.extract(id); for (auto node : sequence) { @@ -1713,11 +1714,11 @@ void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) { out.push_back(gbwt::ENDMARKER); } out.close(); - report_time_memory("Threads extracted", start, config); + report_time_memory("Paths extracted", start, config); } - // There are two sequences for each thread. - if (config.count_threads) { + // There are two sequences for each path. + if (config.count_paths) { gbwts.use_compressed(); std::cout << (gbwts.compressed.sequences() / 2) << std::endl; } diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 6c11196fe7a..9fdc0c8a4fb 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -33,11 +33,11 @@ cmp x.gbwt parse_x.gbwt is $? 
0 "identical construction results with vg gbwt and from VCF parse" # Single chromosome: metadata for haplotypes -is $(vg gbwt -c x.gbwt) 2 "chromosome X: 2 threads" +is $(vg gbwt -c x.gbwt) 2 "chromosome X: 2 paths" is $(vg gbwt -C x.gbwt) 1 "chromosome X: 1 contig" is $(vg gbwt -H x.gbwt) 2 "chromosome X: 2 haplotypes" is $(vg gbwt -S x.gbwt) 1 "chromosome X: 1 sample" -is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 thread names" +is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 path names" is $(vg gbwt -C -L x.gbwt | wc -l) 1 "chromosome X: 1 contig name" is $(vg gbwt -S -L x.gbwt | wc -l) 1 "chromosome X: 1 sample name" @@ -52,7 +52,7 @@ vg index -G x2.ref.gbwt -T x.vg is $? 0 "chromosome X reference GBWT with vg index" cmp x.ref.gbwt x2.ref.gbwt is $? 0 "identical construction results with vg gbwt and vg index" -is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 thread" +is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 path" rm -f x.ref.gbwt x2.ref.gbwt @@ -115,7 +115,7 @@ vg gbwt -x xy-alt.xg -o xy.1000gp.gbwt --preset 1000gp -v small/xy2.vcf.gz is $? 0 "construction preset: 1000gp" # Multiple chromosomes: metadata for haplotypes -is $(vg gbwt -c xy.merge.gbwt) 4 "multiple chromosomes: 4 threads" +is $(vg gbwt -c xy.merge.gbwt) 4 "multiple chromosomes: 4 paths" is $(vg gbwt -C xy.merge.gbwt) 2 "multiple chromosomes: 2 contigs" is $(vg gbwt -H xy.merge.gbwt) 2 "multiple chromosomes: 2 haplotypes" is $(vg gbwt -S xy.merge.gbwt) 1 "multiple chromosomes: 1 sample" @@ -131,7 +131,7 @@ vg index -G xy2.contigs.gbwt -T xy.xg is $? 0 "paths as contigs with vg index" cmp xy.contigs.gbwt xy2.contigs.gbwt is $? 
0 "identical construction results with vg gbwt and vg index" -is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 threads" +is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 paths" is $(vg gbwt -C xy.contigs.gbwt) 2 "paths as contigs: 2 contigs" is $(vg gbwt -H xy.contigs.gbwt) 1 "paths as contigs: 1 haplotype" is $(vg gbwt -S xy.contigs.gbwt) 1 "paths as contigs: 1 sample" @@ -194,35 +194,35 @@ rm -f x.gbwt empty.gbwt x2.gbwt vg gbwt -x xy-alt.xg -o xy.gbwt -v small/xy2.vcf.gz vg gbwt -E -o xy.ref.gbwt -x xy.xg vg gbwt -m -o xy.both.gbwt xy.gbwt xy.ref.gbwt -is $(vg gbwt -c xy.both.gbwt) 6 "haplotypes and paths: 6 threads" +is $(vg gbwt -c xy.both.gbwt) 6 "haplotypes and paths: 6 paths" # Remove the reference sample that GBWTs use for paths vg gbwt -R _gbwt_ref -o xy.removed.gbwt xy.both.gbwt is $? 0 "samples can be removed from a GBWT index" -is $(vg gbwt -c xy.removed.gbwt) 4 "haplotypes only: 4 threads" +is $(vg gbwt -c xy.removed.gbwt) 4 "haplotypes only: 4 paths" rm -f xy.gbwt xy.ref.gbwt xy.both.gbwt xy.removed.gbwt # Build a three-sample GBWT from a simple GFA vg gbwt -o all.gbwt -G graphs/three_samples.gfa -is $(vg gbwt -c all.gbwt) 12 "all samples: 12 threads" +is $(vg gbwt -c all.gbwt) 12 "all samples: 12 paths" is $(vg gbwt -H all.gbwt) 6 "all samples: 6 haplotypes" # Remove samples 1 and 3 vg gbwt -R sample1 -R sample3 -o removed.gbwt all.gbwt is $? 0 "multiple samples can be removed from a GBWT index" -is $(vg gbwt -c removed.gbwt) 4 "sample 2: 4 threads" +is $(vg gbwt -c removed.gbwt) 4 "sample 2: 4 paths" is $(vg gbwt -H removed.gbwt) 2 "sample 2: 2 haplotypes" rm -f all.gbwt removed.gbwt -# Extract threads from GBWT +# Extract paths from GBWT vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz vg gbwt -e x.extract x.gbwt -is $? 0 "threads can be extracted from GBWT" -is $(cat x.extract | wc -c) 121 "correct size for the thread file" +is $? 
0 "paths can be extracted from GBWT" +is $(cat x.extract | wc -c) 121 "correct size for the paths file" rm -f x.gbwt x.extract @@ -271,7 +271,7 @@ rm -f extracted.gbwt extracted2.gbwt extracted2.gg vg gbwt -P -n 16 -x xy.xg -g xy.cover.gg -o xy.cover.gbwt is $? 0 "Path cover GBWTGraph construction" is $(md5sum xy.cover.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.cover.gbwt) 32 "path cover: 32 threads" +is $(vg gbwt -c xy.cover.gbwt) 32 "path cover: 32 paths" is $(vg gbwt -C xy.cover.gbwt) 2 "path cover: 2 contigs" is $(vg gbwt -H xy.cover.gbwt) 16 "path cover: 16 haplotypes" is $(vg gbwt -S xy.cover.gbwt) 16 "path cover: 16 samples" @@ -282,10 +282,10 @@ rm -f xy.cover.gg xy.cover.gbwt vg gbwt -P -n 16 -x xy.xg -g xy.cover.gg -o xy.cover.gbwt --pass-paths is $? 0 "Path cover GBWTGraph construction" is $(md5sum xy.cover.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.cover.gbwt) 34 "path cover w/ paths: 34 threads" -is $(vg gbwt -C xy.cover.gbwt) 2 "path cover w/ paths: 2 contigs" -is $(vg gbwt -H xy.cover.gbwt) 17 "path cover w/ paths: 17 haplotypes" -is $(vg gbwt -S xy.cover.gbwt) 17 "path cover w/ paths: 17 samples" +is $(vg gbwt -c xy.cover.gbwt) 34 "path cover w/ reference paths: 34 paths" +is $(vg gbwt -C xy.cover.gbwt) 2 "path cover w/ reference paths: 2 contigs" +is $(vg gbwt -H xy.cover.gbwt) 17 "path cover w/ reference paths: 17 haplotypes" +is $(vg gbwt -S xy.cover.gbwt) 17 "path cover w/ reference paths: 17 samples" rm -f xy.cover.gg xy.cover.gbwt @@ -293,7 +293,7 @@ rm -f xy.cover.gg xy.cover.gbwt vg gbwt -x xy-alt.xg -g xy.local.gg -l -n 16 -o xy.local.gbwt -v small/xy2.vcf.gz is $? 
0 "Local haplotypes GBWTGraph construction" is $(md5sum xy.local.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.local.gbwt) 32 "local haplotypes: 32 threads" +is $(vg gbwt -c xy.local.gbwt) 32 "local haplotypes: 32 paths" is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes: 2 contigs" is $(vg gbwt -H xy.local.gbwt) 16 "local haplotypes: 16 haplotypes" is $(vg gbwt -S xy.local.gbwt) 16 "local haplotypes: 16 samples" @@ -314,10 +314,10 @@ rm -f xy.local.gg xy.local.gbwt xy.local.gbz vg gbwt -x xy-alt.xg -g xy.local.gg -l -n 16 -o xy.local.gbwt -v small/xy2.vcf.gz --pass-paths is $? 0 "Local haplotypes GBWTGraph construction" is $(md5sum xy.local.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.local.gbwt) 34 "local haplotypes w/ paths: 34 threads" -is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs" -is $(vg gbwt -H xy.local.gbwt) 17 "local haplotypes w/ paths: 17 haplotypes" -is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ paths: 17 samples" +is $(vg gbwt -c xy.local.gbwt) 34 "local haplotypes w/ reference paths: 34 paths" +is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes w/ reference paths: 2 contigs" +is $(vg gbwt -H xy.local.gbwt) 17 "local haplotypes w/ reference paths: 17 haplotypes" +is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ reference paths: 17 samples" rm -f xy.local.gg xy.local.gbwt @@ -325,11 +325,11 @@ rm -f xy.local.gg xy.local.gbwt vg gbwt -G haplotype-sampling/micb-kir3dl1.gfa -g large.gbz --gbz-format vg gbwt -Z large.gbz -l -n 16 --pass-paths -o large.local.gbwt is $? 
0 "Local haplotypes with reference paths from a larger GBZ" -is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ paths: 36 threads" -is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs" -is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ paths: 18 haplotypes" -is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ paths: 18 samples" -is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ paths: reference_samples set" +is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ reference paths: 36 paths" +is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ reference paths: 2 contigs" +is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 haplotypes" +is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 samples" +is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ reference paths: reference_samples set" rm -f large.gbz large.local.gbwt @@ -339,7 +339,7 @@ vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz vg gbwt -a -n 16 -x xy.xg -g augmented.gg -o augmented.gbwt x.gbwt is $? 0 "Augmented GBWTGraph construction" is $(md5sum augmented.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly" -is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 threads" +is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 paths" is $(vg gbwt -C augmented.gbwt) 2 "augmented: 2 contigs" is $(vg gbwt -H augmented.gbwt) 18 "augmented: 18 haplotypes" is $(vg gbwt -S augmented.gbwt) 17 "augmented: 17 samples" @@ -355,7 +355,7 @@ rm -f x.vg y.vg x.xg xy.xg xy-alt.xg vg gbwt -o gfa.gbwt -G graphs/components_walks.gfa is $? 
0 "GBWT construction from GFA" is $(md5sum gfa.gbwt | cut -f 1 -d\ ) 44c27c37c7af6911c26aea2a41008460 "GBWT was serialized correctly" -is $(vg gbwt -c gfa.gbwt) 4 "gfa: 4 threads" +is $(vg gbwt -c gfa.gbwt) 4 "gfa: 4 paths" is $(vg gbwt -C gfa.gbwt) 2 "gfa: 2 contigs" is $(vg gbwt -H gfa.gbwt) 2 "gfa: 2 haplotypes" is $(vg gbwt -S gfa.gbwt) 1 "gfa: 1 sample" @@ -376,7 +376,7 @@ is $(md5sum gfa2.gbz | cut -f 1 -d\ ) ab241a3f79a781a367b701cb8888bf01 "GBZ was # Build GBWT and GBWTGraph from GFA with node chopping vg gbwt -o chopping.gbwt -g chopping.gg --translation chopping.trans --max-node 2 -G graphs/chopping_walks.gfa is $? 0 "GBWT+GBWTGraph construction from GFA with chopping" -is $(vg gbwt -c chopping.gbwt) 3 "chopping: 3 threads" +is $(vg gbwt -c chopping.gbwt) 3 "chopping: 3 paths" is $(vg gbwt -C chopping.gbwt) 1 "chopping: 1 contig" is $(vg gbwt -H chopping.gbwt) 3 "chopping: 3 haplotypes" is $(vg gbwt -S chopping.gbwt) 2 "chopping: 2 samples" @@ -391,7 +391,7 @@ is $(wc -l < from_gbz.trans) 8 "from GBZ: 8 translations" # Build GBWT and GBWTGraph from GFA with both paths and walks vg gbwt -o ref_paths.gbwt -g ref_paths.gg --translation ref_paths.trans -G graphs/components_paths_walks.gfa is $? 
0 "GBWT+GBWTGraph construction from GFA with reference paths" -is $(vg gbwt -c ref_paths.gbwt) 6 "ref paths: 6 threads" +is $(vg gbwt -c ref_paths.gbwt) 6 "ref paths: 6 paths" is $(vg gbwt -C ref_paths.gbwt) 2 "ref paths: 2 contigs" is $(vg gbwt -H ref_paths.gbwt) 3 "ref paths: 3 haplotypes" is $(vg gbwt -S ref_paths.gbwt) 2 "ref paths: 2 samples" From 6377996ef9f148b02e6e890e023d32dbf01ad96c Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 21:13:18 -0700 Subject: [PATCH 37/54] Rename some long options --- src/subcommand/gbwt_main.cpp | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index af88d10f2e9..5e55928a4fe 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -330,11 +330,11 @@ void help_gbwt(char** argv) { std::cerr << " -H, --haplotypes print the number of haplotypes" << std::endl; std::cerr << " -S, --samples print the number of samples" << std::endl; std::cerr << " -L, --list-names list contig/sample names (use with -C or -S)" << std::endl; - std::cerr << " -T, --thread-names list path names" << std::endl; // FIXME change to --path-names; make --thread-names a deprecated alias + std::cerr << " -T, --path-names list path names" << std::endl; std::cerr << " --tags list GBWT tags" << std::endl; std::cerr << std::endl; std::cerr << "Step 8: Paths (one input GBWT):" << std::endl; - std::cerr << " -c, --count-threads print the number of paths" << std::endl; // FIXME also here, change to --count-paths + std::cerr << " -c, --count-paths print the number of paths" << std::endl; std::cerr << " -e, --extract FILE extract paths in SDSL format to FILE" << std::endl; std::cerr << std::endl; } @@ -424,7 +424,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { constexpr int OPT_PASS_PATHS = 1400; constexpr int OPT_GBZ_FORMAT = 1500; constexpr int OPT_TAGS = 1700; - + + // Deprecated options. 
+ constexpr int OPT_THREAD_NAMES = 2000; + constexpr int OPT_COUNT_THREADS = 2001; + // Make a collection of all the known tags and their descriptions. Use an ordered map so that we can do some typo guessing. // Values are description and list of prohibited characters. const std::map>> KNOWN_TAGS = { @@ -519,11 +523,13 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { { "haplotypes", no_argument, 0, 'H' }, { "samples", no_argument, 0, 'S' }, { "list-names", no_argument, 0, 'L' }, - { "thread-names", no_argument, 0, 'T' }, // FIXME + { "path-names", no_argument, 0, 'T' }, + { "thread-names", no_argument, 0, OPT_THREAD_NAMES }, { "tags", no_argument, 0, OPT_TAGS }, // Paths - { "count-threads", no_argument, 0, 'c' }, // FIXME + { "count-paths", no_argument, 0, 'c' }, + { "count-threads", no_argument, 0, OPT_COUNT_THREADS }, { "extract", required_argument, 0, 'e' }, { "help", no_argument, 0, 'h' }, @@ -867,6 +873,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.path_names = true; config.metadata_mode = true; break; + case OPT_THREAD_NAMES: + std::cerr << "warning: [vg gbwt] option --thread-names is deprecated; use --path-names instead" << std::endl; + config.path_names = true; + config.metadata_mode = true; + break; case OPT_TAGS: config.tags = true; config.metadata_mode = true; @@ -877,6 +888,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.count_paths = true; config.path_mode = true; break; + case OPT_COUNT_THREADS: + std::cerr << "warning: [vg gbwt] option --count-threads is deprecated; use --count-paths instead" << std::endl; + config.count_paths = true; + config.path_mode = true; + break; case 'e': config.path_output = optarg; config.path_mode = true; From 7c6b977c989b90e538cd7b1ab1ff422b718473d2 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 22:28:58 -0700 Subject: [PATCH 38/54] Remove GBWT options from vg index --- src/subcommand/index_main.cpp | 218 ++++------------------------------ 
test/t/05_vg_find.t | 4 +- test/t/06_vg_index.t | 73 ++---------- test/t/07_vg_map.t | 3 +- test/t/11_vg_paths.t | 3 +- test/t/13_vg_sim.t | 4 +- test/t/18_vg_call.t | 2 +- test/t/26_deconstruct.t | 3 +- test/t/33_vg_mpmap.t | 3 +- test/t/37_vg_gbwt.t | 32 +---- test/t/38_vg_prune.t | 6 +- test/t/46_vg_minimizer.t | 6 +- 12 files changed, 62 insertions(+), 295 deletions(-) diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 26a761162f2..731fdcff26d 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -1,4 +1,4 @@ -// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and GBWT indexes +// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and distance indexes #include #include @@ -11,7 +11,6 @@ #include "subcommand.hpp" #include "../vg.hpp" -#include "../haplotype_indexer.hpp" #include "xg.hpp" #include #include @@ -23,12 +22,10 @@ #include "../integrated_snarl_finder.hpp" #include "../snarl_distance_index.hpp" #include "../source_sink_overlay.hpp" -#include "../gbwt_helper.hpp" #include "../gbwtgraph_helper.hpp" #include "../gcsa_helper.hpp" #include -#include #include #include @@ -47,24 +44,6 @@ void help_index(char** argv) { << "xg options:" << endl << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl << " -L, --xg-alts include alt paths in xg" << endl - << "gbwt options (more in vg gbwt):" << endl - << " -v, --vcf-phasing FILE generate threads from the haplotypes in the VCF file FILE" << endl - << " -W, --ignore-missing don't warn when variants in the VCF are missing from the graph; silently skip them" << endl - << " -T, --store-threads generate threads from the embedded paths" << endl - << " -M, --store-gam FILE generate threads from the alignments in gam FILE (many allowed)" << endl - << " -F, --store-gaf FILE generate threads from the alignments in gaf FILE (many allowed)" << endl - 
<< " -G, --gbwt-name FILE store the threads as GBWT in FILE" << endl - << " -z, --actual-phasing do not make unphased homozygous genotypes phased"<< endl - << " -P, --force-phasing replace unphased genotypes with randomly phased ones" << endl - << " -o, --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved" << endl - << " -B, --batch-size N number of samples per batch (default 200)" << endl - << " -u, --buffer-size N GBWT construction buffer size in millions of nodes (default 100)" << endl - << " -n, --id-interval N store haplotype ids at one out of N positions (default 1024)" << endl - << " -R, --range X..Y process samples X to Y (inclusive)" << endl - << " -r, --rename V=P rename contig V in the VCFs to path P in the graph (may repeat)" << endl - << " --rename-variants when renaming contigs, find variants in the graph based on the new name" << endl - << " -I, --region C:S-E operate on only the given 1-based region of the given VCF contig (may repeat)" << endl - << " -E, --exclude SAMPLE exclude any samples with the given name from haplotype indexing" << endl << "gcsa options:" << endl << " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl //<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl @@ -83,12 +62,6 @@ void help_index(char** argv) { << " if N is 0 then don't store distances, only the snarl tree" << endl; } -void multiple_thread_sources() { - std::cerr << "error: [vg index] cannot generate threads from multiple sources (VCF, GAM, GAF, paths)" << std::endl; - std::cerr << "error: [vg index] GBWT indexes can be built separately and merged with vg gbwt" << std::endl; - std::exit(EXIT_FAILURE); -} - int main_index(int argc, char** argv) { if (argc == 2) { @@ -101,25 +74,18 @@ int main_index(int argc, char** argv) { #define OPT_DISTANCE_SNARL_LIMIT 1002 // Which indexes to build. 
- bool build_xg = false, build_gbwt = false, build_gcsa = false, build_dist = false; + bool build_xg = false, build_gcsa = false, build_dist = false; // Files we should read. string vcf_name, mapping_name; vector dbg_names; // Files we should write. - string xg_name, gbwt_name, gcsa_name, dist_name; - + string xg_name, gcsa_name, dist_name; // General bool show_progress = false; - // GBWT - HaplotypeIndexer haplotype_indexer; - enum thread_source_type { thread_source_none, thread_source_vcf, thread_source_paths, thread_source_gam, thread_source_gaf }; - thread_source_type thread_source = thread_source_none; - vector aln_file_names; - // GCSA gcsa::size_type kmer_size = gcsa::Key::MAX_LENGTH; gcsa::ConstructionParameters params; @@ -152,7 +118,7 @@ int main_index(int argc, char** argv) { {"thread-db", required_argument, 0, 'F'}, {"xg-alts", no_argument, 0, 'L'}, - // GBWT + // GBWT. These have been removed and will return an error. {"vcf-phasing", required_argument, 0, 'v'}, {"ignore-missing", no_argument, 0, 'W'}, {"store-threads", no_argument, 0, 'T'}, @@ -211,7 +177,6 @@ int main_index(int argc, char** argv) { break; case 'p': show_progress = true; - haplotype_indexer.show_progress = true; break; // XG @@ -223,111 +188,26 @@ int main_index(int argc, char** argv) { xg_alts = true; break; - // GBWT - case 'v': - if (thread_source != thread_source_none) { - multiple_thread_sources(); - } - thread_source = thread_source_vcf; - vcf_name = optarg; - break; - case 'W': - haplotype_indexer.warn_on_missing_variants = false; - break; - case 'T': - if (thread_source != thread_source_none) { - multiple_thread_sources(); - } - thread_source = thread_source_paths; - break; - case 'M': - if (thread_source != thread_source_none && thread_source != thread_source_gam) { - multiple_thread_sources(); - } - thread_source = thread_source_gam; - build_gbwt = true; - aln_file_names.push_back(optarg); - break; - case 'F': - if (thread_source != thread_source_none && thread_source != 
thread_source_gaf) { - multiple_thread_sources(); - } - thread_source = thread_source_gaf; - build_gbwt = true; - aln_file_names.push_back(optarg); - break; - case 'G': - build_gbwt = true; - gbwt_name = optarg; - break; - case 'z': - haplotype_indexer.phase_homozygous = false; - break; - case 'P': - haplotype_indexer.force_phasing = true; - break; - case 'o': - haplotype_indexer.discard_overlaps = true; - break; - case 'B': - haplotype_indexer.samples_in_batch = std::max(parse(optarg), 1ul); - break; - case 'u': - haplotype_indexer.gbwt_buffer_size = std::max(parse(optarg), 1ul); - break; - case 'n': - haplotype_indexer.id_interval = parse(optarg); - break; - case 'R': - { - // Parse first..last - string temp(optarg); - size_t found = temp.find(".."); - if(found == string::npos || found == 0 || found + 2 == temp.size()) { - cerr << "error: [vg index] could not parse range " << temp << endl; - exit(1); - } - haplotype_indexer.sample_range.first = parse(temp.substr(0, found)); - haplotype_indexer.sample_range.second = parse(temp.substr(found + 2)) + 1; - } - break; - case 'r': - { - // Parse the rename old=new - string key_value(optarg); - auto found = key_value.find('='); - if (found == string::npos || found == 0 || found + 1 == key_value.size()) { - cerr << "error: [vg index] could not parse rename " << key_value << endl; - exit(1); - } - // Parse out the two parts - string vcf_contig = key_value.substr(0, found); - string graph_contig = key_value.substr(found + 1); - // Add the name mapping - haplotype_indexer.path_to_vcf[graph_contig] = vcf_contig; - } - break; - case OPT_RENAME_VARIANTS: - haplotype_indexer.rename_variants = true; - break; - case 'I': - { - // We want to parse this region specifier - string region(optarg); - - Region parsed; - parse_region(region, parsed); - if (parsed.start <= 0 || parsed.end <= 0) { - // We need both range bounds, and we can't accept 0 since input is 1-based. 
- cerr << "error: [vg index] could not parse 1-based region " << optarg << endl; - } - - // Make sure to correct the coordinates to 0-based exclusive-end, from 1-based inclusive-end - haplotype_indexer.regions[parsed.seq] = make_pair((size_t) (parsed.start - 1), (size_t) parsed.end); - } - break; + // GBWT. The options remain, but they are no longer supported. + case 'v': // Fall through + case 'W': // Fall through + case 'T': // Fall through + case 'M': // Fall through + case 'F': // Fall through + case 'G': // Fall through + case 'z': // Fall through + case 'P': // Fall through + case 'o': // Fall through + case 'B': // Fall through + case 'u': // Fall through + case 'n': // Fall through + case 'R': // Fall through + case 'r': // Fall through + case OPT_RENAME_VARIANTS: // Fall through + case 'I': // Fall through case 'E': - haplotype_indexer.excluded_samples.insert(optarg); + std::cerr << "error: [vg index] GBWT construction options have been removed; use vg gbwt instead" << std::endl; + std::exit(EXIT_FAILURE); break; // GCSA @@ -391,37 +271,11 @@ int main_index(int argc, char** argv) { } - if (xg_name.empty() && gbwt_name.empty() && - gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) { + if (xg_name.empty() && gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) { cerr << "error: [vg index] index type not specified" << endl; return 1; } - if (build_gbwt && thread_source == thread_source_none) { - cerr << "error: [vg index] cannot build GBWT without threads" << endl; - return 1; - } - - if (thread_source != thread_source_none && !build_gbwt) { - cerr << "error: [vg index] no GBWT output specified for the threads" << endl; - return 1; - } - - if (thread_source == thread_source_gam || thread_source == thread_source_gaf) { - for (const auto& name : aln_file_names) { - if (name == "-") { - cerr << "error: [vg index] GAM (-M) and GAF (-F) input files cannot be read from stdin (-)" << endl; - return 1; - } - } 
- } - - if (thread_source != thread_source_none && file_names.size() != 1) { - cerr << "error: [vg index] exactly one graph required for generating threads" << std::endl; - cerr << "error: [vg index] you may combine the graphs with vg index -x combined.xg --xg-alts" << std::endl; - return 1; - } - if (file_names.size() <= 0 && dbg_names.empty()){ //cerr << "No graph provided for indexing. Please provide a .vg file or GCSA2-format deBruijn graph to index." << endl; //return 1; @@ -481,30 +335,6 @@ int main_index(int argc, char** argv) { vg::io::save_handle_graph(&xg_index, xg_name); } - // Generate threads - if (thread_source != thread_source_none) { - - // Load the only input graph. - unique_ptr path_handle_graph; - path_handle_graph = vg::io::VPKG::load_one(file_names[0]); - - std::unique_ptr gbwt_index(nullptr); - if (thread_source == thread_source_vcf) { - std::vector parse_files = haplotype_indexer.parse_vcf(vcf_name, *path_handle_graph); - path_handle_graph.reset(); // Save memory by deleting the graph. - gbwt_index = haplotype_indexer.build_gbwt(parse_files); - } else if (thread_source == thread_source_paths) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph); - } else if (thread_source == thread_source_gam) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAM"); - } else if (thread_source == thread_source_gaf) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAF"); - } - if (build_gbwt && gbwt_index.get() != nullptr) { - save_gbwt(*gbwt_index, gbwt_name, show_progress); - } - } // End of thread indexing. 
- // Build GCSA if (build_gcsa) { diff --git a/test/t/05_vg_find.t b/test/t/05_vg_find.t index f7c54e36eee..67a51741da1 100644 --- a/test/t/05_vg_find.t +++ b/test/t/05_vg_find.t @@ -112,7 +112,7 @@ rm -f t.xg t.vg t.x:30:35.vg t.x:10:20.vg q.x:30:35.vg q.x:10:20.vg t.bed vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg index -x x.xg x.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -v small/xy2.vcf.gz -o x.gbwt -x x.vg is $(vg find -p x -x x.xg -K 16 -H x.gbwt | cut -f 5 | sort | uniq -c | tail -n 1 | awk '{ print $1 }') 1510 "we find the expected number of kmers with haplotype frequency equal to 2" rm -f x.vg x.xg x.gbwt @@ -124,7 +124,7 @@ is $? 0 "GFA i/o for find -n consistent with converting both ways" # Find nodes that map to the provided ids vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -v small/xy2.vcf.gz -o x.gbwt -x x.vg vg prune -u -m x.mapping -g x.gbwt -e 1 x.vg > x.unfolded.vg rm -f expected.gfa diff --git a/test/t/06_vg_index.t b/test/t/06_vg_index.t index a8ac6add8dc..0e3edf78dbc 100644 --- a/test/t/06_vg_index.t +++ b/test/t/06_vg_index.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="en_US.utf8" # force ekg's favorite sort order -plan tests 54 +plan tests 45 # Single graph without haplotypes vg construct -r small/x.fa -v small/x.vcf.gz > x.vg @@ -39,9 +39,6 @@ rm -f x3.gcsa x3.gcsa.lcp # Single graph with haplotypes vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg -vg index -G x.gbwt -v small/x.vcf.gz x.vg -is $? 0 "building a GBWT index of a graph with haplotypes" - vg index -x x.xg x.vg is $? 0 "building an XG index of a graph with haplotypes" @@ -55,51 +52,25 @@ is $(vg paths -x x-ap.xg -L | wc -l) $(vg paths -v x.vg -L | wc -l) "xg index do vg index -g x.gcsa x.vg is $? 
0 "building a GCSA index of a graph with haplotypes" -vg index -x x2.xg -G x2.gbwt -v small/x.vcf.gz -g x2.gcsa x.vg +vg index -x x2.xg -g x2.gcsa x.vg is $? 0 "building all indexes at once" -cmp x.xg x2.xg && cmp x.gbwt x2.gbwt && cmp x.gcsa x2.gcsa && cmp x.gcsa.lcp x2.gcsa.lcp +cmp x.xg x2.xg && cmp x.gcsa x2.gcsa && cmp x.gcsa.lcp x2.gcsa.lcp is $? 0 "the indexes are identical" -vg index -x x2-ap.xg -G x2-ap.gbwt -v small/x.vcf.gz -g x2-ap.gcsa x.vg -L +vg index -x x2-ap.xg -g x2-ap.gcsa x.vg -L is $? 0 "building all indexes at once, while leaving alt paths in xg" -cmp x.gbwt x2-ap.gbwt && cmp x.gcsa x2-ap.gcsa && cmp x.gcsa.lcp x2-ap.gcsa.lcp +cmp x.gcsa x2-ap.gcsa && cmp x.gcsa.lcp x2-ap.gcsa.lcp is $? 0 "the indexes are identical with -L" is $(vg paths -x x2-ap.xg -L | wc -l) $(vg paths -v x.vg -L | wc -l) "xg index does contains alt paths with index -L all at once" -# Exclude a sample from the GBWT index -vg index -G empty.gbwt -v small/x.vcf.gz --exclude 1 x.vg -is $? 0 "samples can be excluded from haplotype indexing" -is $(vg gbwt -c empty.gbwt) 0 "excluded samples were not included in the GBWT index" - -# Make GBWT from GAM -vg paths -v x.vg -X -Q _alt > x-alts.gam -vg index x.vg -M x-alts.gam -G x-gam.gbwt -# Make GBWT from GAF -vg convert x.vg -G x-alts.gam > x-alts.gaf -vg index x.vg -F x-alts.gaf -G x-gaf.gbwt -cmp x-gaf.gbwt x-gam.gbwt -is $? 0 "GBWT from GAF same as from GAM" - rm -f x.vg -rm -f x.xg x-ap.xg x.gbwtx.gcsa x.gcsa.lcp -rm -f x2.xg x2.gbwt x2.gcsa x2.gcsa.lcp -rm -f x2-ap.xg x2-ap.gbwt x2-ap.gcsa x2-ap.gcsa.lcp -rm -f empty.gbwt -rm -f x-alts.gam x-alts.gaf x-gam.gbwt x-gaf.gbwt - - -# Subregion graph with haplotypes -vg construct -r small/x.fa -v small/x.vcf.gz -a --region x:100-200 > x.part.vg - -vg index -x x.part.xg -G x.part.gbwt --region x:100-200 -v small/x.vcf.gz x.part.vg 2>log.txt -is $? 
0 "building GBWT index for a regional graph" - -is "$(cat log.txt | wc -c)" "0" "no warnings about missing variants produced" - -rm -f x.part.vg x.part.xg x.part.gbwt log.txt +rm -f x.xg x-ap.xg x.gcsa x.gcsa.lcp +rm -f x2.xg x2.gcsa x2.gcsa.lcp +rm -f x2-ap.xg x2-ap.gcsa x2-ap.gcsa.lcp +rm -f x-alts.gam x-alts.gaf # Multiple graphs without haplotypes @@ -129,9 +100,6 @@ vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j x.vg y.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg && vg index -G y.gbwt -v small/xy2.vcf.gz y.vg && vg gbwt -m -f -o xy.gbwt x.gbwt y.gbwt -is $? 0 "building a GBWT index of multiple graphs with haplotypes" - vg index -x xy.xg x.vg y.vg is $? 0 "building an XG index of multiple graphs with haplotypes" @@ -144,31 +112,14 @@ is $? 0 "building XG and GCSA indexes at once" vg index -x xy-alt.xg -L x.vg y.vg is $? 0 "building an XG index with alt paths" -vg index -G xy2.gbwt -v small/xy2.vcf.gz xy-alt.xg -is $? 0 "building a GBWT index from an XG index" - -cmp xy.xg xy2.xg && cmp xy.gcsa xy2.gcsa && cmp xy.gcsa.lcp xy2.gcsa.lcp && cmp xy.gbwt xy2.gbwt +cmp xy.xg xy2.xg && cmp xy.gcsa xy2.gcsa && cmp xy.gcsa.lcp xy2.gcsa.lcp is $? 0 "the indexes are identical" rm -f x.vg y.vg -rm -f x.gbwt y.gbwt -rm -f xy.xg xy.gbwt xy.gcsa xy.gcsa.lcp -rm -f xy2.xg xy2.gbwt xy2.gcsa xy2.gcsa.lcp +rm -f xy.xg xy.gcsa xy.gcsa.lcp +rm -f xy2.xg xy2.gcsa xy2.gcsa.lcp rm -f xy-alt.xg - -# GBWT construction options -vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null - -vg index -G x_ref.gbwt -T x.vg -is $? 0 "GBWT can be built for paths" - -rm -f x_ref.gbwt - -# We do not test GBWT construction parameters (-B, -u, -n) because they matter only for large inputs. -# We do not test chromosome-length path generation (-P, -o) for the same reason. 
- - # Other tests vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg vg index -x x.xg x.vg bogus123.vg 2>/dev/null diff --git a/test/t/07_vg_map.t b/test/t/07_vg_map.t index 08a99edc398..a7c1e41c9a8 100644 --- a/test/t/07_vg_map.t +++ b/test/t/07_vg_map.t @@ -152,7 +152,8 @@ is $? 1 "error on vg map -f (paired, RHS)" # Now do the GBWT vg construct -a -r small/x.fa -v small/x.vcf.gz >x.vg -vg index -x x.xg -g x.gcsa -v small/x.vcf.gz --gbwt-name x.gbwt -k 16 x.vg +vg index -x x.xg -g x.gcsa -k 16 x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg # This read is all ref which matches no haplotype in x.vcf.gz and visits some unused nodes is "$(vg map -x x.xg -g x.gcsa --gbwt-name x.gbwt --hap-exp 1 --full-l-bonus 0 -f reads/x.unvisited.fq -j | jq -r '.score')" "36" "mapping a read that touches unused nodes gets the base score" diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index e21a8fe2c09..e83fa8de659 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -11,7 +11,8 @@ plan tests 26 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg -vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg +vg index -x x.xg x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg # List path/thread names from various input formats is "$(vg paths --list -v x2.vg)" "x" "path listing works from vg" diff --git a/test/t/13_vg_sim.t b/test/t/13_vg_sim.t index 0481a82e463..ecc0799eeda 100644 --- a/test/t/13_vg_sim.t +++ b/test/t/13_vg_sim.t @@ -11,7 +11,7 @@ plan tests 36 vg construct -r small/x.fa -v small/x.vcf.gz >x.vg vg construct -r small/x.fa -v small/x.vcf.gz -a >x2.vg vg index -x x.xg x.vg -vg index -G x.gbwt -v small/x.vcf.gz x2.vg +vg gbwt -o x.gbwt -v small/x.vcf.gz -x x2.vg is $(vg sim -l 100 -n 100 -x x.xg | wc -l) 100 \ "vg sim creates the correct number of reads" @@ -58,7 +58,7 @@ rm -f x.vg x2.vg x.xg x.gbwt n.vg n.fa n.xg vg construct -r small/xy.fa -v small/x.vcf.gz -a >xy.vg vg index -x xy.xg xy.vg -vg 
index -G xy.gbwt -v small/x.vcf.gz xy.vg +vg gbwt -o xy.gbwt -v small/x.vcf.gz -x xy.vg vg sim -s 12345 -n 1000 -l 2 -e 0.1 -x xy.xg -g xy.gbwt --sample-name 1 --any-path >/dev/null is $? "0" "Sample simulation works along with --any-path" diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index 6a457f4eb9f..b97cee3fb08 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -89,7 +89,7 @@ is "${REF_COUNT_V}" "${REF_COUNT_A}" "Same number of reference calls with -a as # Output snarl traversals into a GBWT then genotype that vg call HGSVC_alts.xg -k HGSVC_alts.pack -s HG00514 -T | gzip > HGSVC_travs.gaf.gz -vg index HGSVC_alts.xg -F HGSVC_travs.gaf.gz -G HGSVC_travs.gbwt +vg gbwt -o HGSVC_travs.gbwt -x HGSVC_alts.xg -A HGSVC_travs.gaf.gz vg call HGSVC_alts.xg -k HGSVC_alts.pack -g HGSVC_travs.gbwt -s HG00514 > HGSVC_travs.vcf vg call HGSVC_alts.xg -k HGSVC_alts.pack -s HG00514 > HGSVC_direct.vcf # extract the called genotypes diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 03c7f8f3a87..47a28c4ae35 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -121,7 +121,8 @@ is "$?" 
0 "deconstructing vg graph gives same output as xg graph" rm -f tiny_names.gfa tiny_names.vg tiny_names.xg tiny_names_decon.vcf tiny_names_decon_vg.vcf vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg -vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg +vg index -x x.xg x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg vg deconstruct x.xg -g x.gbwt | bgzip > x.decon.vcf.gz tabix -f -p vcf x.decon.vcf.gz cat small/x.fa | bcftools consensus small/x.vcf.gz -s 1 -H 1 > small.s1.h1.fa diff --git a/test/t/33_vg_mpmap.t b/test/t/33_vg_mpmap.t index 16af8c4cef9..e569b2b9ed6 100644 --- a/test/t/33_vg_mpmap.t +++ b/test/t/33_vg_mpmap.t @@ -11,7 +11,8 @@ plan tests 21 # Exercise the GBWT # Index a couple of nearly identical contigs vg construct -m 1000 -a -r small/xy.fa -v small/xy2.vcf.gz >xy2.vg -vg index -x xy2.xg -g xy2.gcsa -v small/xy2.vcf.gz --gbwt-name xy2.gbwt -k 16 xy2.vg +vg index -x xy2.xg -g xy2.gcsa -k 16 xy2.vg +vg gbwt -v small/xy2.vcf.gz -o xy2.gbwt -x xy2.vg # We turn off the background model calibration with -B and ignore it with -P 1 diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 9fdc0c8a4fb..870b5d9674a 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 159 +plan tests 149 # Build vg graphs for two chromosomes @@ -21,10 +21,6 @@ vg index -x xy-alt.xg -L x.vg y.vg # Single chromosome: haplotypes vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz is $? 0 "chromosome X GBWT with vg gbwt" -vg index -G x2.gbwt -v small/xy2.vcf.gz x.vg -is $? 0 "chromosome X GBWT with vg index" -cmp x.gbwt x2.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" vg gbwt -x x.vg -o parse --parse-only -v small/xy2.vcf.gz is $? 
0 "chromosome X VCF parse" ../deps/gbwt/bin/build_gbwt -p -r parse_x > /dev/null 2> /dev/null @@ -41,20 +37,16 @@ is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 path names" is $(vg gbwt -C -L x.gbwt | wc -l) 1 "chromosome X: 1 contig name" is $(vg gbwt -S -L x.gbwt | wc -l) 1 "chromosome X: 1 sample name" -rm -f x.gbwt x2.gbwt parse_x.gbwt +rm -f x.gbwt parse_x.gbwt rm -f parse_x parse_x_0_1 # Single chromosome: paths vg gbwt -E -o x.ref.gbwt -x x.vg is $? 0 "chromosome X reference GBWT with vg gbwt" -vg index -G x2.ref.gbwt -T x.vg -is $? 0 "chromosome X reference GBWT with vg index" -cmp x.ref.gbwt x2.ref.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 path" -rm -f x.ref.gbwt x2.ref.gbwt +rm -f x.ref.gbwt # Single chromosome: alignments @@ -62,17 +54,13 @@ vg paths -v x.vg -X -Q _alt > x.alts.gam vg convert -G x.alts.gam x.vg > x.alts.gaf vg gbwt -A -o x.alts.gaf.gbwt -x x.vg x.alts.gaf is $? 0 "chromosome X GAF with vg gbwt" -vg index -F x.alts.gaf -G x2.alts.gaf.gbwt x.vg -is $? 0 "chromosome X GAF with vg index" -cmp x.alts.gaf.gbwt x2.alts.gaf.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" vg gbwt -A --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam is $? 0 "chromosome X GAM with vg gbwt" cmp x.alts.gaf.gbwt x.alts.gaf.gbwt is $? 0 "identical construction results from GAF and GAM" rm -f x.alts.gam x.alts.gaf -rm -f x.alts.gaf.gbwt x2.alts.gaf.gbwt x.alts.gam.gbwt +rm -f x.alts.gaf.gbwt x.alts.gam.gbwt # Graph region: haplotypes @@ -80,12 +68,8 @@ vg construct -r small/x.fa -v small/x.vcf.gz -a --region x:100-200 > x.part.vg vg gbwt -x x.part.vg -o x.part.gbwt --vcf-region x:100-200 -v small/x.vcf.gz 2> log.txt is $? 0 "chromosome X subgraph GBWT with vg gbwt" is "$(cat log.txt | wc -c)" 0 "no warnings about missing variants" -vg index -G x2.part.gbwt --region x:100-200 -v small/x.vcf.gz x.part.vg 2> log.txt -is $? 
0 "chromosome X subgraph GBWT with vg index" -cmp x.part.gbwt x2.part.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" -rm -f x.part.vg x.part.gbwt x2.part.gbwt log.txt +rm -f x.part.vg x.part.gbwt log.txt # Multiple chromosomes: haplotypes @@ -127,16 +111,12 @@ rm -f xy.1000gp.gbwt # Multiple chromosomes: paths as contigs vg gbwt -E -o xy.contigs.gbwt -x xy.xg is $? 0 "paths as contigs with vg gbwt" -vg index -G xy2.contigs.gbwt -T xy.xg -is $? 0 "paths as contigs with vg index" -cmp xy.contigs.gbwt xy2.contigs.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 paths" is $(vg gbwt -C xy.contigs.gbwt) 2 "paths as contigs: 2 contigs" is $(vg gbwt -H xy.contigs.gbwt) 1 "paths as contigs: 1 haplotype" is $(vg gbwt -S xy.contigs.gbwt) 1 "paths as contigs: 1 sample" -rm -f xy.contigs.gbwt xy2.contigs.gbwt +rm -f xy.contigs.gbwt # Build an r-index diff --git a/test/t/38_vg_prune.t b/test/t/38_vg_prune.t index 3cd5db9cde0..1aa7533c260 100644 --- a/test/t/38_vg_prune.t +++ b/test/t/38_vg_prune.t @@ -10,7 +10,7 @@ plan tests 21 # Build a graph with one path and two threads vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -o x.gbwt -v small/xy2.vcf.gz -x x.vg # Basic pruning: 5 components, 51 nodes, 51 edges vg prune -e 1 x.vg > y.vg @@ -54,8 +54,8 @@ rm -f x.vg x.gbwt vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j -m xy.mapping x.vg y.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg -vg index -G y.gbwt -v small/xy2.vcf.gz y.vg +vg gbwt -o x.gbwt -v small/xy2.vcf.gz -x x.vg +vg gbwt -o y.gbwt -v small/xy2.vcf.gz -x y.vg # Prune a single-chromosome graph using multi-chromosome GBWT vg gbwt -m -o xy.gbwt x.gbwt y.gbwt diff --git a/test/t/46_vg_minimizer.t 
b/test/t/46_vg_minimizer.t index 06f9133f09d..e4090b54c9d 100644 --- a/test/t/46_vg_minimizer.t +++ b/test/t/46_vg_minimizer.t @@ -62,8 +62,10 @@ rm -f x.vg x.xg x.gbwt x.snarls x.dist x.mi x.gg x.gbz vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j x.vg y.vg -vg index -x x.xg -G x.gbwt -v small/xy2.vcf.gz x.vg -vg index -x y.xg -G y.gbwt -v small/xy2.vcf.gz y.vg +vg index -x x.xg x.vg +vg gbwt -o x.gbwt -x x.vg -v small/xy2.vcf.gz +vg index -x y.xg y.vg +vg gbwt -o y.gbwt -x y.vg -v small/xy2.vcf.gz # Appending to the index vg minimizer --no-dist -t 1 -o x.mi -g x.gbwt x.xg From 5a2c3af1fda989c2e31ad9da5c1897ec999ac91a Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 23:01:56 -0700 Subject: [PATCH 39/54] Update SDSL, GCSA, GBWT, GBWTGraph --- deps/gbwt | 2 +- deps/gbwtgraph | 2 +- deps/gcsa2 | 2 +- deps/sdsl-lite | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deps/gbwt b/deps/gbwt index 029c2b45667..2aab62f0664 160000 --- a/deps/gbwt +++ b/deps/gbwt @@ -1 +1 @@ -Subproject commit 029c2b456675eab0fbb55de21c8002e1660cb0d3 +Subproject commit 2aab62f0664b2ce8eb4cffd6b3872c89b85fdfa6 diff --git a/deps/gbwtgraph b/deps/gbwtgraph index de61d340695..d2a423b5eb5 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit de61d340695f64b8aa978816b195d6687e7fda5a +Subproject commit d2a423b5eb54c3c2f0676cb2925b46ad0ba7e189 diff --git a/deps/gcsa2 b/deps/gcsa2 index 7dbf9305817..8b6b049ab64 160000 --- a/deps/gcsa2 +++ b/deps/gcsa2 @@ -1 +1 @@ -Subproject commit 7dbf93058171b5af420f1b6166d4d2bf54cd8748 +Subproject commit 8b6b049ab6444e891bb7c38ec8f38169ce62e5fb diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 86fa3534c1b..115dfd5b737 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 86fa3534c1bf02b5468bbe58e2e0b8f6ae2d6fa4 +Subproject commit 
115dfd5b7371ef99a1e0611389b956bb88bf9fc4 From ba0ed68ebff2d05dc23e6190b572284d7f750527 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Sat, 2 Nov 2024 04:09:56 -0700 Subject: [PATCH 40/54] Multithreaded GAF/GAM to GBWT --- src/gbwt_helper.cpp | 20 ----- src/gbwt_helper.hpp | 7 -- src/haplotype_indexer.cpp | 146 +++++++++++++++++++++++++++-------- src/haplotype_indexer.hpp | 11 ++- src/subcommand/gbwt_main.cpp | 4 +- test/t/37_vg_gbwt.t | 4 +- 6 files changed, 126 insertions(+), 66 deletions(-) diff --git a/src/gbwt_helper.cpp b/src/gbwt_helper.cpp index 0958ba45533..19306a7dfe3 100644 --- a/src/gbwt_helper.cpp +++ b/src/gbwt_helper.cpp @@ -88,26 +88,6 @@ gbwt::size_type gbwt_node_width(const HandleGraph& graph) { return gbwt::bit_length(gbwt::Node::encode(graph.max_node_id(), true)); } -void finish_gbwt_constuction(gbwt::GBWTBuilder& builder, - const std::vector& sample_names, - const std::vector& contig_names, - size_t haplotype_count, bool print_metadata, - const std::string& header) { - - builder.finish(); - builder.index.metadata.setSamples(sample_names); - builder.index.metadata.setHaplotypes(haplotype_count); - builder.index.metadata.setContigs(contig_names); - if (print_metadata) { - #pragma omp critical - { - std::cerr << header << ": "; - gbwt::operator<<(std::cerr, builder.index.metadata); - std::cerr << std::endl; - } - } -} - //------------------------------------------------------------------------------ void load_gbwt(gbwt::GBWT& index, const std::string& filename, bool show_progress) { diff --git a/src/gbwt_helper.hpp b/src/gbwt_helper.hpp index 33e1545066c..57b819a579b 100644 --- a/src/gbwt_helper.hpp +++ b/src/gbwt_helper.hpp @@ -67,13 +67,6 @@ gbwt::vector_type path_predecessors(const PathHandleGraph& graph, const std::str /// Determine the node width in bits for the GBWT nodes based on the given graph. gbwt::size_type gbwt_node_width(const HandleGraph& graph); -/// Finish GBWT construction and optionally print the metadata. 
-void finish_gbwt_constuction(gbwt::GBWTBuilder& builder, - const std::vector& sample_names, - const std::vector& contig_names, - size_t haplotype_count, bool print_metadata, - const std::string& header = "GBWT"); - //------------------------------------------------------------------------------ /* diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp index 528160485cd..b7c935d7e05 100644 --- a/src/haplotype_indexer.cpp +++ b/src/haplotype_indexer.cpp @@ -4,10 +4,12 @@ #include #include +#include #include #include #include +#include #include #include "gbwt_helper.hpp" @@ -394,7 +396,7 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const std::vecto // And add every path that passes the filter (including haplotype paths) from the source graph. gbwtgraph::store_paths(builder, *graph, {PathSense::GENERIC, PathSense::REFERENCE, PathSense::HAPLOTYPE}, &path_filter); - // Finish the construction for this set of threads and put the index back. + // Finish the construction for this set of paths and put the index back. builder.finish(); builder.swapIndex(*index); } @@ -413,24 +415,43 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandle return build_gbwt({}, "GBWT", &graph); } -std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandleGraph& graph, - const std::vector& aln_filenames, const std::string& aln_format) const { +std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& graph, + const std::vector& aln_filenames, const std::string& aln_format, size_t parallel_jobs) const { - // GBWT metadata. - std::vector sample_names, contig_names; - std::map> sample_info; // name -> (id, count) - contig_names.push_back("0"); // An artificial contig. - size_t haplotype_count = 0; + // Handle multithreading and parallel jobs. + parallel_jobs = std::max(1, parallel_jobs); + int old_threads = omp_get_max_threads(); + omp_set_num_threads(parallel_jobs); + + // Partition the graph into construction jobs. 
+ if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Partitioning the graph into GBWT construction jobs" << std::endl; + } + } + size_t target_size = graph.get_node_count() / parallel_jobs; + gbwtgraph::ConstructionJobs jobs = gbwtgraph::gbwt_construction_jobs(graph, target_size); + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Created " << jobs.size() << " parallel construction jobs" << std::endl; + } + } // GBWT construction. - gbwt::GBWTBuilder builder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval); - builder.index.addMetadata(); + std::vector builder_mutexes(jobs.size()); + std::vector> builders(jobs.size()); + std::vector> read_names(jobs.size()); // TODO: Concatenated strings + starting offsets may save space. + for (size_t i = 0; i < jobs.size(); i++) { + builders[i].reset(new gbwt::GBWTBuilder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval)); + } // Actual work. if (this->show_progress) { #pragma omp critical { - std::cerr << "Converting " << aln_format << " to threads" << std::endl; + std::cerr << "Converting " << aln_format << " to GBWT paths" << std::endl; } } std::function lambda = [&](Alignment& aln) { @@ -438,37 +459,98 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandle for (auto& m : aln.path().mapping()) { buffer.push_back(mapping_to_gbwt(m)); } - builder.insert(buffer, true); // Insert in both orientations. 
- size_t sample_id = 0, sample_count = 0; - auto iter = sample_info.find(aln.name()); - if (iter == sample_info.end()) { - sample_id = sample_names.size(); - sample_names.push_back(aln.name()); - sample_info[aln.name()] = std::pair(sample_id, sample_count); - haplotype_count++; - } else { - sample_id = iter->second.first; - sample_count = iter->second.second; - iter->second.second++; + size_t job_id = 0; + if (buffer.size() > 0) { + job_id = jobs.job(gbwt::Node::id(buffer.front())); + if (job_id >= jobs.size()) { + job_id = 0; + } + } + { + // Insert the path into the appropriate builder and record the read name. + std::lock_guard lock(builder_mutexes[job_id]); + builders[job_id]->insert(buffer, true); + read_names[job_id].push_back(aln.name()); } - builder.index.metadata.addPath(sample_id, 0, 0, sample_count); }; for (auto& file_name : aln_filenames) { if (aln_format == "GAM") { get_input_file(file_name, [&](istream& in) { - vg::io::for_each(in, lambda); + vg::io::for_each_parallel(in, lambda); }); } else { assert(aln_format == "GAF"); - vg::io::gaf_unpaired_for_each(graph, file_name, lambda); + vg::io::gaf_unpaired_for_each_parallel(graph, file_name, lambda); } } - - // Finish the construction and extract the index. - finish_gbwt_constuction(builder, sample_names, contig_names, haplotype_count, this->show_progress); - std::unique_ptr built(new gbwt::DynamicGBWT()); - builder.swapIndex(*built); - return built; + + // Finish the construction and convert to compressed GBWT. + std::vector partial_indexes(jobs.size()); + #pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < jobs.size(); i++) { + builders[i]->finish(); + partial_indexes[i] = gbwt::GBWT(builders[i]->index); + builders[i].reset(); + } + + // Merge the partial indexes. 
+ if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Merging the partial GBWTs" << std::endl; + } + } + std::unique_ptr result(new gbwt::GBWT(partial_indexes)); + partial_indexes.clear(); + + // Create the metadata. + // TODO: This is quite slow. + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Creating metadata" << std::endl; + } + } + result->addMetadata(); + result->metadata.setContigs({ "unknown" }); + { + std::map> read_info; // name -> (sample id, fragment count) + for (auto& names : read_names) { + for (const std::string& name : names) { + size_t sample_id = 0, fragment_count = 0; + auto iter = read_info.find(name); + if (iter == read_info.end()) { + sample_id = read_info.size(); + read_info[name] = std::pair(sample_id, fragment_count); + } else { + sample_id = iter->second.first; + fragment_count = iter->second.second; + iter->second.second++; + } + result->metadata.addPath(sample_id, 0, 0, fragment_count); + } + names = std::vector(); + } + std::vector sample_names(read_info.size()); + for (auto& p : read_info) { + sample_names[p.second.first] = p.first; + } + read_info.clear(); + result->metadata.setSamples(sample_names); + result->metadata.setHaplotypes(sample_names.size()); + } + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "GBWT: "; + gbwt::operator<<(std::cerr, result->metadata); + std::cerr << std::endl; + } + } + + // Restore the number of threads. + omp_set_num_threads(old_threads); + return result; } } diff --git a/src/haplotype_indexer.hpp b/src/haplotype_indexer.hpp index f555aec947b..ca212a94267 100644 --- a/src/haplotype_indexer.hpp +++ b/src/haplotype_indexer.hpp @@ -135,10 +135,15 @@ class HaplotypeIndexer : public Progressive { * the same name, the corresponding GBWT path names will have the same * sample identifier but different values in the count field. * - * aln_format can be "GAM" or "GAF" + * aln_format can be "GAM" or "GAF". 
+ *
+ * Runs approximately the given number of jobs in parallel. The exact
+ * number depends on the sizes of weakly connected components in the graph.
+ * Each job uses at most 2 threads.
  */
-    std::unique_ptr build_gbwt(const PathHandleGraph& graph,
-        const std::vector& aln_filenames, const std::string& aln_format) const;
+    std::unique_ptr build_gbwt(const HandleGraph& graph,
+        const std::vector& aln_filenames, const std::string& aln_format,
+        size_t parallel_jobs = 1) const;
 };
 
 }
diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp
index 5e55928a4fe..4eca4e47a81 100644
--- a/src/subcommand/gbwt_main.cpp
+++ b/src/subcommand/gbwt_main.cpp
@@ -1389,8 +1389,8 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co
     if (config.show_progress) {
         std::cerr << "Input type: " << (config.gam_format ? "GAM" : "GAF") << std::endl;
     }
-    // FIXME: Parallelize this.
-    std::unique_ptr temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF"));
+    std::unique_ptr temp =
+        config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF"), config.build_jobs);
     gbwts.use(*temp);
 }
 
diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t
index 870b5d9674a..1046da788d3 100644
--- a/test/t/37_vg_gbwt.t
+++ b/test/t/37_vg_gbwt.t
@@ -52,9 +52,9 @@ rm -f x.ref.gbwt
 
 # Single chromosome: alignments
 vg paths -v x.vg -X -Q _alt > x.alts.gam
 vg convert -G x.alts.gam x.vg > x.alts.gaf
-vg gbwt -A -o x.alts.gaf.gbwt -x x.vg x.alts.gaf
+vg gbwt -A --num-jobs 1 -o x.alts.gaf.gbwt -x x.vg x.alts.gaf
 is $? 0 "chromosome X GAF with vg gbwt"
-vg gbwt -A --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam
+vg gbwt -A --num-jobs 1 --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam
 is $? 0 "chromosome X GAM with vg gbwt"
 cmp x.alts.gaf.gbwt x.alts.gam.gbwt
 is $?
0 "identical construction results from GAF and GAM" From a585bbcffa162f08dbe2ae9a745e37dde8bb5595 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Sun, 3 Nov 2024 01:31:20 -0800 Subject: [PATCH 41/54] Faster metadata construction for GAM/GAF GBWT --- src/haplotype_indexer.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp index b7c935d7e05..75399a86e64 100644 --- a/src/haplotype_indexer.cpp +++ b/src/haplotype_indexer.cpp @@ -2,8 +2,9 @@ * \file haplotype_indexer.cpp: implementations of haplotype indexing with the GBWT */ +#include "haplotype_indexer.hpp" + #include -#include #include #include #include @@ -14,10 +15,9 @@ #include "gbwt_helper.hpp" -#include "haplotype_indexer.hpp" - -#include "path.hpp" #include "alignment.hpp" +#include "hash_map.hpp" +#include "path.hpp" using namespace std; @@ -442,7 +442,9 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap // GBWT construction. std::vector builder_mutexes(jobs.size()); std::vector> builders(jobs.size()); - std::vector> read_names(jobs.size()); // TODO: Concatenated strings + starting offsets may save space. + // This is a bit inefficient, as read names are often longer than the SSO threshold for GCC (but not for Clang). + // TODO: Maybe use concatenated 0-terminated names? + std::vector> read_names(jobs.size()); for (size_t i = 0; i < jobs.size(); i++) { builders[i].reset(new gbwt::GBWTBuilder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval)); } @@ -504,7 +506,6 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap partial_indexes.clear(); // Create the metadata. - // TODO: This is quite slow. 
if (this->show_progress) { #pragma omp critical { @@ -514,14 +515,15 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap result->addMetadata(); result->metadata.setContigs({ "unknown" }); { - std::map> read_info; // name -> (sample id, fragment count) + // We can use 32-bit values, as GBWT metadata uses them as well. + string_hash_map> read_info; // name -> (sample id, fragment count) for (auto& names : read_names) { for (const std::string& name : names) { - size_t sample_id = 0, fragment_count = 0; + std::uint32_t sample_id = 0, fragment_count = 0; auto iter = read_info.find(name); if (iter == read_info.end()) { sample_id = read_info.size(); - read_info[name] = std::pair(sample_id, fragment_count); + read_info[name] = std::make_pair(sample_id, fragment_count); } else { sample_id = iter->second.first; fragment_count = iter->second.second; @@ -535,7 +537,7 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap for (auto& p : read_info) { sample_names[p.second.first] = p.first; } - read_info.clear(); + read_info = string_hash_map>(); result->metadata.setSamples(sample_names); result->metadata.setHaplotypes(sample_names.size()); } From c1b58030cd2bc4079ff12ea6ec441e3d19cc791f Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Mon, 4 Nov 2024 00:12:53 -0800 Subject: [PATCH 42/54] Tests for multi-chromosome GAM/GAF GBWT --- test/t/37_vg_gbwt.t | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 1046da788d3..a5849bc47a5 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 149 +plan tests 155 # Build vg graphs for two chromosomes @@ -50,7 +50,7 @@ rm -f x.ref.gbwt # Single chromosome: alignments -vg paths -v x.vg -X -Q _alt > x.alts.gam +vg paths -x x.vg -X -Q _alt > x.alts.gam vg convert -G x.alts.gam x.vg > x.alts.gaf vg gbwt -A --num-jobs 1 -o x.alts.gaf.gbwt 
-x x.vg x.alts.gaf
 is $? 0 "chromosome X GAF with vg gbwt"
 
@@ -108,6 +108,25 @@ rm -f x.gbwt y.gbwt xy.merge.gbwt xy.fast.gbwt xy.parallel.gbwt xy.direct.gbwt x
 rm -f xy.1000gp.gbwt
 
 
+# Multiple chromosomes: alignments
+vg paths -x xy-alt.xg -X -Q _alt > xy.alts.gam
+vg convert -G xy.alts.gam xy.xg > xy.alts.gaf
+vg gbwt -A --num-jobs 1 -o xy.alts.gaf.gbwt -x xy.xg xy.alts.gaf
+is $? 0 "multi-chromosome GAF with vg gbwt"
+vg gbwt -A --num-jobs 1 --gam-format -o xy.alts.gam.gbwt -x xy.xg xy.alts.gam
+is $? 0 "multi-chromosome GAM with vg gbwt"
+cmp xy.alts.gaf.gbwt xy.alts.gam.gbwt
+is $? 0 "identical construction results from GAF and GAM"
+
+vg gbwt -A --num-jobs 2 -o multi.gbwt -x xy.xg xy.alts.gaf
+is $? 0 "multi-chromosome GAF with vg gbwt using multiple jobs"
+is $(vg gbwt -c xy.alts.gaf.gbwt) 58 "single job: 58 paths"
+is $(vg gbwt -c multi.gbwt) 58 "multiple jobs: 58 paths"
+
+rm -f xy.alts.gam xy.alts.gaf
+rm -f xy.alts.gaf.gbwt xy.alts.gam.gbwt multi.gbwt
+
+
 # Multiple chromosomes: paths as contigs
 vg gbwt -E -o xy.contigs.gbwt -x xy.xg
 is $?
0 "paths as contigs with vg gbwt" From 6957377dbfbe6a1a65567b964269994b663f14c2 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Mon, 4 Nov 2024 00:35:38 -0800 Subject: [PATCH 43/54] Maybe SDSL will now point to the right hash --- deps/sdsl-lite | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 115dfd5b737..4b4b1b8951b 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 115dfd5b7371ef99a1e0611389b956bb88bf9fc4 +Subproject commit 4b4b1b8951bf7fbdb5e050f98b5e61453964c167 From a4e7ddd41774d8431160490db26670514c3fa921 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Mon, 4 Nov 2024 02:39:48 -0800 Subject: [PATCH 44/54] vg was apparently using another branch of SDSL --- deps/sdsl-lite | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 4b4b1b8951b..ef23c5fe989 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 4b4b1b8951bf7fbdb5e050f98b5e61453964c167 +Subproject commit ef23c5fe9899f2b0afa53a32162ed0b06aff0e89 From 612d446061ce17ec14807cb883f33ba9f8895942 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2024 17:49:53 -0500 Subject: [PATCH 45/54] Use toil-vg that uses vg gbwt to make gbwt --- vgci/vgci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 9a5e53b08f3..ca04cfbe7b4 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0 # Should we show stdout and stderr from tests? If so, set to "-s". SHOW_OPT="" # What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@c9bd6414f935e6095574a41a34addbb8d87b41a6" +TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@d16da00b92c491f90433e151cb4f5a89a44395b8" # What toil should we install? 
# Could be something like "toil[aws,mesos]==3.20.0" # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]" From 8a12486abcefc7644f00180f2c4b29916c22384c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 5 Nov 2024 10:51:24 -0500 Subject: [PATCH 46/54] Use toil-vg that knows about DI2 and uses outfile correctly --- vgci/vgci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vgci/vgci.sh b/vgci/vgci.sh index ca04cfbe7b4..c48ddbcf553 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0 # Should we show stdout and stderr from tests? If so, set to "-s". SHOW_OPT="" # What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@d16da00b92c491f90433e151cb4f5a89a44395b8" +TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@45782c7ba5a372e7c3587ac1c63f4895176fc828" # What toil should we install? # Could be something like "toil[aws,mesos]==3.20.0" # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]" From f846f72b6035922daf18e1f4f4b64cb38b55e2e3 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 5 Nov 2024 12:03:57 -0500 Subject: [PATCH 47/54] Use a gbwtgraph that can store subranges of haplotype fragment 0 --- deps/gbwtgraph | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/gbwtgraph b/deps/gbwtgraph index de61d340695..8819a8cf8f6 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit de61d340695f64b8aa978816b195d6687e7fda5a +Subproject commit 8819a8cf8f66c22910a1167f548b944b0ac07ecf From 3da4e03b827444b86f65885fdf9622d8c9dc67c1 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 5 Nov 2024 12:42:56 -0500 Subject: [PATCH 48/54] Start rewriting subpath copying in terms of metadata --- src/algorithms/subgraph.cpp | 40 ++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git 
a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp index 2ea619ea952..b72d7f4c3f2 100644 --- a/src/algorithms/subgraph.cpp +++ b/src/algorithms/subgraph.cpp @@ -297,20 +297,54 @@ void extract_path_range(const PathPositionHandleGraph& source, path_handle_t pat /// in the source graph (unless the subpath was not cut up at all) void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph, bool subpath_naming) { - std::unordered_map > subpaths; + + // We want to organize all visits by base path. This key type holds the + // sense, sample and locus names, haplotype, and phase block. + using base_metadata_t = std::tuple; + + // This stores, for each source graph base path, for each start offset, the handle at that offset on the path. + std::unordered_map > subpaths; + + // This stores information about base paths that don't have subranges, and + // their full lengths, so we can avoid generating new subrange metadata + // when we just have all of a path. + std::unordered_map full_path_lengths; + subgraph.for_each_handle([&](const handle_t& h) { handlegraph::nid_t id = subgraph.get_id(h); if (source.has_node(id)) { handle_t handle = source.get_handle(id); source.for_each_step_position_on_handle(handle, [&](const step_handle_t& step, const bool& is_rev, const uint64_t& pos) { path_handle_t path = source.get_path_handle_of_step(step); - std::string path_name = source.get_path_name(path); - subpaths[path_name][pos] = is_rev ? 
subgraph.flip(h) : h; + // Figure out the base path this visit is on + base_metadata_t key = {source.get_sense(path), source.get_sample_name(path), source.get_contig_name(path), source.get_haplotype(path), source.get_phase_block(path)}; + // Figure out the subrange of the base path it is relative to + subrange_t path_subrange = source.get_subrange(path); + uint64_t visit_offset = pos; + if (path_subrange != PathMetadata:NO_SUBRANGE) { + // If we have the position relative to a subrange, adjust by that subrange's offset. + visit_offset += path_subrange.first; + } + subpaths[key][visit_offset] = is_rev ? subgraph.flip(h) : h; + + if (path_subrange == PathMetadata:NO_SUBRANGE) { + // There's no subrange set, so this path is full-length in the source graph. + // See if we know of this path as a full-length path or not + auto it = full_path_lengths.find(key); + if (it == full_path_lengths.end()) { + // We haven't recorded its length yet, so do it. + full_path_lengths.emplace_hint(it, key, source.get_path_length(path)); + } + } return true; }); } }); + // TODO: Rewrite to find continuous subpath pieces and copy into the new graph with MutablePathMetadata::create_path + // But only use the subpaths if we aren't a complete full-length span of a base path. + // We will do this by extracting each contiguous subrange into a vector of handles, then computing metadata for it based on its length and whether it is a full-length span of a path that was a full path in the base graph, and then creating it and filling it in. 
+ function new_subpath = [&subgraph](const string& path_name, bool is_circular, size_t subpath_offset) { PathSense sense; From a7c3d074397f96fc2d8de8eae08f3c4b3c35f27b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 5 Nov 2024 18:45:13 -0500 Subject: [PATCH 49/54] Rewrite path chunking to use subpaths all the time and remove the old reference splicing code --- src/algorithms/subgraph.cpp | 146 ++++++++--------- src/algorithms/subgraph.hpp | 16 +- src/chunker.cpp | 295 +++++++--------------------------- src/chunker.hpp | 6 +- src/subcommand/chunk_main.cpp | 2 +- 5 files changed, 146 insertions(+), 319 deletions(-) diff --git a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp index b72d7f4c3f2..86fac693ba1 100644 --- a/src/algorithms/subgraph.cpp +++ b/src/algorithms/subgraph.cpp @@ -1,5 +1,6 @@ #include "subgraph.hpp" #include "../path.hpp" +#include "../crash.hpp" namespace vg { namespace algorithms { @@ -290,13 +291,7 @@ void extract_path_range(const PathPositionHandleGraph& source, path_handle_t pat } } -/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph -/// based on their order in the path position index provided by the source graph -/// will clear any path found in both graphs before writing the new steps into it -/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset -/// in the source graph (unless the subpath was not cut up at all) -void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph, - bool subpath_naming) { +void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph) { // We want to organize all visits by base path. This key type holds the // sense, sample and locus names, haplotype, and phase block. 
@@ -306,9 +301,9 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath std::unordered_map > subpaths; // This stores information about base paths that don't have subranges, and - // their full lengths, so we can avoid generating new subrange metadata - // when we just have all of a path. - std::unordered_map full_path_lengths; + // their full lengths and circularity flags, so we can avoid generating new + // subrange metadata when we just have all of a path. + std::unordered_map> full_path_info; subgraph.for_each_handle([&](const handle_t& h) { handlegraph::nid_t id = subgraph.get_id(h); @@ -317,23 +312,23 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath source.for_each_step_position_on_handle(handle, [&](const step_handle_t& step, const bool& is_rev, const uint64_t& pos) { path_handle_t path = source.get_path_handle_of_step(step); // Figure out the base path this visit is on - base_metadata_t key = {source.get_sense(path), source.get_sample_name(path), source.get_contig_name(path), source.get_haplotype(path), source.get_phase_block(path)}; + base_metadata_t key = {source.get_sense(path), source.get_sample_name(path), source.get_locus_name(path), source.get_haplotype(path), source.get_phase_block(path)}; // Figure out the subrange of the base path it is relative to subrange_t path_subrange = source.get_subrange(path); uint64_t visit_offset = pos; - if (path_subrange != PathMetadata:NO_SUBRANGE) { + if (path_subrange != PathMetadata::NO_SUBRANGE) { // If we have the position relative to a subrange, adjust by that subrange's offset. visit_offset += path_subrange.first; } subpaths[key][visit_offset] = is_rev ? subgraph.flip(h) : h; - if (path_subrange == PathMetadata:NO_SUBRANGE) { + if (path_subrange == PathMetadata::NO_SUBRANGE) { // There's no subrange set, so this path is full-length in the source graph. 
// See if we know of this path as a full-length path or not - auto it = full_path_lengths.find(key); - if (it == full_path_lengths.end()) { - // We haven't recorded its length yet, so do it. - full_path_lengths.emplace_hint(it, key, source.get_path_length(path)); + auto it = full_path_info.find(key); + if (it == full_path_info.end()) { + // We haven't recorded its length and circularity yet, so do it. + full_path_info.emplace_hint(it, key, std::make_pair(source.get_path_length(path), source.get_is_circular(path))); } } return true; @@ -341,66 +336,71 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath } }); - // TODO: Rewrite to find continuous subpath pieces and copy into the new graph with MutablePathMetadata::create_path - // But only use the subpaths if we aren't a complete full-length span of a base path. - // We will do this by extracting each contiguous subrange into a vector of handles, then computing metadata for it based on its length and whether it is a full-length span of a path that was a full path in the base graph, and then creating it and filling it in. + for (auto& base_and_visits : subpaths) { + // For each base path + const base_metadata_t& base_path_metadata = base_and_visits.first; + const auto& start_to_handle = base_and_visits.second; + // If we didn't put anything in the visit collection, it shouldn't be here. 
+ crash_unless(!start_to_handle.empty()); - function new_subpath = - [&subgraph](const string& path_name, bool is_circular, size_t subpath_offset) { - PathSense sense; - string sample; - string locus; - size_t haplotype; - size_t phase_block; - subrange_t subrange; - PathMetadata::parse_path_name(path_name, sense, sample, locus, haplotype, phase_block, subrange); - if (subrange == PathMetadata::NO_SUBRANGE) { - subrange.first = subpath_offset; - } else { - subrange.first += subpath_offset; - } - subrange.first = subpath_offset; - subrange.second = PathMetadata::NO_END_POSITION; - string subpath_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange); - if (subgraph.has_path(subpath_name)) { - subgraph.destroy_path(subgraph.get_path_handle(subpath_name)); - } - return subgraph.create_path_handle(subpath_name, is_circular); - }; + // We're going to walk over all the visits and find contiguous runs + auto run_start = start_to_handle.begin(); + auto run_end = run_start; + size_t start_coordinate = run_start->first; + while (run_end != start_to_handle.end()) { + // Until we run out of runs + // Figure out where this node ends on the path + size_t stop_coordinate = run_end->first + subgraph.get_length(run_end->second); + + // Look ahead + ++run_end; - for (auto& subpath : subpaths) { - const std::string& path_name = subpath.first; - path_handle_t source_path_handle = source.get_path_handle(path_name); - // destroy the path if it exists - if (subgraph.has_path(path_name)) { - subgraph.destroy_path(subgraph.get_path_handle(path_name)); - } - // create a new path. 
give it a subpath name if the flag's on and its smaller than original - path_handle_t path; - if (!subpath_naming || subpath.second.size() == source.get_step_count(source_path_handle) || - subpath.second.empty()) { - path = subgraph.create_path_handle(path_name, source.get_is_circular(source_path_handle)); - } else { - path = new_subpath(path_name, source.get_is_circular(source_path_handle), subpath.second.begin()->first); - } - for (auto p = subpath.second.begin(); p != subpath.second.end(); ++p) { - const handle_t& handle = p->second; - if (p != subpath.second.begin() && subpath_naming) { - auto prev = p; - --prev; - const handle_t& prev_handle = prev->second; - // distance from map - size_t delta = p->first - prev->first; - // what the distance should be if they're contiguous depends on relative orienations - size_t cont_delta = subgraph.get_length(prev_handle); - if (delta != cont_delta) { - // we have a discontinuity! we'll make a new path can continue from there - assert(subgraph.get_step_count(path) > 0); - path = new_subpath(path_name, subgraph.get_is_circular(path), p->first); + if (run_end != start_to_handle.end() && run_end->first == stop_coordinate) { + // The next visit is still contiguous, so advance. + continue; + } + + // Otherwise we've reached a break in continuity. We have a + // contiguous run from run_start to run_end, visiting the subrange + // start_coordinate to stop_coordinate. + + // Find out if we cover a full source graph path. + subrange_t run_subrange = {start_coordinate, stop_coordinate}; + bool is_circular = false; + if (start_coordinate == 0) { + // We might be a full path + auto found_length_and_circularity = full_path_info.find(base_path_metadata); + if (found_length_and_circularity != full_path_info.end() && found_length_and_circularity->second.first == stop_coordinate) { + // We are a full path + run_subrange = PathMetadata::NO_SUBRANGE; + // We can be circular. 
+ is_circular = found_length_and_circularity->second.second; } } - //fill in the path information - subgraph.append_step(path, handle); + + // Make a path with all the metadata + path_handle_t new_path = subgraph.create_path( + std::get<0>(base_path_metadata), + std::get<1>(base_path_metadata), + std::get<2>(base_path_metadata), + std::get<3>(base_path_metadata), + std::get<4>(base_path_metadata), + run_subrange, + is_circular + ); + + for (auto it = run_start; it != run_end; ++it) { + // Copy the path's visits + subgraph.append_step(new_path, it->second); + } + + // Set up the next subpath. + // Set where it starts. + run_start = run_end; + if (run_start != start_to_handle.end()) { + // And if it will exist, set its start coordinate. + start_coordinate = run_start->first; + } } } } diff --git a/src/algorithms/subgraph.hpp b/src/algorithms/subgraph.hpp index fef9e174235..321baf20e40 100644 --- a/src/algorithms/subgraph.hpp +++ b/src/algorithms/subgraph.hpp @@ -34,13 +34,15 @@ void extract_id_range(const HandleGraph& source, const nid_t& id1, const nid_t& /// if end < 0, then it will walk to the end of the path void extract_path_range(const PathPositionHandleGraph& source, path_handle_t path_handle, int64_t start, int64_t end, MutableHandleGraph& subgraph); -/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph -/// based on their order in the path position index provided by the source graph -/// will clear any path found in both graphs before writing the new steps into it -/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset -/// in the source graph (unless the subpath was not cut up at all) -void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph, - bool subpath_naming = false); +/// Add subpaths to the subgraph for all paths visiting its nodes in the source +/// graph. 
+/// +/// Always generates correct path metadata, and a path for each contiguous +/// fragment of any base path. Assumes the source graph does not contain any +/// overlapping path fragments on a given base path, and that the subgraph does +/// not already contain any paths on a base path also present in the source +/// graph. +void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph); /// We can accumulate a subgraph without accumulating all the edges between its nodes /// this helper ensures that we get the full set diff --git a/src/chunker.cpp b/src/chunker.cpp index 00fb6f6a3e9..5b1dad389a3 100644 --- a/src/chunker.cpp +++ b/src/chunker.cpp @@ -5,6 +5,7 @@ #include "algorithms/subgraph.hpp" #include "vg.hpp" #include "clip.hpp" +#include "crash.hpp" //#define debug @@ -22,16 +23,9 @@ PathChunker::~PathChunker() { void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only, MutablePathMutableHandleGraph& subgraph, Region& out_region) { - // This method still depends on VG - // (not a super high priority to port, as calling can now be done at genome scale and we no longer - // have to chunk up paths) - VG* vg_subgraph = dynamic_cast(&subgraph); - if (vg_subgraph == nullptr) { - vg_subgraph = new VG(); - assert(subgraph.get_node_count() == 0); - } // extract our path range into the graph + // TODO: Handle incoming names with subranges. 
path_handle_t path_handle = graph->get_path_handle(region.seq); step_handle_t start_step = graph->get_step_at_position(path_handle, region.start); handle_t start_handle = graph->get_handle_of_step(start_step); @@ -53,250 +47,81 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_ if (graph->get_is_reverse(step_handle)) { step_handle = graph->flip(step_handle); } - if (!vg_subgraph->has_node(graph->get_id(step_handle))) { - vg_subgraph->create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); + if (!subgraph.has_node(graph->get_id(step_handle))) { + subgraph.create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); } }; // expand the context and get path information // if forward_only true, then we only go forward. if (context > 0) { - algorithms::expand_subgraph_by_steps(*graph, *vg_subgraph, context, forward_only); + algorithms::expand_subgraph_by_steps(*graph, subgraph, context, forward_only); } if (length > 0) { - algorithms::expand_subgraph_by_length(*graph, *vg_subgraph, context, forward_only); + algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only); } else if (context == 0 && length == 0) { - algorithms::add_connecting_edges_to_subgraph(*graph, *vg_subgraph); + algorithms::add_connecting_edges_to_subgraph(*graph, subgraph); } - algorithms::add_subpaths_to_subgraph(*graph, *vg_subgraph, true); - - // merge back our reference path to use the old chopping code - // todo: work with subpaths somehow? 
- if (!vg_subgraph->has_path(region.seq)) { - map ref_subpaths; - vg_subgraph->for_each_path_handle([&](path_handle_t path_handle) { - string path_name = vg_subgraph->get_path_name(path_handle); - subrange_t subrange; - path_name = Paths::strip_subrange(path_name, &subrange); - if (subrange != PathMetadata::NO_SUBRANGE && path_name == region.seq) { - ref_subpaths[subrange.first] = path_handle; - } - }); - path_handle_t new_ref_path = vg_subgraph->create_path_handle(region.seq, graph->get_is_circular(path_handle)); - for (auto& ref_subpath : ref_subpaths) { - vg_subgraph->for_each_step_in_path(ref_subpath.second, [&] (step_handle_t subpath_step) { - vg_subgraph->append_step(new_ref_path, vg_subgraph->get_handle_of_step(subpath_step)); - }); - vg_subgraph->destroy_path(ref_subpath.second); - } - } - - // build the vg of the subgraph - vg_subgraph->remove_orphan_edges(); - - // get our range endpoints before context expansion - list& mappings = vg_subgraph->paths.get_path(region.seq); - assert(!mappings.empty()); - size_t mappings_size = mappings.size(); - int64_t input_start_node = graph->get_id(start_handle); - int64_t input_end_node = graph->get_id(end_handle); - -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "Path range in expanded subgraph is " << *mappings.begin() << "-" << *mappings.rbegin() << endl; - } -#endif - - // replaces old xg position_in_path() to check node counts in path - function(const PathHandleGraph&, handle_t, path_handle_t)> path_steps_of_handle = - [] (const PathHandleGraph& graph, handle_t handle, path_handle_t path_handle) { - vector node_steps = graph.steps_of_handle(handle); - vector node_path_steps; - for (auto step : node_steps) { - if (graph.get_path_handle_of_step(step) == path_handle) { - node_path_steps.push_back(step); - } - } - return node_path_steps; - }; - - // we have no direct way of getting our steps out of the subgraph, so we - // go through node ids. the problem is that cycles can introduce - // ambiguity. 
we check for that here (only to punt on it later) - vector start_node_path_steps = path_steps_of_handle(*graph, start_handle, path_handle); - vector end_node_path_steps = path_steps_of_handle(*graph, end_handle, path_handle); - bool end_points_on_cycle = start_node_path_steps.size() > 1 || end_node_path_steps.size() > 1; - - // keep track of the edges in our original path - set, pair>> path_edge_set = - // walking out with the context length (as supported below) won't always work as expansion - // can grab an arbitrary amount of path regardless of context. so we load up the entire path: - // (todo: could sniff out limits from subgraph...) - get_path_edge_index(graph->path_begin(path_handle), graph->path_back(path_handle), std::max(context, length)); - - // the distance between them and the nodes in our input range - size_t left_padding = 0; - size_t right_padding = 0; - // do we need to rewrite back to our graph? - bool rewrite_paths = false; + algorithms::add_subpaths_to_subgraph(*graph, subgraph); + + // Find the lowest and highest offsets visited by any subpath of the target path we extracted on. + PathSense sense = graph->get_sense(path_handle); + std::string sample = graph->get_sample_name(path_handle); + std::string locus = graph->get_locus_name(path_handle); + size_t haplotype = graph->get_haplotype(path_handle); + size_t phase_block = graph->get_phase_block(path_handle); - if (!end_points_on_cycle) { - // start and end of our expanded chunk - auto start_it = mappings.begin(); - auto end_it = --mappings.end(); - - // find our input range in the expanded path. we know these nodes only appear once. - for (; start_it != mappings.end() && start_it->node_id() != input_start_node; ++start_it); - for (; end_it != mappings.begin() && end_it->node_id() != input_end_node; --end_it); - - // walk back our start point as we can without rank discontinuities. 
doesn't matter - // if we encounter cycles here, because we keep a running path length - auto cur_it = start_it; - auto prev_it = cur_it; - if (prev_it != mappings.begin()) { - for (; prev_it != mappings.begin(); --prev_it) { - cur_it = prev_it; - --cur_it; - handle_t prev_handle = vg_subgraph->get_handle(prev_it->node_id(), - prev_it->is_reverse()); - handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), - cur_it->is_reverse()); - edge_t edge = vg_subgraph->edge_handle(cur_handle, prev_handle); - if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), - make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "found discontinuity between when left scanning path in subgraph: " << *cur_it << " and " << *prev_it << endl; - - } -#endif - break; - } - left_padding += cur_it->length; - } - } - start_it = prev_it; - // walk forward the end point - cur_it = end_it; - prev_it = cur_it; - for (++cur_it; cur_it != mappings.end(); ++prev_it, ++cur_it) { - handle_t prev_handle = vg_subgraph->get_handle(prev_it->node_id(), - prev_it->is_reverse()); - handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), - cur_it->is_reverse()); - edge_t edge = vg_subgraph->edge_handle(prev_handle, cur_handle); - if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), - make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "found discontinuity between when right scanning path in subgraph: " << *prev_it << " and " << *cur_it << endl; - } -#endif - break; - } - right_padding += cur_it->length; - } - end_it = prev_it; + // Find the outer bounds of selected subpaths of the target path + size_t min_start = std::numeric_limits::max(); + size_t max_end = 0; - rewrite_paths = 
start_it != mappings.begin() || end_it != --mappings.end(); - - // cut out nodes before and after discontinuity - mappings.erase(mappings.begin(), start_it); - mappings.erase(++end_it, mappings.end()); - } - // We're clipping at a cycle in the reference path. Just preserve the path as-is from the - // input region. - else { - mappings.clear(); - for (step_handle_t step = start_step; step != end_plus_one_step; step = graph->get_next_step(step)) { - handle_t step_handle = graph->get_handle_of_step(step); - mapping_t mapping; - mapping.set_node_id(graph->get_id(step_handle)); - mapping.set_is_reverse(graph->get_is_reverse(step_handle)); - mappings.push_back(mapping); + subgraph.for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t subpath) { + if (subgraph.get_haplotype(subpath) != haplotype || subgraph.get_phase_block(subpath) != phase_block) { + // Skip this subpath since it's not the right phase/fragment + return true; } - rewrite_paths = true; - } - // Cut our graph so that our reference path end points are graph tips. This will let the - // snarl finder use the path to find telomeres. 
- path_handle_t sg_path_handle = vg_subgraph->get_path_handle(region.seq); - Node* start_node = vg_subgraph->get_node(mappings.begin()->node_id()); - auto sg_start_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(start_node->id()), sg_path_handle); - if (rewrite_paths && sg_start_steps.size() == 1) { - if (!mappings.begin()->is_reverse() && vg_subgraph->start_degree(start_node) != 0) { - for (auto edge : vg_subgraph->edges_to(start_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); - } - } else if (mappings.begin()->is_reverse() && vg_subgraph->end_degree(start_node) != 0) { - for (auto edge : vg_subgraph->edges_from(start_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); - } + subrange_t subpath_subrange = subgraph.get_subrange(subpath); + if (subpath_subrange == PathMetadata::NO_SUBRANGE) { + // Fill in a 0 start + subpath_subrange.first = 0; } - } - Node* end_node = vg_subgraph->get_node(mappings.rbegin()->node_id()); - auto sg_end_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(end_node->id()), sg_path_handle); - if (rewrite_paths && sg_end_steps.size() == 1) { - if (!mappings.rbegin()->is_reverse() && vg_subgraph->end_degree(end_node) != 0) { - for (auto edge : vg_subgraph->edges_from(end_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in order to make path end a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); - } - } else if (mappings.rbegin()->is_reverse() && vg_subgraph->start_degree(end_node) != 0) { - for (auto edge : vg_subgraph->edges_to(end_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in 
order to make path end a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); + if (subpath_subrange.second == PathMetadata::NO_END_POSITION) { + // Compute a length and use that to get the end. + // TODO: Sniff for an efficient/available get_path_length. + size_t path_length = 0; + for (handle_t handle : subgraph.scan_path(subpath)) { + path_length += subgraph.get_length(handle); } + subpath_subrange.second = subpath_subrange.first + path_length; } - } - - // Sync our updated paths lists back into the Graph protobuf - if (rewrite_paths) { - vg_subgraph->paths.rebuild_node_mapping(); - vg_subgraph->paths.rebuild_mapping_aux(); - vg_subgraph->graph.clear_path(); - vg_subgraph->paths.to_graph(vg_subgraph->graph); - } - // copy back out of vg if necessary - if (dynamic_cast(&subgraph) == nullptr) { - handlealgs::copy_path_handle_graph(vg_subgraph, &subgraph); - delete vg_subgraph; + // Min/max in the subrange bounds + min_start = std::min(min_start, subpath_subrange.first); + max_end = std::max(max_end, subpath_subrange.second); + + return true; + }); + + // TODO: We assume we actually found some of the target path + crash_unless(min_start != std::numeric_limits::max()); + + // Hackily remove source path subrange offsets if any + subrange_t source_subrange = graph->get_subrange(path_handle); + if (source_subrange != PathMetadata::NO_SUBRANGE) { + // If we requested something on this path region, we can't handle + // finding part of an earlier path region. + // TODO: Handle it. + crash_unless(min_start <= source_subrange.first); + min_start -= source_subrange.first; + max_end -= source_subrange.first; } - // start could fall inside a node. we find out where in the path the - // 0-offset point of the node is. - int64_t input_start_pos = graph->get_position_of_step(start_step); - int64_t input_end_pos = graph->get_position_of_step(end_step); + // Produce the output region. 
out_region.seq = region.seq; - out_region.start = input_start_pos - left_padding; - out_region.end = input_end_pos + graph->get_length(end_handle) + right_padding - 1; + out_region.start = min_start; + out_region.end = max_end; } void PathChunker::extract_snarls(const Region& region, SnarlManager& snarl_manager, MutablePathMutableHandleGraph& subgraph) { @@ -350,7 +175,7 @@ void PathChunker::extract_snarls(const Region& region, SnarlManager& snarl_manag algorithms::add_connecting_edges_to_subgraph(*graph, subgraph); // now fill in the paths - algorithms::add_subpaths_to_subgraph(*graph, subgraph, true); + algorithms::add_subpaths_to_subgraph(*graph, subgraph); } void PathChunker::extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region) { @@ -361,18 +186,18 @@ void PathChunker::extract_path_component(const string& path_name, MutablePathMut path_ids.insert(graph->get_id(handle)); } - extract_component(path_ids, subgraph, true); + extract_component(path_ids, subgraph); out_region.seq = path_name; } -void PathChunker::extract_component(const unordered_set& node_ids, MutablePathMutableHandleGraph& subgraph, bool subpath_naming) { +void PathChunker::extract_component(const unordered_set& node_ids, MutablePathMutableHandleGraph& subgraph) { for (nid_t node_id : node_ids) { subgraph.create_handle(graph->get_sequence(graph->get_handle(node_id)), node_id); } algorithms::expand_subgraph_by_steps(*graph, subgraph, numeric_limits::max()); - algorithms::add_subpaths_to_subgraph(*graph, subgraph, subpath_naming); + algorithms::add_subpaths_to_subgraph(*graph, subgraph); } void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length, @@ -389,7 +214,7 @@ void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int64_t context if (length) { algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only); } - algorithms::add_subpaths_to_subgraph(*graph, subgraph, true); + 
algorithms::add_subpaths_to_subgraph(*graph, subgraph); // build the vg out_region.start = subgraph.min_node_id(); diff --git a/src/chunker.hpp b/src/chunker.hpp index b9d88aeb60d..75923cd91aa 100644 --- a/src/chunker.hpp +++ b/src/chunker.hpp @@ -49,14 +49,14 @@ class PathChunker { MutablePathMutableHandleGraph& subgraph); /** - * Extract a connected component containing a given path + * Extract a connected component containing a given path. Processes path metadata and creates subpaths. */ void extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region); /** - * Extract a connected component starting from an id set + * Extract a connected component starting from an id set. Processes path metadata and creates subpaths. */ - void extract_component(const unordered_set& node_ids, MutablePathMutableHandleGraph& subgraph, bool subpath_naming); + void extract_component(const unordered_set& node_ids, MutablePathMutableHandleGraph& subgraph); /** * Like above, but use (inclusive) id range instead of region on path. 
diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index 57397ed6713..970b75d35b6 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -726,7 +726,7 @@ int main_chunk(int argc, char** argv) { map trace_thread_frequencies; if (!component_ids.empty()) { subgraph = vg::io::new_output_graph(output_format); - chunker.extract_component(component_ids[i], *subgraph, false); + chunker.extract_component(component_ids[i], *subgraph); output_regions[i] = region; } else if (id_range == false) { From 5f5514b69df54a5be9eece03f1d88ac6c8620283 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 6 Nov 2024 12:08:37 -0500 Subject: [PATCH 50/54] Adjust unit tests to match idea of noting the expanded extracted visit to a cycle --- src/chunker.cpp | 21 +++++++++++++++++++-- src/chunker.hpp | 9 ++++++--- src/unittest/chunker.cpp | 18 ++++++++++++++++++ 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/chunker.cpp b/src/chunker.cpp index 5b1dad389a3..eb76a3cd09d 100644 --- a/src/chunker.cpp +++ b/src/chunker.cpp @@ -64,6 +64,13 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_ } algorithms::add_subpaths_to_subgraph(*graph, subgraph); + // Now we need to figure out how we expanded the target path region we + // asked for. + // + // We don't just want the lowest and highest bounds of any subpath, we want + // the lowest and highest bound of the subpaths that actually overlap the + // targeted region. + // Find the lowest and highest offsets visited by any subpath of the target path we extracted on. 
PathSense sense = graph->get_sense(path_handle); std::string sample = graph->get_sample_name(path_handle); @@ -97,6 +104,13 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_ subpath_subrange.second = subpath_subrange.first + path_length; } + if (subpath_subrange.first > region.end || subpath_subrange.second <= region.start) { + // This subpath doesn't actually overlap the selected target path + // region (which is 0-based, end-inclusive), and so shouldn't count + // for extending the selected region along the target path. + return true; + } + // Min/max in the subrange bounds min_start = std::min(min_start, subpath_subrange.first); max_end = std::max(max_end, subpath_subrange.second); @@ -118,10 +132,13 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_ max_end -= source_subrange.first; } - // Produce the output region. + // We can't represent a region with a 0 end-exclusive coordinate. + crash_unless(max_end != 0); + + // Produce the output region. Convert coordinates to be 0-based, end-inclusive. out_region.seq = region.seq; out_region.start = min_start; - out_region.end = max_end; + out_region.end = max_end - 1; } void PathChunker::extract_snarls(const Region& region, SnarlManager& snarl_manager, MutablePathMutableHandleGraph& subgraph) { diff --git a/src/chunker.hpp b/src/chunker.hpp index 75923cd91aa..56922100613 100644 --- a/src/chunker.hpp +++ b/src/chunker.hpp @@ -31,9 +31,12 @@ class PathChunker { /** Extract subgraph corresponding to given path region into its * own vg graph, and send it to out_stream. The boundaries of the - * extracted graph (which can be different because we expand context and don't - * cut nodes) are written to out_region. If forward_only set, context - * is only expanded in the forward direction + * extracted region of the target path (which can be different because we + * expand context and don't cut nodes) are written to out_region. 
If the + * target path goes through the extracted region multiple times, only the + * extended bounds of the visit containing the target region are produced. + * + * If forward_only set, context is only expanded in the forward direction * * NOTE: we follow convention of Region coordinates being 0-based * inclusive. diff --git a/src/unittest/chunker.cpp b/src/unittest/chunker.cpp index 91326aa1ca9..24f7d3b645a 100644 --- a/src/unittest/chunker.cpp +++ b/src/unittest/chunker.cpp @@ -186,10 +186,28 @@ TEST_CASE("basic graph chunking", "[chunk]") { VG subgraph; Region out_region; chunker.extract_subgraph(region, 1, 0, false, subgraph, out_region); + + // We include node 4 in the cycle, and also node 3 which is 1 away, so + // we include all the loops arounf the cycle and need to start the + // extracted path region where it enters node 3 at base 6. REQUIRE(subgraph.node_count() == 7); REQUIRE(subgraph.edge_count() == 9); + REQUIRE(out_region.start == 6); + + } + + SECTION("Partial graph via cyclic path, 0 expansion") { + + Region region = {"z", 35, 36}; + VG subgraph; + Region out_region; + chunker.extract_subgraph(region, 0, 0, false, subgraph, out_region); + + REQUIRE(subgraph.node_count() == 1); + REQUIRE(subgraph.edge_count() == 0); REQUIRE(out_region.start == 31); + REQUIRE(out_region.end == 36); // End is inclusive } From 2d04e6a0354ddf097c425c2efeac5ec05ba7e28a Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 6 Nov 2024 12:36:43 -0500 Subject: [PATCH 51/54] Stop retaining paths when clearing vg::VG --- src/algorithms/subgraph.cpp | 3 +++ src/subcommand/find_main.cpp | 6 ++++-- src/vg.cpp | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp index 86fac693ba1..5ff7d275a94 100644 --- a/src/algorithms/subgraph.cpp +++ b/src/algorithms/subgraph.cpp @@ -305,7 +305,10 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath // subrange metadata when we just have all 
of a path. std::unordered_map> full_path_info; + size_t subgraph_handles = 0; + subgraph.for_each_handle([&](const handle_t& h) { + subgraph_handles++; handlegraph::nid_t id = subgraph.get_id(h); if (source.has_node(id)) { handle_t handle = source.get_handle(id); diff --git a/src/subcommand/find_main.cpp b/src/subcommand/find_main.cpp index 48ebda09a6a..2e1fa0355d2 100644 --- a/src/subcommand/find_main.cpp +++ b/src/subcommand/find_main.cpp @@ -1,4 +1,5 @@ #include "subcommand.hpp" +#include "../crash.hpp" #include "../utility.hpp" #include "../mapper.hpp" #include @@ -683,8 +684,9 @@ int main_find(int argc, char** argv) { ofstream out(s.str().c_str()); vg::io::save_handle_graph(&graph, out); out.close(); - // reset our graph - dynamic_cast(graph).clear(); + // reset our graph so it has no nodes or paths anymore + graph.clear(); + crash_unless(graph.get_path_count() == 0); } if (subgraph_k) { prep_graph(); // don't forget to prep the graph, or the kmer set will be wrong[ diff --git a/src/vg.cpp b/src/vg.cpp index bdd81778d4e..cb43cf7e26d 100644 --- a/src/vg.cpp +++ b/src/vg.cpp @@ -631,6 +631,7 @@ void VG::destroy_edge(const handle_t& left, const handle_t& right) { } void VG::clear() { + clear_paths(); graph.mutable_node()->Clear(); graph.mutable_edge()->Clear(); clear_indexes(); From 68d2e7394038fe7bb58988a19192615536011146 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 6 Nov 2024 12:37:38 -0500 Subject: [PATCH 52/54] Stop counting subgraph handles --- src/algorithms/subgraph.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp index 5ff7d275a94..86fac693ba1 100644 --- a/src/algorithms/subgraph.cpp +++ b/src/algorithms/subgraph.cpp @@ -305,10 +305,7 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath // subrange metadata when we just have all of a path. 
std::unordered_map> full_path_info; - size_t subgraph_handles = 0; - subgraph.for_each_handle([&](const handle_t& h) { - subgraph_handles++; handlegraph::nid_t id = subgraph.get_id(h); if (source.has_node(id)) { handle_t handle = source.get_handle(id); From 3e5857c14b41be22e200d7f2f8b0d5bd778e1ca4 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 6 Nov 2024 12:44:48 -0500 Subject: [PATCH 53/54] Use the same subpath-free input graph for deconstruct as before --- test/t/26_deconstruct.t | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 03c7f8f3a87..492c07a81c2 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -90,7 +90,9 @@ printf "P\ty\t1+,3+,5+,6+,8+,9+,11+,12+,9+,10+,12+,14+,15+\t8M,1M,1M,3M,1M,19M,1 vg view -Fv cyclic_tiny.gfa > cyclic_tiny.vg vg index cyclic_tiny.vg -x cyclic_tiny.xg vg find -x cyclic_tiny.xg -n 10 -n 11 -n 12 -n 13 -n 14 -n 15 -c 1 > cycle.vg -vg index cycle.vg -x cycle.xg +# TODO: Make deconstruct see through subpaths to the base path +vg view cycle.vg | sed 's/\([xy]\)\[[-0-9]*\]/\1/g' >cycle-asfullpaths.gfa +vg index cycle-asfullpaths.gfa -x cycle.xg vg deconstruct cycle.xg -p y -e -t 1 > cycle_decon.vcf is $(grep -v "#" cycle_decon.vcf | wc -l) 1 "cyclic reference deconstruction has correct number of variants" grep -v "#" cycle_decon.vcf | grep 20 | awk '{print $1 "\t" $2 "\t" $4 "\t" $5 "\t" $10}' > cycle_decon.tsv From 184019446fb1cebf28602f8c1b4a350080f28b12 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 6 Nov 2024 12:48:09 -0500 Subject: [PATCH 54/54] Make chunk tests tolerate a subpath end on the extracted x subpath --- test/t/30_vg_chunk.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t index 142de98066a..48e52ffdb93 100644 --- a/test/t/30_vg_chunk.t +++ b/test/t/30_vg_chunk.t @@ -66,8 +66,8 @@ is $(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq .node | grep id | 
wc -l # Check that traces work on a GBWT is $(vg chunk -x x.xg -G x.gbwt -r 1:1 -c 2 -T | vg view - -j | jq .node | grep id | wc -l) 5 "id chunker traces correct chunk size" -is "$(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name != "x[0]")' | wc -l)" 0 "chunker extracts no threads from an empty gPBWT" -is "$(vg chunk -x x.xg -G x.haps.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name != "x[0]")' | wc -l)" 2 "chunker extracts 2 local threads from a gBWT with 2 locally distinct threads in it" +is "$(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name | startswith("x[0") | not)' | wc -l)" 0 "chunker extracts no threads from an empty gPBWT" +is "$(vg chunk -x x.xg -G x.haps.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name | startswith("x[0") | not)' | wc -l)" 2 "chunker extracts 2 local threads from a gBWT with 2 locally distinct threads in it" is "$(vg chunk -x x.xg -G x.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -r '.path[] | select(.name == "thread_0") | .mapping | length')" 3 "chunker can extract a partial haplotype from a GBWT" is "$(vg chunk -x x.gbz -r 1:1 -c 2 -T | vg view - -j | jq -r '.path[] | select(.name == "thread_0") | .mapping | length')" 3 "chunker can extract a partial haplotype from a GBZ" is "$(vg chunk -x x.gbz -r 1:1 -c 2 -T --no-embedded-haplotypes | vg view - -j | jq -r '.path[] | select(.name == "thread_0") | .mapping | length')" "" "chunker doesn't see haplotypes in the GBZ if asked not to"