From b093eeace4832ecabc85d43b70f84d495a674988 Mon Sep 17 00:00:00 2001 From: kojix2 <2xijok@gmail.com> Date: Thu, 12 Oct 2023 11:44:56 +0900 Subject: [PATCH 01/54] Update elfutils submodule URL --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 6b6337be74d..f1fb6d32eef 100644 --- a/.gitmodules +++ b/.gitmodules @@ -63,7 +63,7 @@ url = https://github.com/adamnovak/backward-cpp.git [submodule "deps/elfutils"] path = deps/elfutils - url = git://sourceware.org/git/elfutils.git + url = https://sourceware.org/git/elfutils.git [submodule "deps/structures"] path = deps/structures url = https://github.com/vgteam/structures.git From 9a6206e4667167257ec918802e89675e5aaebd5d Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Tue, 13 Aug 2024 14:24:09 +0300 Subject: [PATCH 02/54] Better error handling in vg gbwt --- src/subcommand/gbwt_main.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index db1175b426e..b22f0c0ce8e 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -999,9 +999,11 @@ void validate_gbwt_config(GBWTConfig& config) { if (!config.to_remove.empty()) { if (config.build == GBWTConfig::build_gbz) { std::cerr << "error: [vg gbwt] the GBWT extracted from GBZ cannot have paths modified" << std::endl; + std::exit(EXIT_FAILURE); } if (config.build == GBWTConfig::build_gbwtgraph) { std::cerr << "error: [vg gbwt] the GBWT loaded with a GBWTGraph cannot have paths modified" << std::endl; + std::exit(EXIT_FAILURE); } if (!(config.input_filenames.size() == 1 || config.merge != GBWTConfig::merge_none) || !has_gbwt_output) { std::cerr << "error: [vg gbwt] removing a sample requires one input GBWT and output GBWT" << std::endl; From 25716504b69c7b357f5a61ad4934aba695244d62 Mon Sep 17 00:00:00 2001 From: Xian Date: Wed, 11 Sep 2024 17:58:34 +0200 Subject: [PATCH 03/54] Add unit test for a reverse loop in a chain without a 
snarl followed by a snarl and the bdsg change to fix it --- deps/libbdsg | 2 +- src/unittest/snarl_distance_index.cpp | 54 +++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index 757896a678e..c8a10cb4cb9 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 757896a678eb7af7215147ef38cdb9315775dfe0 +Subproject commit c8a10cb4cb96c74093d1dc2883f2f4379a622b4e diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index 15ef5388edc..ca30acb6511 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -1782,7 +1782,7 @@ namespace vg { } TEST_CASE( "Snarl decomposition can handle chains with nodes in different directions", - "[snarl_distance][bug]" ) { + "[snarl_distance]" ) { // This graph will have a snarl from 1 to 8, a snarl from 2 to 7, @@ -6678,7 +6678,7 @@ namespace vg { } } //End test case - TEST_CASE("top level chain subgraph", "[snarl_distance][snarl_distance_subgraph][bug]") { + TEST_CASE("top level chain subgraph", "[snarl_distance][snarl_distance_subgraph]") { VG graph; Node* n1 = graph.create_node("GCA"); @@ -6712,8 +6712,6 @@ namespace vg { Edge* e16 = graph.create_edge(n10, n12); Edge* e17 = graph.create_edge(n11, n12); - ofstream out ("test_graph.vg"); - graph.serialize(out); IntegratedSnarlFinder snarl_finder(graph); @@ -6749,9 +6747,6 @@ namespace vg { SnarlDistanceIndex distance_index; fill_in_distance_index(&distance_index, &graph, &snarl_finder); subgraph_in_distance_range(distance_index, path, &graph, 11, 14, sub_graph, true); - for (auto& id : sub_graph) { - cerr << id << endl; - } REQUIRE(!sub_graph.count(3)); REQUIRE(!sub_graph.count(8)); @@ -6876,6 +6871,51 @@ namespace vg { }//end test case + TEST_CASE( "Loop in chain not connected to snarl", "[snarl_distance]" ) { + + VG graph; + + Node* n1 = graph.create_node("GCAA"); + Node* n2 = graph.create_node("GCAA"); + Node* n3 = 
graph.create_node("AAAT"); + Node* n4 = graph.create_node("T"); + Node* n5 = graph.create_node("G"); + Node* n6 = graph.create_node("AAAA"); + Node* n7 = graph.create_node("GGAA"); + Node* n8 = graph.create_node("TTT"); + + Edge* e1 = graph.create_edge(n1, n2); + Edge* e2 = graph.create_edge(n2, n2, true, false); + Edge* e3 = graph.create_edge(n2, n3); + + Edge* e4 = graph.create_edge(n3, n4); + Edge* e5 = graph.create_edge(n3, n5); + Edge* e6 = graph.create_edge(n4, n6); + Edge* e7 = graph.create_edge(n5, n6); + Edge* e8 = graph.create_edge(n6, n7); + Edge* e9 = graph.create_edge(n6, n6, false, true); + Edge* e10 = graph.create_edge(n7, n8); + + + //get the snarls + IntegratedSnarlFinder snarl_finder(graph); + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); + + + SECTION("Traversal of chain") { + net_handle_t chain1_6 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); + distance_index.for_each_child(chain1_6, [&](const net_handle_t& child) { + assert(distance_index.get_parent(child) == chain1_6); + }); + } + SECTION("Minimum distances are correct") { + REQUIRE(distance_index.minimum_distance(n1->id(),false, 0, n1->id(), true, 0) == 30); + } + + } + + TEST_CASE("random test subgraph", "[snarl_distance][snarl_distance_subgraph]") { int64_t min = 20; int64_t max = 50; From 7657aa38eed73b676639d8ac44e0d150598d5369 Mon Sep 17 00:00:00 2001 From: Xian Date: Sat, 1 Jul 2023 18:30:37 +0200 Subject: [PATCH 04/54] Topologically-ish sort children of a snarl and make their ranks the order of the sort --- src/snarl_distance_index.cpp | 198 ++++++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 5 deletions(-) diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index c0a950838bc..f4338f58e76 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -789,14 +789,202 @@ void populate_snarl_index( return curr_index; }; + //TODO: Copying the list + 
vector> all_children = temp_snarl_record.children; + + /* + * Do a topological sort of the children and re-assign ranks based on the sort + * TODO: Snarls aren't guaranteed to be DAGs, so ideally this will be a sort + * that minimizes back edges and the number of times a node is traversed backwards + * For now though, just do a topological sort and don't take any loops or reversing edges + */ + if (!temp_snarl_record.is_root_snarl) { + + //This will hold the new order of the children. Each value is an index into all_children, which + //matches the ranks(-2) of the children + vector topological_sort_order; + topological_sort_order.reserve(all_children.size()); + + // This holds everything in the topological order, to check which nodes (and therefore edges) + // have already been added + // Unlike the topological order, this stores the orientation as well. + // Each node is only added once to the topological order, but the reverse orientation + // may still be traversed to ensure that all nodes are found + unordered_set> visited_nodes; + visited_nodes.reserve(all_children.size()); + + //All nodes that have no incoming edges + vector> source_nodes; + + /* Add all sources. This will start out as the start node and any tips or nodes that + are only reachable from the end node + */ + //unordered_set> children_seen_from_start; + //vector> dfs_stack_from_start + + //// Look for tips and loops from the end node that never reach the start node + //vector> dfs_stack_from_end; + //dfs_stack_from_end.emplace_back(std::numeric_limits::max(), false); //To indicate end node + //while (dfs_stack_from_end.size() != 0) { + // // Go through all nodes from the end and search for anything that is a tip or that loops without + // // reaching anything seen on from the start + //} + + //Add max() to indicate that we start at the start node, since the start node doesn't actually have a + //rank. 
This gets added last so it is traversed first + source_nodes.emplace_back(std::numeric_limits::max(), false); + + //We'll be done sorting when everything is in the sorted vector + while (!source_nodes.empty()) { + + //Pick a child with no incoming edges + pair current_child_index = source_nodes.back(); + source_nodes.pop_back(); + + //Mark it as being visited + assert(visited_nodes.count(current_child_index) == 0); + visited_nodes.emplace(current_child_index); + + //Get the graph handle for that child, pointing out from the end of the chain + handle_t current_graph_handle; + if (current_child_index.first == std::numeric_limits::max()) { + //If the current child is the start bound, then get the start node pointing in + current_graph_handle = graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev); + } else { + pair current_index = all_children[current_child_index.first]; + if (current_index.first == SnarlDistanceIndex::TEMP_NODE) { + //If the current child is a node, then get the node pointing in the correct direction + current_graph_handle = graph->get_handle(current_index.second, current_child_index.second); + } else if (current_child_index.second) { + //If the current child is a chain, and we're traversing the chain backwards + current_graph_handle = graph->get_handle(temp_index.temp_chain_records[current_index.second].start_node_id, + !temp_index.temp_chain_records[current_index.second].start_node_rev); + } else { + //Otherwise, the current child is a chain and we're traversing the chain forwards + current_graph_handle = graph->get_handle(temp_index.temp_chain_records[current_index.second].end_node_id, + temp_index.temp_chain_records[current_index.second].end_node_rev); + } + } + + //Add everything reachable from the start boundary node that has no other incoming edges + graph->follow_edges(current_graph_handle, false, [&](const handle_t next_handle) { +#ifdef debug_distance_indexing + cerr << "Following forward edges from " << 
graph->get_id(current_graph_handle) << " to " << graph->get_id(next_handle) << endl; +#endif + if (graph->get_id(next_handle) == temp_snarl_record.start_node_id || + graph->get_id(next_handle) == temp_snarl_record.end_node_id) { + //If this is trying to leave the snarl, skip it + return true; + } + //Check the next_handle going in the other direction, to see if it could be a new source node. + //If it reaches anything unseen, then it can't be a source node - /*Now go through each of the children and add distances from that child to everything reachable from it + //Get the index of next_handle + pair next_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle))); + size_t next_rank = next_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(next_index.second-temp_index.min_node_id).rank_in_parent + : temp_index.temp_chain_records[next_index.second].rank_in_parent; + assert(all_children[next_rank-2] == next_index); + bool next_rev = next_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.temp_chain_records[next_index.second].is_trivial + ? graph->get_is_reverse(next_handle) + : graph->get_id(next_handle) == temp_index.temp_chain_records[next_index.second].end_node_id; + if (visited_nodes.count(make_pair(next_rank, next_rev)) != 0) { + //If this is a loop, just skip it + return true; + } + + //Get the handle from the child represented by next_handle going the other way + handle_t reverse_handle = next_index.first == SnarlDistanceIndex::TEMP_NODE ? + graph->get_handle(next_index.second, !next_rev) : + (next_rev ? graph->get_handle(temp_index.temp_chain_records[next_index.second].end_node_id, + temp_index.temp_chain_records[next_index.second].end_node_rev) + : graph->get_handle(temp_index.temp_chain_records[next_index.second].start_node_id, + !temp_index.temp_chain_records[next_index.second].start_node_rev)); + + //Does this have no unseen incoming edges? 
Check as we go through incoming edges + bool is_source = true; + + //Does this have no unseen incoming edges but including nodes we've seen in the other direction? + //TODO: Actually do this + graph->follow_edges(reverse_handle, false, [&](const handle_t incoming_handle) { +#ifdef debug_distance_indexing + cerr << "Getting backwards edge to " << graph->get_id(incoming_handle) << endl; +#endif + if (graph->get_id(incoming_handle) == temp_snarl_record.start_node_id || + graph->get_id(incoming_handle) == temp_snarl_record.end_node_id) { + //If this is trying to leave the snarl + return true; + } + //The index of the snarl's child that next_handle represents + pair incoming_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(incoming_handle))); + size_t incoming_rank = incoming_index.first == SnarlDistanceIndex::TEMP_NODE + ? temp_index.temp_node_records.at(incoming_index.second-temp_index.min_node_id).rank_in_parent + : temp_index.temp_chain_records[incoming_index.second].rank_in_parent; + + bool incoming_rev = incoming_index.first == SnarlDistanceIndex::TEMP_NODE || temp_index.temp_chain_records[incoming_index.second].is_trivial + ? 
graph->get_is_reverse(incoming_handle) + : graph->get_id(incoming_handle) == temp_index.temp_chain_records[incoming_index.second].end_node_id; + //subtract 2 to get the index from the rank + assert(incoming_rank >= 2); + incoming_rank-=2; + + //If we haven't seen the incoming node before, then this isn't a source so we break out of + //the loop and keep going + if (visited_nodes.count(std::make_pair(incoming_rank, !incoming_rev)) == 0) { + is_source = false; + } + //Keep going + return true; + }); + if (is_source) { + //If this is a new source node, then add it as a source node + + //subtract 2 to get the index from the rank + assert(next_rank >= 2); + next_rank-=2; + source_nodes.emplace_back(next_rank, next_rev); + } + return true; + }); + if (current_child_index.first != std::numeric_limits::max() && + visited_nodes.count(make_pair(current_child_index.first, !current_child_index.second)) == 0) { + //If this node wasn't already added in the other direction, add it to the topological sort + topological_sort_order.emplace_back(current_child_index.first); + } + } + + //TODO: Do this properly + // For now, we only really want a topological ordering of DAGs, and I'm going to ignore tips + // So if anything is only reachable from the end node, then add it in an arbitrary order + vector check_ranks (all_children.size(), false); + for (size_t x : topological_sort_order) { + check_ranks[x] = true; + } + //If anything wasn't in the topological order, add it now + for (size_t i = 0 ; i < check_ranks.size() ; i++) { + if (!check_ranks[i]) { + topological_sort_order.emplace_back(i); + } + } + assert(topological_sort_order.size() == all_children.size()); + + + //We've finished doing to topological sort, so update every child's rank to be the new order + for (size_t new_rank = 0 ; new_rank < topological_sort_order.size() ; new_rank++) { + size_t old_rank = topological_sort_order[new_rank]; + if (all_children[old_rank].first == SnarlDistanceIndex::TEMP_NODE) { + 
temp_index.temp_node_records.at(all_children[old_rank].second-temp_index.min_node_id).rank_in_parent = new_rank+2; + } else { + temp_index.temp_chain_records[all_children[old_rank].second].rank_in_parent = new_rank+2; + } + } + } + + /* + * Now go through each of the children and add distances from that child to everything reachable from it * Start a dijkstra traversal from each node side in the snarl and record all distances */ - //Add the start and end nodes to the list of children so that we include them in the traversal - //TODO: Copying the list - vector> all_children = temp_snarl_record.children; //Reserve enough space to store all possible distances temp_snarl_record.distances.reserve( (temp_snarl_record.node_count > size_limit || size_limit == 0) @@ -807,8 +995,8 @@ void populate_snarl_index( temp_index.use_oversized_snarls = true; } + //Add the start and end nodes to the list of children so that we include them in the traversal if (!temp_snarl_record.is_root_snarl) { - all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.start_node_id); all_children.emplace_back(SnarlDistanceIndex::TEMP_NODE, temp_snarl_record.end_node_id); } From 26387cc341304e53336295a76737a67d6f0ef7e0 Mon Sep 17 00:00:00 2001 From: Xian Date: Sat, 1 Jul 2023 20:07:27 +0200 Subject: [PATCH 05/54] Topologically sort from the start of a snarl relative to the top-level chain --- src/snarl_distance_index.cpp | 84 +++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index f4338f58e76..395d0740a69 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -767,7 +767,8 @@ void populate_snarl_index( /*Helper function to find the ancestor of a node that is a child of this snarl */ - auto get_ancestor_of_node = [&](pair curr_index) { + auto get_ancestor_of_node = [&](pair curr_index, + pair ancestor_snarl_index) { //This is a child that isn't a node, 
so it must be a chain if (curr_index.second == temp_snarl_record.start_node_id || @@ -777,7 +778,7 @@ void populate_snarl_index( //Otherwise, walk up until we hit the current snarl pair parent_index = temp_index.temp_node_records.at(curr_index.second-temp_index.min_node_id).parent; - while (parent_index != snarl_index) { + while (parent_index != ancestor_snarl_index) { curr_index=parent_index; parent_index = parent_index.first == SnarlDistanceIndex::TEMP_SNARL ? temp_index.temp_snarl_records.at(parent_index.second).parent : temp_index.temp_chain_records.at(parent_index.second).parent; @@ -800,6 +801,74 @@ void populate_snarl_index( */ if (!temp_snarl_record.is_root_snarl) { + //Is this snarl reversed relative to the top-level chain? + bool is_reversed = false; + // Walk up the snarl tree and if anything is reversed (or a chain is only reachable backwards in its parent) + // then flip is_reversed + // Since we don't have distances in snarl ancestors yet, walk out the fronts of chains and see if + // we hit the snarl start or end + pair current_index = snarl_index; + while (current_index.first != SnarlDistanceIndex::TEMP_ROOT) { + + //Get the parent of the current index + pair parent_index = + current_index.first == SnarlDistanceIndex::TEMP_SNARL ? 
temp_index.temp_snarl_records.at(current_index.second).parent + : temp_index.temp_chain_records.at(current_index.second).parent; + if (parent_index.first == SnarlDistanceIndex::TEMP_SNARL) { + //If the parent is a snarl, then walk out the front of the chain and see if it reaches the start of the ancestor snarl + vector to_check; + unordered_set seen; + to_check.emplace_back(graph->get_handle(temp_index.temp_chain_records[current_index.second].start_node_id, + !temp_index.temp_chain_records[current_index.second].start_node_rev)); + seen.emplace(to_check.back()); + bool reaches_start = false; + while (!to_check.empty()) { + handle_t current_handle = to_check.back(); + to_check.pop_back(); + graph->follow_edges(current_handle, false, [&](const handle_t next_handle) { + if (seen.count(next_handle) == 0) { + if (graph->get_id(next_handle) == temp_index.temp_snarl_records[parent_index.second].start_node_id) { + //If this reached the start node, then we consider the chain to be oriented forward + // so we can stop + reaches_start = true; + //Stop iterating + return false; + } else if (graph->get_id(next_handle) != temp_index.temp_snarl_records[parent_index.second].end_node_id) { + //If this isn't leaving the snarl, then continue traversing + //We need to jump to the end of the current chain + + //First, find the temp_chain_record for the chain we just entered + pair next_index = + get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), parent_index); + + to_check.emplace_back( next_index.first == SnarlDistanceIndex::TEMP_NODE + ? next_handle : + (graph->get_id(next_handle) == temp_index.temp_chain_records[next_index.second].start_node_id + ? 
graph->get_handle(temp_index.temp_chain_records[next_index.second].end_node_id, + temp_index.temp_chain_records[next_index.second].end_node_rev) + : graph->get_handle(temp_index.temp_chain_records[next_index.second].start_node_id, + !temp_index.temp_chain_records[next_index.second].start_node_rev))); + + } + seen.emplace(next_handle); + } + return true; + }); + if (!reaches_start) { + //If we couldn't reach the start of the parent from the start of the child, then assume the child + //was reversed + is_reversed = !is_reversed; + } + } + } + current_index=parent_index; + } + + //Where do we start the topological sort? The start or end bound + handle_t topological_sort_start = is_reversed ? graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev) + : graph->get_handle(temp_snarl_record.end_node_id,!temp_snarl_record.end_node_rev); + + //This will hold the new order of the children. Each value is an index into all_children, which //matches the ranks(-2) of the children vector topological_sort_order; @@ -849,7 +918,7 @@ void populate_snarl_index( handle_t current_graph_handle; if (current_child_index.first == std::numeric_limits::max()) { //If the current child is the start bound, then get the start node pointing in - current_graph_handle = graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev); + current_graph_handle = topological_sort_start; } else { pair current_index = all_children[current_child_index.first]; if (current_index.first == SnarlDistanceIndex::TEMP_NODE) { @@ -880,7 +949,8 @@ void populate_snarl_index( //If it reaches anything unseen, then it can't be a source node //Get the index of next_handle - pair next_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle))); + pair next_index = + get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), snarl_index); size_t next_rank = next_index.first == SnarlDistanceIndex::TEMP_NODE 
? temp_index.temp_node_records.at(next_index.second-temp_index.min_node_id).rank_in_parent : temp_index.temp_chain_records[next_index.second].rank_in_parent; @@ -916,7 +986,8 @@ void populate_snarl_index( return true; } //The index of the snarl's child that next_handle represents - pair incoming_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(incoming_handle))); + pair incoming_index = + get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(incoming_handle)), snarl_index); size_t incoming_rank = incoming_index.first == SnarlDistanceIndex::TEMP_NODE ? temp_index.temp_node_records.at(incoming_index.second-temp_index.min_node_id).rank_in_parent : temp_index.temp_chain_records[incoming_index.second].rank_in_parent; @@ -1144,7 +1215,8 @@ void populate_snarl_index( auto& node_record = temp_index.temp_node_records.at(graph->get_id(next_handle)-temp_index.min_node_id); //The index of the snarl's child that next_handle represents - pair next_index = get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle))); + pair next_index = + get_ancestor_of_node(make_pair(SnarlDistanceIndex::TEMP_NODE, graph->get_id(next_handle)), snarl_index); bool next_is_tip = start_index.first == SnarlDistanceIndex::TEMP_NODE ? 
temp_index.temp_node_records.at(start_index.second-temp_index.min_node_id).is_tip From c5ff4208eb7b03e0f063d3a163c06aadd8a12ed8 Mon Sep 17 00:00:00 2001 From: Xian Date: Sat, 1 Jul 2023 22:15:06 +0200 Subject: [PATCH 06/54] Fix orientation of topological sort --- src/snarl_distance_index.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 395d0740a69..4a0f95e8786 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -863,10 +863,15 @@ void populate_snarl_index( } current_index=parent_index; } +#ifdef debug_distance_indexing + if (is_reversed) { + cerr << "\tsnarl is reversed relative to the top-level chain" << endl; + } +#endif //Where do we start the topological sort? The start or end bound - handle_t topological_sort_start = is_reversed ? graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev) - : graph->get_handle(temp_snarl_record.end_node_id,!temp_snarl_record.end_node_rev); + handle_t topological_sort_start = is_reversed ? graph->get_handle(temp_snarl_record.end_node_id,!temp_snarl_record.end_node_rev) + : graph->get_handle(temp_snarl_record.start_node_id,temp_snarl_record.start_node_rev); //This will hold the new order of the children. 
Each value is an index into all_children, which From f335c9ca544849c1a51b0edc98906cb6317b59c7 Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 12 Sep 2024 09:13:36 +0200 Subject: [PATCH 07/54] Fix unit test --- src/unittest/snarl_distance_index.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index ca30acb6511..fc7b808bd35 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -6906,7 +6906,7 @@ namespace vg { SECTION("Traversal of chain") { net_handle_t chain1_6 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); distance_index.for_each_child(chain1_6, [&](const net_handle_t& child) { - assert(distance_index.get_parent(child) == chain1_6); + assert(distance_index.canonical(distance_index.get_parent(child)) == distance_index.canonical(chain1_6)); }); } SECTION("Minimum distances are correct") { From eb2ba80a4f76660804b3b9e7d7d6eb97b10e2b3e Mon Sep 17 00:00:00 2001 From: Xian Date: Thu, 12 Sep 2024 09:43:59 +0200 Subject: [PATCH 08/54] Silence warnings for oversized snarls --- src/snarl_distance_index.cpp | 4 ++-- src/snarl_distance_index.hpp | 2 +- src/subcommand/index_main.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 4a0f95e8786..d60cd780356 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -20,13 +20,13 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po get_id(pos2), get_is_rev(pos2), get_offset(pos2)); } -void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit) { +void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool silence_warnings) { 
distance_index->set_snarl_size_limit(size_limit); //Build the temporary distance index from the graph SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit); - if (temp_index.use_oversized_snarls) { + if (!silence_warnings && temp_index.use_oversized_snarls) { cerr << "warning: distance index uses oversized snarls, which may make mapping slow" << endl; cerr << "\ttry increasing --snarl-limit when building the distance index" << endl; } diff --git a/src/snarl_distance_index.hpp b/src/snarl_distance_index.hpp index 33f3c6c8490..13e8777301c 100644 --- a/src/snarl_distance_index.hpp +++ b/src/snarl_distance_index.hpp @@ -22,7 +22,7 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po //Fill in the index //size_limit is a limit on the number of nodes in a snarl, after which the index won't store pairwise distances -void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000); +void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000, bool silence_warnings=true); //Fill in the temporary snarl record with distances void populate_snarl_index(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 3ffb69dc70a..26a761162f2 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -701,7 +701,7 @@ int main_index(int argc, char** argv) { SnarlDistanceIndex distance_index; //Fill it in - fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit); + fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit, false); // Save it distance_index.serialize(dist_name); } else { From 22039a119a4d7dc21e47b29ae3365d6310397459 Mon Sep 17 00:00:00 2001 From: 
Xian Date: Thu, 12 Sep 2024 11:52:14 +0200 Subject: [PATCH 09/54] Update bdsg to fix distanceless distance indexing --- deps/libbdsg | 2 +- src/unittest/snarl_distance_index.cpp | 26 +++++++++++++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index c8a10cb4cb9..33e2fc213eb 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit c8a10cb4cb96c74093d1dc2883f2f4379a622b4e +Subproject commit 33e2fc213eb492a0db38a30ecc1b3b4e3146e769 diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index fc7b808bd35..5067b31040c 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -150,7 +150,7 @@ namespace vg { REQUIRE(std::get<2>(traceback.second.back()) == -5); } } - TEST_CASE( "Nested chain with loop", "[snarl_distance]" ) { + TEST_CASE( "Nested chain with loop", "[snarl_distance][bug]" ) { VG graph; @@ -190,9 +190,9 @@ namespace vg { //get the snarls IntegratedSnarlFinder snarl_finder(graph); - SnarlDistanceIndex distance_index; - fill_in_distance_index(&distance_index, &graph, &snarl_finder); SECTION("Traversal of chain") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); net_handle_t chain1_13 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); distance_index.for_each_child(chain1_13, [&](const net_handle_t& child) { if (distance_index.is_node(child)) { @@ -200,6 +200,8 @@ namespace vg { }); } SECTION("Minimum distances are correct") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); net_handle_t node2 = distance_index.get_node_net_handle(n2->id()); net_handle_t chain1_13 = distance_index.get_parent(node2); REQUIRE(distance_index.distance_in_parent(chain1_13, distance_index.flip(node2), distance_index.flip(node2)) == 0); @@ -211,6 +213,8 @@ namespace vg { 
REQUIRE(distance_index.minimum_distance(n7->id(),false, 0, n8->id(), true, 0) == 1); } SECTION("Paths are correct") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); size_t traversal_i = 0; vector> actual_path; @@ -231,6 +235,8 @@ namespace vg { //REQUIRE(traversal_i == 7); } SECTION("Path that leaves lowest common ancestor") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); size_t traversal_i = 0; vector> actual_path; @@ -254,6 +260,10 @@ namespace vg { //}); //REQUIRE(traversal_i == 8); } + SECTION("Distanceless index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 0); + } } TEST_CASE( "Snarl decomposition can deal with multiple connected components", "[snarl_distance]" ) { @@ -730,9 +740,9 @@ namespace vg { //get the snarls IntegratedSnarlFinder snarl_finder(graph); - SnarlDistanceIndex distance_index; - fill_in_distance_index(&distance_index, &graph, &snarl_finder); SECTION("Traverse the root") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); net_handle_t chain1 = distance_index.get_parent(distance_index.get_node_net_handle(n1->id())); net_handle_t node2 = distance_index.get_node_net_handle(n2->id()); net_handle_t chain2 = distance_index.get_parent(node2); @@ -748,6 +758,8 @@ namespace vg { REQUIRE(found == 2); } SECTION("into_which_snarl") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder); REQUIRE((distance_index.into_which_snarl(n4->id(), false) == std::make_tuple(4, false, true) || distance_index.into_which_snarl(n4->id(), false) == std::make_tuple(5, true, true))); REQUIRE((distance_index.into_which_snarl(n5->id(), true) == std::make_tuple(4, false, true) || @@ -757,6 +769,10 @@ namespace vg { REQUIRE((distance_index.into_which_snarl(n4->id(), true) == std::make_tuple(4, true, false) 
|| distance_index.into_which_snarl(n4->id(), true) == std::make_tuple(2, false, false))); } + SECTION("distanceless index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 0); + } } From 59f3f246b5ffc7374973f38e6d69241a1a831f65 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 17 Sep 2024 11:22:01 -0400 Subject: [PATCH 10/54] fix typo in paths test --- test/t/11_vg_paths.t | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 36729a7a0ba..e21a8fe2c09 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -90,10 +90,10 @@ diff original.fa norm_x4.fa is $? 0 "path normalizer doesnt alter path sequences" # note: x3 is x4 in reverse, so we key on that -grep x3 norm_x2.gfa | awk '{print $3}' > x4.path -grep x3 norm_x2.gfa | awk '{print $3}' >> x4.path -grep x3 norm_x2.gfa | awk '{print $3}'> x4.norm.path -grep x5 norm_x2.gfa | awk '{print $3}' >> x4.norm.path +grep x3 norm_x4.gfa | awk '{print $3}' > x4.path +grep x3 norm_x4.gfa | awk '{print $3}' >> x4.path +grep x3 norm_x4.gfa | awk '{print $3}'> x4.norm.path +grep x5 norm_x4.gfa | awk '{print $3}' >> x4.norm.path diff x4.path x4.norm.path is $? 
0 "path normalizere correctly snapped all equivalent paths to x4" From c78dce76ed83832f2e6d37e7b98d65992163e612 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 17 Sep 2024 11:22:23 -0400 Subject: [PATCH 11/54] set snarl distance to 0 in path normalize --- src/traversal_clusters.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index 7e361c84f6a..d96a82f3b98 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -384,8 +384,7 @@ void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const u SnarlDistanceIndex distance_index; { IntegratedSnarlFinder snarl_finder(*graph); - // todo: why can't I pass in 0 below -- I don't want any dinstances! - fill_in_distance_index(&distance_index, graph, &snarl_finder, 1); + fill_in_distance_index(&distance_index, graph, &snarl_finder, 0); } // only consider embedded paths that span snarl From e49db317e080763e29bdd651166293d71ca59ee8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 19 Sep 2024 10:42:38 -0400 Subject: [PATCH 12/54] Remove no-longer-needed kff CMake patching The upstream problem we're working around was fixed in https://github.com/Kmer-File-Format/kff-cpp-api/commit/2a5ede6a11bde054f5304e95b69252e9cdfb7e0b, and manually editing the CMakeLists.txt upsets git when we go to change submodule versions. --- Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index c0becd6407a..67c1e556f0e 100644 --- a/Makefile +++ b/Makefile @@ -613,12 +613,11 @@ endif $(INC_DIR)/kff_io.hpp: $(LIB_DIR)/libkff.a -# We need to drop the hardcoderd CMAKE_CXX_FLAGS. See $(LIB_DIR)/libkff.a: $(KFF_DIR)/kff_io.cpp $(KFF_DIR)/kff_io.hpp.in ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cd $(KFF_DIR) && sed -i.bak '/set(CMAKE_CXX_FLAGS/d' CMakeLists.txt && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. 
&& AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +. ./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cd $(KFF_DIR) && sed -i.bak '/set(CMAKE_CXX_FLAGS/d' CMakeLists.txt && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +. ./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/BooPHF.h: $(BBHASH_DIR)/BooPHF.h From c4e40f5f8dcf5f5b22ecfb24fb62b8a4be5a4170 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 23 Sep 2024 14:35:34 -0400 Subject: [PATCH 13/54] Add more codenames --- src/version.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/version.cpp b/src/version.cpp index 984627fb6d8..a488bd89447 100644 --- a/src/version.cpp +++ b/src/version.cpp @@ -103,7 +103,17 @@ const unordered_map Version::codenames = { {"v1.57.0", "Franchini"}, {"v1.58.0", "Cartari"}, {"v1.59.0", "Casatico"}, - {"v1.69.0", "Bologna"} // Reserved + {"v1.60.0", "Annicco"}, + {"v1.61.0", "Plodio"}, + {"v1.62.0", "Ranzano"}, + {"v1.63.0", "Boccaleone"}, + {"v1.64.0", "Vibbiana"}, + {"v1.65.0", "Carfon"}, + {"v1.66.0", "Navetta"}, + {"v1.67.0", "Vetria"}, + {"v1.68.0", "Rimbocchi"}, + {"v1.69.0", "Bologna"}, + {"v1.70.0", "Zebedassi"} // Add more codenames here }; From 17fd1b39d818f0b9ba4c9d88576e8b702d21b50e Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 26 Sep 2024 11:29:04 -0400 Subject: [PATCH 14/54] Add error 
checking around writing kmer files This might help with problems like https://github.com/vgteam/vg/issues/4404#issuecomment-2373287057 by letting us tell whether the kmer files were actually accepted by the filesystem or not. --- src/kmer.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/kmer.cpp b/src/kmer.cpp index 52d63713096..188e3f01ca6 100644 --- a/src/kmer.cpp +++ b/src/kmer.cpp @@ -348,7 +348,15 @@ string write_gcsa_kmers_to_tmpfile(const HandleGraph& graph, int kmer_size, size temp_file::remove(tmpfile); throw ex; } + if (!out) { + std::cerr << "error[write_gcsa_kmers_to_tmpfile]: I/O error while writing kmers to " << tmpfile << std::endl; + exit(1); + } out.close(); + if (!out) { + std::cerr << "error[write_gcsa_kmers_to_tmpfile]: I/O error while closing kmer file " << tmpfile << std::endl; + exit(1); + } return tmpfile; } From 015e7ee460e69cb8459edda55484ba05a3e639df Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 3 Oct 2024 11:33:49 -0400 Subject: [PATCH 15/54] Pass $DEBIAN_FRONTEND through make get-deps This lets you stop `make get-deps` from demanding your timezone. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 67c1e556f0e..e94642c0ca3 100644 --- a/Makefile +++ b/Makefile @@ -521,7 +521,7 @@ static-docker: static scripts/* # We have system-level deps to install # We want the One True Place for them to be in the Dockerfile. 
get-deps: - sudo apt-get install -qq -y --no-upgrade $(shell cat Dockerfile | sed -n '/^###DEPS_BEGIN###/,$${p;/^###DEPS_END###/q}' | grep -v '^ *#' | grep -v "^RUN" | tr '\n' ' ' | tr -d '\\') + sudo DEBIAN_FRONTEND=$(DEBIAN_FRONTEND) apt-get install -qq -y --no-upgrade $(shell cat Dockerfile | sed -n '/^###DEPS_BEGIN###/,$${p;/^###DEPS_END###/q}' | grep -v '^ *#' | grep -v "^RUN" | tr '\n' ' ' | tr -d '\\') # And we have submodule deps to build deps: $(DEPS) From 638ec669a64814d682de51d3a917c0b72af3a0df Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 10 Oct 2024 16:28:29 -0400 Subject: [PATCH 16/54] fix validate bug on protobuf graphs --- src/subcommand/validate_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/subcommand/validate_main.cpp b/src/subcommand/validate_main.cpp index 9fe64ca05c9..531f5a287ae 100644 --- a/src/subcommand/validate_main.cpp +++ b/src/subcommand/validate_main.cpp @@ -125,7 +125,7 @@ int main_validate(int argc, char** argv) { if (!gam_only) { VG* vg_graph = dynamic_cast(graph.get()); if (vg_graph != nullptr) { - if (!vg_graph->is_valid(true, true, check_orphans, true)) { + if (!vg_graph->is_valid(true, true, true, check_orphans)) { valid_graph = false; } } From 51e512ded330c6887555830713043ab20ad930fe Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 21 Oct 2024 13:49:34 -0400 Subject: [PATCH 17/54] revert from distance index to snarl manager in path normalizer --- src/subcommand/paths_main.cpp | 2 +- src/traversal_clusters.cpp | 80 +++++++++++++++++++++++------------ src/traversal_clusters.hpp | 5 ++- 3 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index fb668d7edd4..9a4e868a06a 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -610,7 +610,7 @@ int main_paths(int argc, char** argv) { for_each_selected_path([&](const path_handle_t& path_handle) { selected_paths.insert(path_handle); }); - 
merge_equivalent_traversals_in_graph(mutable_graph, selected_paths); + merge_equivalent_traversals_in_graph(mutable_graph, selected_paths, true); } if (!to_destroy.empty()) { mutable_graph->destroy_paths(to_destroy); diff --git a/src/traversal_clusters.cpp b/src/traversal_clusters.cpp index d96a82f3b98..130b65f69e1 100644 --- a/src/traversal_clusters.cpp +++ b/src/traversal_clusters.cpp @@ -2,6 +2,7 @@ #include "traversal_finder.hpp" #include "integrated_snarl_finder.hpp" #include "snarl_distance_index.hpp" +#include "snarls.hpp" //#define debug @@ -379,37 +380,64 @@ static void merge_equivalent_traversals_in_snarl(MutablePathHandleGraph* graph, } } -void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set& selected_paths) { - // compute the snarls - SnarlDistanceIndex distance_index; - { - IntegratedSnarlFinder snarl_finder(*graph); - fill_in_distance_index(&distance_index, graph, &snarl_finder, 0); - } +void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set& selected_paths, + bool use_snarl_manager) { // only consider embedded paths that span snarl PathTraversalFinder path_trav_finder(*graph); - // do every snarl top-down. this is because it's possible (tho probably rare) for a child snarl to - // be redundant after normalizing its parent. don't think the opposite (normalizing child) - // causes redundant parent.. todo: can we guarantee?! 
- net_handle_t root = distance_index.get_root(); - deque queue = {root}; - - while (!queue.empty()) { - net_handle_t net_handle = queue.front(); - queue.pop_front(); - if (distance_index.is_snarl(net_handle)) { - net_handle_t start_bound = distance_index.get_bound(net_handle, false, true); - net_handle_t end_bound = distance_index.get_bound(net_handle, true, false); - handle_t start_handle = distance_index.get_handle(start_bound, graph); - handle_t end_handle = distance_index.get_handle(end_bound, graph); + if (use_snarl_manager) { + // compute the snarls using the old snarl manager + IntegratedSnarlFinder finder(*graph); + SnarlManager snarl_manager(std::move(finder.find_snarls_parallel())); + + deque queue; + snarl_manager.for_each_top_level_snarl([&](const Snarl* snarl) { + queue.push_back(snarl); + }); + + while (!queue.empty()) { + const Snarl* snarl = queue.front(); + queue.pop_front(); + handle_t start_handle = graph->get_handle(snarl->start().node_id(), snarl->start().backward()); + handle_t end_handle = graph->get_handle(snarl->end().node_id(), snarl->end().backward()); merge_equivalent_traversals_in_snarl(graph, selected_paths, path_trav_finder, start_handle, end_handle); - } - if (net_handle == root || distance_index.is_snarl(net_handle) || distance_index.is_chain(net_handle)) { - distance_index.for_each_child(net_handle, [&](net_handle_t child_handle) { - queue.push_back(child_handle); - }); + const vector& children = snarl_manager.children_of(snarl); + for (const Snarl* child : children) { + queue.push_back(child); + } + } + } else { + // compute the snarls using the distance index + // this is what we want to do going forward since it uses the new api, no protobuf etc, + // but unfortunately it seems way slower on some graphs, hence + SnarlDistanceIndex distance_index; + { + IntegratedSnarlFinder snarl_finder(*graph); + fill_in_distance_index(&distance_index, graph, &snarl_finder, 0); + } + + // do every snarl top-down. 
this is because it's possible (tho probably rare) for a child snarl to + // be redundant after normalizing its parent. don't think the opposite (normalizing child) + // causes redundant parent.. todo: can we guarantee?! + net_handle_t root = distance_index.get_root(); + deque queue = {root}; + + while (!queue.empty()) { + net_handle_t net_handle = queue.front(); + queue.pop_front(); + if (distance_index.is_snarl(net_handle)) { + net_handle_t start_bound = distance_index.get_bound(net_handle, false, true); + net_handle_t end_bound = distance_index.get_bound(net_handle, true, false); + handle_t start_handle = distance_index.get_handle(start_bound, graph); + handle_t end_handle = distance_index.get_handle(end_bound, graph); + merge_equivalent_traversals_in_snarl(graph, selected_paths, path_trav_finder, start_handle, end_handle); + } + if (net_handle == root || distance_index.is_snarl(net_handle) || distance_index.is_chain(net_handle)) { + distance_index.for_each_child(net_handle, [&](net_handle_t child_handle) { + queue.push_back(child_handle); + }); + } } } } diff --git a/src/traversal_clusters.hpp b/src/traversal_clusters.hpp index 646cbf46409..b35bbb3d730 100644 --- a/src/traversal_clusters.hpp +++ b/src/traversal_clusters.hpp @@ -99,6 +99,9 @@ vector> assign_child_snarls_to_traversals(const PathHandleGraph* gra /// Note: this doesn't modify the graph toplogy, so uncovered nodes and edges as a result of path editing /// would usually need removale with vg clip afterwards /// -void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set& selected_paths); +/// the use_snarl_manager toggles between distnace index and snarl manager for computing snarls +/// (adding this option to (hopefully) temporarily revert to the snarl manager for performance reasons) +void merge_equivalent_traversals_in_graph(MutablePathHandleGraph* graph, const unordered_set& selected_paths, + bool use_snarl_manager=false); } From 
a335429503055975e1be8d38b416797fe7649c2e Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 16:49:30 -0400 Subject: [PATCH 18/54] Make padded window size parity match anchor parity for orientation independence --- src/surjector.cpp | 48 ++++++++++++++++++++++++++++++++++++++++------- src/surjector.hpp | 4 ++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index be0a2e067d5..9c510bde9aa 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -4038,7 +4038,7 @@ using namespace std; for (int i = 0; i < path_chunks.size(); ++i) { auto& chunk = path_chunks[i]; // Mark anchors that are themselves suspicious as not to be kept. - if (chunk.first.first == path_chunks.front().first.first && chunk.first.second == path_chunks.back().first.second + if (chunk.first.first == path_chunks.front().first.first && chunk.first.second == path_chunks.back().first.second // TODO: shouldn't this be || if we want to match *either* tail? && (anchor_lengths[i] <= max_tail_anchor_prune || chunk.first.second - chunk.first.first <= max_tail_anchor_prune)) { #ifdef debug_anchored_surject cerr << "anchor " << i << " (read interval " << (chunk.first.first - sequence.begin()) << " : " << (chunk.first.second - sequence.begin()) << ") pruned for being a short tail" << endl; @@ -4050,19 +4050,53 @@ using namespace std; if ((anchor_lengths[i] <= max_low_complexity_anchor_prune || chunk.first.second - chunk.first.first <= max_low_complexity_anchor_prune)) { SeqComplexity<6> chunk_complexity(chunk.first.first, chunk.first.second); if (chunk.first.second - chunk.first.first < pad_suspicious_anchors_to_length) { - auto read_context_begin = max(sequence.begin(), chunk.first.first - (pad_suspicious_anchors_to_length - (chunk.first.second - chunk.first.first)) / 2); - auto read_context_end = min(sequence.end(), read_context_begin + pad_suspicious_anchors_to_length); - if (read_context_end == sequence.end()) { - // try to ensure enough bases 
if we're near the end of the read - read_context_begin = max(sequence.begin(), read_context_end - pad_suspicious_anchors_to_length); + // We need to fetch out a sequence at least pad_suspicious_anchors_to_length bp long (unless the whole sequence is shorter) and analyze that. + + // There's no way to symmetrically (and thus + // independently of orientation) get an even-length + // window around an odd-length window, or visa versa. + // So express everything as per-side padding, and round it up. + size_t chunk_length = chunk.first.second - chunk.first.first; + size_t padding_per_side = (pad_suspicious_anchors_to_length - chunk_length + 1) / 2; + + // Pad separately on each side to avoid read bounds + size_t left_padding = padding_per_side; + size_t right_padding = padding_per_side; + + // Shift the padded window right if we hit the start + size_t start_offset = chunk.first.first - sequence.begin(); + if (start_offset < left_padding) { + right_padding += (left_padding - start_offset); + left_padding = start_offset; } + + // Shift the padded window left if we hit the end + size_t remaining_until_end = sequence.end() - chunk.first.second; + if (remaining_until_end < right_padding) { + left_padding += (right_padding - remaining_until_end); + right_padding = remaining_until_end; + } + + // If we hit the start again, clip since the whole window doesn't fit. + left_padding = min(left_padding, start_offset); + + // TODO: Is there a more closed-form way to budge the padding? 
+ + // Now expand the iterator range without ever constructing pre-begin iterators + auto read_context_begin = chunk.first.first - left_padding; + auto read_context_end = chunk.first.second + right_padding; + +#ifdef debug_anchored_surject + std::cerr << "For read interval " << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << " length " << (chunk.first.second - chunk.first.first) << " use context " << (read_context_begin - sequence.begin()) << ":" << (read_context_end - sequence.begin()) << " length " << (read_context_end - read_context_begin) << std::endl; +#endif + SeqComplexity<6> context_complexity(read_context_begin, read_context_end); // TODO: repetitive for (int order = 1, max_order = 6; order <= max_order; ++order) { //cerr << "padded anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]), seq " << string(read_context_begin, read_context_end) << ", order " << order << " with p-value " << context_complexity.p_value(order) << ", repetitive fraction " << chunk_complexity.repetitiveness(order) << endl; if (context_complexity.p_value(order) < low_complexity_p_value) { #ifdef debug_anchored_surject - cerr << "anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]) pruned being for having context with low complexity at order " << order << ", p-value " << context_complexity.p_value(order) << " and anchor repetitive fraction " << chunk_complexity.repetitiveness(order) << endl; + cerr << "anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]) pruned for having context with low complexity at order " << order << ", p-value " << context_complexity.p_value(order) << " and anchor repetitive fraction " << chunk_complexity.repetitiveness(order) << endl; #endif // the sequences is repetitive at this order keep[i] = false; 
diff --git a/src/surjector.hpp b/src/surjector.hpp index a582df64128..84bacead647 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -132,6 +132,10 @@ using namespace std; double low_complexity_p_value = .0075; int64_t max_low_complexity_anchor_prune = 40; int64_t max_low_complexity_anchor_trim = 65; + /// When examining anchors for suspiciousness, try and make them at + /// least this long. To ensure orientation symmetry, we will make + /// anchors with the oppsite parity (even if this is odd, or odd if + /// this is even) 1bp longer. int64_t pad_suspicious_anchors_to_length = 12; // A function for computing band padding From cabfe9fc66f31fed06eae727b04c3cb4ea16b61b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 16:50:01 -0400 Subject: [PATCH 19/54] Remove extra debugging --- src/surjector.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index 9c510bde9aa..0c2ff88df30 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -4086,10 +4086,6 @@ using namespace std; auto read_context_begin = chunk.first.first - left_padding; auto read_context_end = chunk.first.second + right_padding; -#ifdef debug_anchored_surject - std::cerr << "For read interval " << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << " length " << (chunk.first.second - chunk.first.first) << " use context " << (read_context_begin - sequence.begin()) << ":" << (read_context_end - sequence.begin()) << " length " << (read_context_end - read_context_begin) << std::endl; -#endif - SeqComplexity<6> context_complexity(read_context_begin, read_context_end); // TODO: repetitive for (int order = 1, max_order = 6; order <= max_order; ++order) { From 68fc4284eaf31de6fcbb62f3aacfa26354e79112 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 17:05:54 -0400 Subject: [PATCH 20/54] Change short tail pruning to only require being one tail and not both --- src/surjector.cpp | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index 0c2ff88df30..954adb0b279 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -4038,8 +4038,8 @@ using namespace std; for (int i = 0; i < path_chunks.size(); ++i) { auto& chunk = path_chunks[i]; // Mark anchors that are themselves suspicious as not to be kept. - if (chunk.first.first == path_chunks.front().first.first && chunk.first.second == path_chunks.back().first.second // TODO: shouldn't this be || if we want to match *either* tail? - && (anchor_lengths[i] <= max_tail_anchor_prune || chunk.first.second - chunk.first.first <= max_tail_anchor_prune)) { + if ((chunk.first.first == path_chunks.front().first.first || chunk.first.second == path_chunks.back().first.second) // Is at either tail + && (anchor_lengths[i] <= max_tail_anchor_prune || chunk.first.second - chunk.first.first <= max_tail_anchor_prune)) { // And is too short #ifdef debug_anchored_surject cerr << "anchor " << i << " (read interval " << (chunk.first.first - sequence.begin()) << " : " << (chunk.first.second - sequence.begin()) << ") pruned for being a short tail" << endl; #endif From 0c826dc84bed3697294a5fda31102b9cd9520a5f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 17:15:54 -0400 Subject: [PATCH 21/54] Add a test for surjection orientation independence --- test/surject/opposite_strands.gaf | 2 ++ test/surject/opposite_strands.gfa | 43 +++++++++++++++++++++++++++++++ test/t/15_vg_surject.t | 4 ++- 3 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 test/surject/opposite_strands.gaf create mode 100644 test/surject/opposite_strands.gfa diff --git a/test/surject/opposite_strands.gaf b/test/surject/opposite_strands.gaf new file mode 100644 index 00000000000..73e07638fe5 --- /dev/null +++ b/test/surject/opposite_strands.gaf @@ -0,0 +1,2 @@ +forward 35 0 35 + >56629809>56629810>56629811>56629814>56629815>56629819>56629820>56629822 132 97 132 35 35 0 cs:Z::35 
+reverse 35 0 35 + <56629822<56629820<56629819<56629815<56629814<56629811<56629810<56629809 132 0 35 35 35 0 cs:Z::35 diff --git a/test/surject/opposite_strands.gfa b/test/surject/opposite_strands.gfa new file mode 100644 index 00000000000..8177180d24c --- /dev/null +++ b/test/surject/opposite_strands.gfa @@ -0,0 +1,43 @@ +H VN:Z:1.1 RS:Z:GRCh38 CHM13 +S 56629807 G +S 56629808 T +S 56629809 TTTCTGATTATAAATATTGCACATGTATTGATTACATAAATCCATATACTATAAAACTGATATTTAAGAGAATAAAAGTCCCAACCTCAGAATTAACTACTG +S 56629810 C +S 56629811 A +S 56629812 A +S 56629813 C +S 56629814 CCCCC +S 56629815 C +S 56629816 G +S 56629817 T +S 56629818 T +S 56629819 T +S 56629820 TTTTTTTTTTT +S 56629821 T +S 56629822 GATGGAGTCT +S 56629823 C +S 56629824 G +W CHM13 0 chr8 81560724 81560860 >56629807>56629809>56629810>56629813>56629814>56629817>56629818>56629819>56629820>56629821>56629822>56629824 +W GRCh38 0 chr8 35183939 35184073 >56629807>56629809>56629810>56629811>56629814>56629815>56629819>56629820>56629822>56629823 +L 56629807 + 56629809 + 0M +L 56629808 + 56629809 + 0M +L 56629809 + 56629812 - 0M +L 56629809 + 56629810 + 0M +L 56629810 + 56629813 + 0M +L 56629810 + 56629811 + 0M +L 56629811 + 56629814 + 0M +L 56629812 - 56629813 + 0M +L 56629813 + 56629814 + 0M +L 56629814 + 56629817 + 0M +L 56629814 + 56629815 + 0M +L 56629815 + 56629819 + 0M +L 56629815 + 56629816 - 0M +L 56629816 - 56629819 + 0M +L 56629817 + 56629818 + 0M +L 56629818 + 56629819 + 0M +L 56629819 + 56629820 + 0M +L 56629820 + 56629822 + 0M +L 56629820 + 56629821 + 0M +L 56629821 + 56629822 + 0M +L 56629822 + 56629824 + 0M +L 56629822 + 56629823 + 0M diff --git a/test/t/15_vg_surject.t b/test/t/15_vg_surject.t index 6e0ddff37e2..3704ec823c2 100644 --- a/test/t/15_vg_surject.t +++ b/test/t/15_vg_surject.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 47 +plan tests 48 vg construct -r small/x.fa >j.vg vg index -x j.xg j.vg @@ -198,3 +198,5 @@ is "$(vg surject -x x.xg -M -m -s -i -t 1 
mapped.gamp | grep -v '@' | wc -l)" 80 rm x.vg x.pathdup.vg x.xg x.gcsa x.gcsa.lcp x.gam mapped.gam mapped.gamp +is "$(vg surject -p CHM13#0#chr8 -x surject/opposite_strands.gfa --prune-low-cplx --sam-output --gaf-input surject/opposite_strands.gaf | grep -v "^@" | cut -f3-12 | sort | uniq | wc -l)" "1" "vg surject low compelxity pruning gets the same alignment regardless of read orientation" + From b0304a29c177d6dae88ac7c92e4198e1111f8b85 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 25 Oct 2024 16:34:46 -0400 Subject: [PATCH 22/54] Add tests for chunk on haplotype paths --- test/t/30_vg_chunk.t | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t index 20c8466ea88..3eccc903a2e 100644 --- a/test/t/30_vg_chunk.t +++ b/test/t/30_vg_chunk.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 32 +plan tests 33 # Construct a graph with alt paths so we can make a GBWT and a GBZ vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz -a >x.vg @@ -126,5 +126,14 @@ is "$?" 0 "components finds subgraphs" rm -f xy.vg x.vg y.vg x_nodes.txt y_nodes.txt convert path_chunk_x.vg convert path_chunk_y.vg pc_x_nodes.txt pc_y_nodes.txt x_paths.txt pc_x_paths.txt components_chunk_0.vg components_chunk_1.vg comp_0_nodes.txt comp_1_nodes.txt comp_nodes.txt nodes.txt x.gam y.gam xy.gam path_chunk_x.gam path_chunk_y.gam +vg gbwt --gbz-format --graph-name graph.gbz --gfa-input graphs/gfa_with_reference.gfa +vg chunk -x graph.gbz -p sample1#1#chr1#0:1-2 -c 1 >/dev/null 2>log.txt +grep "out_of_range" log.txt +is "$?" "1" "chunking on a haplotype path does not produce an out of range error" +grep "not found in" log.txt +is "$?" 
"1" "chunking on a haplotype path does not produce a path not found error" + +rm -f graph.gbz log.txt + From a04b6bcace475e23aefaf6a272643219c843f29c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 25 Oct 2024 17:59:19 -0400 Subject: [PATCH 23/54] Allow vg chunk to reference-index particular hapolotype paths requested --- deps/libbdsg | 2 +- src/subcommand/chunk_main.cpp | 78 ++++++++++++++++++++--------------- test/t/30_vg_chunk.t | 7 ++-- 3 files changed, 49 insertions(+), 38 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index 33e2fc213eb..fd5c186d04c 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 33e2fc213eb492a0db38a30ecc1b3b4e3146e769 +Subproject commit fd5c186d04c9e00c088b4b3c98f687b82e3e6e15 diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index 22f6e02651e..57397ed6713 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -349,6 +349,36 @@ int main_chunk(int argc, char** argv) { bool chunk_gam = !gam_files.empty() && gam_split_size == 0; bool chunk_graph = gam_and_graph || (!chunk_gam && gam_split_size == 0); + // parse the regions into a list before loading the graph, if we're + // specifying regions by path name. 
+ vector regions; + if (!region_strings.empty()) { + for (auto& region_string : region_strings) { + Region region; + parse_region(region_string, region); + regions.push_back(region); + } + } + if (!path_list_file.empty()) { + ifstream pr_stream(path_list_file.c_str()); + if (!pr_stream) { + cerr << "error:[vg chunk] unable to open path regions: " << path_list_file << endl; + return 1; + } + while (pr_stream) { + string buf; + std::getline(pr_stream, buf); + if (!buf.empty()) { + Region region; + parse_region(buf, region); + regions.push_back(region); + } + } + } + if (!in_bed_file.empty()) { + parse_bed_regions(in_bed_file, regions); + } + // Load the snarls unique_ptr snarl_manager; if (!snarl_filename.empty()) { @@ -377,9 +407,17 @@ int main_chunk(int argc, char** argv) { return 1; } in.close(); + + // To support the regions we were asked for, we might need to ensure + // the paths they are on are actually indexed for reference style + // offset lookups. + std::unordered_set ensure_indexed; + for (auto& region : regions) { + ensure_indexed.insert(region.seq); + } path_handle_graph = vg::io::VPKG::load_one(xg_file); - graph = overlay_helper.apply(path_handle_graph.get()); + graph = overlay_helper.apply(path_handle_graph.get(), ensure_indexed); in.close(); } @@ -463,35 +501,7 @@ int main_chunk(int argc, char** argv) { // (instead of an index) unordered_map node_to_component; - // parse the regions into a list - vector regions; - if (!region_strings.empty()) { - for (auto& region_string : region_strings) { - Region region; - parse_region(region_string, region); - regions.push_back(region); - } - } - else if (!path_list_file.empty()) { - ifstream pr_stream(path_list_file.c_str()); - if (!pr_stream) { - cerr << "error:[vg chunk] unable to open path regions: " << path_list_file << endl; - return 1; - } - while (pr_stream) { - string buf; - std::getline(pr_stream, buf); - if (!buf.empty()) { - Region region; - parse_region(buf, region); - regions.push_back(region); - } 
- } - } - else if (!in_bed_file.empty()) { - parse_bed_regions(in_bed_file, regions); - } - else if (id_range) { + if (id_range) { if (n_chunks) { // Determine the ranges from the source graph itself. // how many nodes per range? @@ -556,9 +566,9 @@ int main_chunk(int argc, char** argv) { delete range_stream; } } - else if (graph != nullptr && (!components || path_components)) { - // every path - graph->for_each_path_handle([&](path_handle_t path_handle) { + if (graph != nullptr && path_components) { + // every reference or generic path (guaranteed to be reference indexed) + graph->for_each_path_matching({PathSense::REFERENCE, PathSense::GENERIC}, {}, {}, [&](path_handle_t path_handle) { Region region; region.seq = graph->get_path_name(path_handle); if (!Paths::is_alt(region.seq)) { @@ -596,7 +606,7 @@ int main_chunk(int argc, char** argv) { if (!id_range) { for (auto& region : regions) { if (!graph->has_path(region.seq)) { - cerr << "error[vg chunk]: input path " << region.seq << " not found in xg index" << endl; + cerr << "error[vg chunk]: input path " << region.seq << " not found in graph" << endl; return 1; } region.start = max((int64_t)0, region.start); diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t index 3eccc903a2e..142de98066a 100644 --- a/test/t/30_vg_chunk.t +++ b/test/t/30_vg_chunk.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 33 +plan tests 35 # Construct a graph with alt paths so we can make a GBWT and a GBZ vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz -a >x.vg @@ -127,13 +127,14 @@ is "$?" 
0 "components finds subgraphs" rm -f xy.vg x.vg y.vg x_nodes.txt y_nodes.txt convert path_chunk_x.vg convert path_chunk_y.vg pc_x_nodes.txt pc_y_nodes.txt x_paths.txt pc_x_paths.txt components_chunk_0.vg components_chunk_1.vg comp_0_nodes.txt comp_1_nodes.txt comp_nodes.txt nodes.txt x.gam y.gam xy.gam path_chunk_x.gam path_chunk_y.gam vg gbwt --gbz-format --graph-name graph.gbz --gfa-input graphs/gfa_with_reference.gfa -vg chunk -x graph.gbz -p sample1#1#chr1#0:1-2 -c 1 >/dev/null 2>log.txt +vg chunk -x graph.gbz -p sample1#1#chr1#0:1-2 -c 1 >part.vg 2>log.txt grep "out_of_range" log.txt is "$?" "1" "chunking on a haplotype path does not produce an out of range error" grep "not found in" log.txt is "$?" "1" "chunking on a haplotype path does not produce a path not found error" +is "$(vg stats -z part.vg | grep nodes | cut -f2)" "4" "chunking on a haplotype produces the correct size graph" -rm -f graph.gbz log.txt +rm -f graph.gbz part.vg log.txt From 9f2cd0bcaf3c8278d11670cdb3e26855e17ae510 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 10:11:21 -0400 Subject: [PATCH 24/54] Add -o to vg paths to look at paths through a reference overlay --- src/subcommand/paths_main.cpp | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index fb668d7edd4..cda4be4256b 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -18,6 +18,7 @@ #include "../gbwt_helper.hpp" #include "../traversal_clusters.hpp" #include "../io/save_handle_graph.hpp" +#include #include #include #include @@ -57,6 +58,8 @@ void help_paths(char** argv) { << " -G, --generic-paths select the generic, non-reference, non-haplotype paths" << endl << " -R, --reference-paths select the reference paths" << endl << " -H, --haplotype-paths select the haplotype paths paths" << endl + << " configuration:" << endl + << " -o, --overlay apply a ReferencePathOverlayHelper to 
the graph" << endl << " -t, --threads N number of threads to use [all available]. applies only to snarl finding within -n" << endl; } @@ -122,6 +125,7 @@ int main_paths(int argc, char** argv) { bool coverage = false; const size_t coverage_bins = 10; bool normalize_paths = false; + bool overlay = false; int c; optind = 2; // force optind past command positional argument @@ -151,6 +155,7 @@ int main_paths(int argc, char** argv) { {"reference-paths", no_argument, 0, 'R'}, {"haplotype-paths", no_argument, 0, 'H'}, {"coverage", no_argument, 0, 'c'}, + {"overlay", no_argument, 0, 'o'}, // Hidden options for backward compatibility. {"threads", no_argument, 0, 'T'}, @@ -160,7 +165,7 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drnaGRHp:ct:", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:drnaGRHp:coTq:t:", long_options, &option_index); // Detect the end of the options. @@ -276,6 +281,10 @@ int main_paths(int argc, char** argv) { output_formats++; break; + case 'o': + overlay = true; + break; + case 'T': std::cerr << "warning: [vg paths] option --threads is obsolete and unnecessary" << std::endl; break; @@ -375,11 +384,23 @@ int main_paths(int argc, char** argv) { // Load whatever indexes we were given // Note: during handlifiction, distinction between -v and -x options disappeared. - unique_ptr graph; + unique_ptr path_handle_graph; if (!graph_file.empty()) { // Load the graph - graph = vg::io::VPKG::load_one(graph_file); + path_handle_graph = vg::io::VPKG::load_one(graph_file); + } + bdsg::ReferencePathOverlayHelper overlay_helper; + PathHandleGraph* graph = nullptr; + if (path_handle_graph) { + if (overlay) { + // Try to apply the overlay if the user wanted it and this isn't + // already a graph implementing position lookups. 
+ graph = overlay_helper.apply(path_handle_graph.get()); + } else { + graph = path_handle_graph.get(); + } } + unique_ptr gbwt_index; if (!gbwt_file.empty()) { // We want a gbwt @@ -418,7 +439,7 @@ int main_paths(int argc, char** argv) { if (extract_as_gam || extract_as_gaf) { // Open up a GAM/GAF output stream aln_emitter = vg::io::get_non_hts_alignment_emitter("-", extract_as_gaf ? "GAF" : "GAM", {}, get_thread_count(), - graph.get()); + graph); } else if (extract_as_vg) { // Open up a VG Graph chunk output stream graph_emitter = unique_ptr>(new vg::io::ProtobufEmitter(cout)); @@ -589,7 +610,7 @@ int main_paths(int argc, char** argv) { }; if (drop_paths || retain_paths || normalize_paths) { - MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph.get()); + MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph); if (!mutable_graph) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; exit(1); From b4f5c3ee9009152941a1dccc3ba45a60cae122a8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 11:00:51 -0400 Subject: [PATCH 25/54] Update error and bump libbdsg Update vg deconstruct error message to not be readable as about missing _alt_ paths, and use a libbdsg where the reference path overlay indexes all non-hidden paths (including e.g. haplotype paths from GFA). 
--- deps/libbdsg | 2 +- src/deconstructor.cpp | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/deps/libbdsg b/deps/libbdsg index fd5c186d04c..88181dbc808 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit fd5c186d04c9e00c088b4b3c98f687b82e3e6e15 +Subproject commit 88181dbc80835e93de681e84e245c891e250f77c diff --git a/src/deconstructor.cpp b/src/deconstructor.cpp index aa052db955a..0f15b8139d7 100644 --- a/src/deconstructor.cpp +++ b/src/deconstructor.cpp @@ -1168,9 +1168,10 @@ string Deconstructor::get_vcf_header() { } if (sample_to_haps.empty()) { - cerr << "Error [vg deconstruct]: No paths found for alt alleles in the graph. Note that " - << "exhaustive path-free traversal finding is no longer supported, and vg deconstruct " - << "now only works on embedded paths and GBWT threads." << endl; + cerr << "Error [vg deconstruct]: No paths other than selected reference(s) found in the graph, " + << "so no alt alleles can be generated. Note that exhaustive path-free traversal finding " + << "is no longer supported, and vg deconstruct now only works on embedded paths and GBWT " + << "threads." << endl; exit(1); } From 151ae72d23fe65f00c29868d2b9dac59c3835581 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 11:13:55 -0400 Subject: [PATCH 26/54] Bump macos version for tests --- .github/workflows/testmac.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/testmac.yml b/.github/workflows/testmac.yml index 11d656e7a05..802d656ccfb 100644 --- a/.github/workflows/testmac.yml +++ b/.github/workflows/testmac.yml @@ -15,7 +15,7 @@ on: jobs: testmac: name: Test on Mac - runs-on: macos-12 + runs-on: macos-15 steps: - name: Use cache @@ -26,14 +26,14 @@ jobs: lib include bin - key: ${{ runner.os }}-12-${{ github.ref }} + key: ${{ runner.os }}-15-${{ github.ref }} # Restore keys are a "list", but really only a multiline string is # accepted. Also we match by prefix. 
And the most recent cache is # used, not the most specific. # See: https://docs.github.com/en/actions/guides/caching-dependencies-to-speed-up-workflows#matching-a-cache-key restore-keys: | - ${{ runner.os }}-12-${{ github.base_ref }} - ${{ runner.os }}-12 + ${{ runner.os }}-15-${{ github.base_ref }} + ${{ runner.os }}-15 - name: Checkout code without submodules uses: actions/checkout@v2 From 23364decdc3832b70cceee02de5fb1869ebed346 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 11:51:51 -0400 Subject: [PATCH 27/54] Remove source_me.sh which makes system samtools use our dylibs --- Dockerfile | 7 +-- Makefile | 139 +++++++++++++++++++++---------------------- README.md | 4 +- configure.py | 44 -------------- deps/libbdsg | 2 +- scripts/setup-server | 2 +- source_me.sh | 23 ------- test/Makefile | 2 +- vgci/vgci.sh | 1 - 9 files changed, 77 insertions(+), 147 deletions(-) delete mode 100644 configure.py delete mode 100755 source_me.sh diff --git a/Dockerfile b/Dockerfile index ff1433eccfc..cf4279d6045 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,7 +48,6 @@ RUN apt-get -qq -y update && apt-get -qq -y upgrade && apt-get -qq -y install \ ###DEPS_END### # Prepare to build submodule dependencies -COPY source_me.sh /vg/source_me.sh COPY deps /vg/deps # To increase portability of the docker image, when building for amd64, set the # target CPU architecture to Nehalem (2008) rather than auto-detecting the @@ -59,17 +58,17 @@ RUN if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then sed -i s/m RUN find . -name CMakeCache.txt | xargs rm -f # Build the dependencies COPY Makefile /vg/Makefile -RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" CFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? 
THREADS : $(nproc))) deps +RUN CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" CFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) deps # Bring in the sources, which we need in order to build. COPY src /vg/src # Build all the object files for vg, but don't link. # Also pass the arch here -RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) objs +RUN CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) objs # Do the final build and link, knowing the version. Trim down the resulting binary but make sure to include enough debug info for profiling. -RUN . ./source_me.sh && CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) static && strip -d bin/vg +RUN CXXFLAGS="$(if [ -z "${TARGETARCH}" ] || [ "${TARGETARCH}" = "amd64" ] ; then echo " -march=nehalem "; fi)" make -j $((THREADS < $(nproc) ? THREADS : $(nproc))) static && strip -d bin/vg # Ship the scripts COPY scripts /vg/scripts diff --git a/Makefile b/Makefile index 67c1e556f0e..ff91be32fa5 100644 --- a/Makefile +++ b/Makefile @@ -494,12 +494,12 @@ $(LIB_DIR)/libvg.$(SHARED_SUFFIX): $(LIBVG_SHARED_DEPS) # Each test set can have its own binary, and not link everything static $(UNITTEST_EXE): $(UNITTEST_BIN_DIR)/%: $(UNITTEST_OBJ_DIR)/%.o $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) - . 
./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $@ $< $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $@ $< $(UNITTEST_SUPPORT_OBJ) $(CONFIG_OBJ) $(LIB_DIR)/libvg.$(SHARED_SUFFIX) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LD_LIB_FLAGS) $(LD_STATIC_LIB_FLAGS) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) # For a normal dynamic build we remove the static build marker $(BIN_DIR)/$(EXE): $(LIB_DIR)/libvg.a $(EXE_DEPS) -rm -f $(LIB_DIR)/vg_is_static - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$(EXE) $(OBJ_DIR)/main.o $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(LD_STATIC_LIB_DEPS) $(LD_EXE_LIB_FLAGS) # We keep a file that we touch on the last static build. # If the vg linkables are newer than the last static build, we do a build $(LIB_DIR)/vg_is_static: $(OBJ_DIR)/main.o $(LIB_DIR)/libvg.a $(UNITTEST_OBJ) $(SUBCOMMAND_OBJ) $(CONFIG_OBJ) $(DEPS) $(LINK_DEPS) @@ -527,10 +527,9 @@ get-deps: deps: $(DEPS) test: $(BIN_DIR)/$(EXE) $(LIB_DIR)/libvg.a test/build_graph $(BIN_DIR)/shuf $(BIN_DIR)/vcf2tsv $(FASTAHACK_DIR)/fastahack $(BIN_DIR)/rapper - . ./source_me.sh && cd test && prove -v t + cd test && prove -v t # Hide the compiler configuration from the doc tests, so that the ones that - # build code can't pick up libraries out of the bg build itself. Definitely - # don't source source_me.sh! 
+ # build code can't pick up libraries out of the vg build itself. # But still supply vg on the PATH. Hope it knows where its own libraries are. CFLAGS= CXXFLAGS= CPPFLAGS= LDFLAGS= INCLUDE_FLAGS= LIBRARY_PATH= LD_LIBRARY_PATH= DYLD_LIBRARY_PATH= DYLD_FALLBACK_LIBRARY_PATH= LD_INCLUDE_PATH= CC= CXX= CXX_STANDARD= PATH=$(CWD)/bin:$(PATH) doc/test-docs.sh @@ -557,67 +556,67 @@ else endif test/build_graph: test/build_graph.cpp $(LIB_DIR)/libvg.a $(SRC_DIR)/vg.hpp - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o test/build_graph test/build_graph.cpp $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) -o test/build_graph test/build_graph.cpp $(LD_LIB_DIR_FLAGS) $(LDFLAGS) $(LIB_DIR)/libvg.a $(LD_LIB_FLAGS) $(START_STATIC) $(LD_STATIC_LIB_FLAGS) $(END_STATIC) $(FILTER) # TODO: The normal and debug jemalloc builds can't safely be run at the same time. $(LIB_DIR)/libjemalloc.a: $(JEMALLOC_DIR)/src/*.c - +. ./source_me.sh && rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ + +rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/ && cp -r include/* $(CWD)/$(INC_DIR)/ $(LIB_DIR)/libjemalloc_debug.a: $(JEMALLOC_DIR)/src/*.c - +. 
./source_me.sh && rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --enable-debug --enable-fill --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/libjemalloc_debug.a && cp -r include/* $(CWD)/$(INC_DIR)/ + +rm -f $(LIB_DIR)/libjemalloc*.* && rm -Rf $(CWD)/$(INC_DIR)/jemalloc && cd $(JEMALLOC_DIR) && ./autogen.sh && ./configure --enable-prof --disable-libdl --enable-debug --enable-fill --prefix=`pwd` $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && cp lib/libjemalloc.a $(CWD)/$(LIB_DIR)/libjemalloc_debug.a && cp -r include/* $(CWD)/$(INC_DIR)/ # Use fake patterns to tell Make that this rule generates all these files when run once. # Here % should always match "lib" which is a common substring. # See https://stackoverflow.com/a/19822767 $(LIB_DIR)/%sdsl.a $(LIB_DIR)/%divsufsort.a $(LIB_DIR)/%divsufsort64.a : $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) else - +. ./source_me.sh && cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) endif $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.cpp $(SSW_DIR)/*.h - +. 
./source_me.sh && cd $(SSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(INC_DIR) + +cd $(SSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && ar rs $(CWD)/$(LIB_DIR)/libssw.a ssw.o ssw_cpp.o && cp ssw_cpp.h ssw.h $(CWD)/$(INC_DIR) # We need to hide -Xpreprocessor -fopenmp from Snappy, at least on Mac, because # it will drop the -Xpreprocessor and keep the -fopenmp and upset Clang. $(LIB_DIR)/libsnappy.a: $(SNAPPY_DIR)/*.cc $(SNAPPY_DIR)/*.h - +. ./source_me.sh && cd $(SNAPPY_DIR) && ./autogen.sh && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" ./configure --prefix=$(CWD) $(FILTER) && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" $(MAKE) libsnappy.la $(FILTER) && cp .libs/libsnappy.a $(CWD)/lib/ && cp snappy-c.h snappy-sinksource.h snappy-stubs-public.h snappy.h $(CWD)/include/ + +cd $(SNAPPY_DIR) && ./autogen.sh && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" ./configure --prefix=$(CWD) $(FILTER) && CXXFLAGS="-fPIC $(filter-out -Xpreprocessor -fopenmp,$(CXXFLAGS))" $(MAKE) libsnappy.la $(FILTER) && cp .libs/libsnappy.a $(CWD)/lib/ && cp snappy-c.h snappy-sinksource.h snappy-stubs-public.h snappy.h $(CWD)/include/ $(INC_DIR)/gcsa/gcsa.h: $(LIB_DIR)/libgcsa2.a $(LIB_DIR)/libgcsa2.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GCSA2_DIR)/*.cpp) $(wildcard $(GCSA2_DIR)/include/gcsa/*.h) ifeq ($(shell uname -s),Darwin) - +. 
./source_me.sh && cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) + +cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) + +cp -r $(GCSA2_DIR)/include/gcsa $(CWD)/$(INC_DIR)/ && cd $(GCSA2_DIR) && $(MAKE) clean && make directories && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) lib/libgcsa2.a $(FILTER) && mv lib/libgcsa2.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/gbwt/dynamic_gbwt.h: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libgbwt.a: $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(wildcard $(GBWT_DIR)/src/*.cpp) $(wildcard $(GBWT_DIR)/include/gbwt/*.h) ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) else - +. 
./source_me.sh && cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWT_DIR)/include/gbwt $(CWD)/$(INC_DIR)/ && cd $(GBWT_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwt.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/gbwtgraph/gbwtgraph.h: $(LIB_DIR)/libgbwtgraph.a $(LIB_DIR)/libgbwtgraph.a: $(LIB_DIR)/libgbwt.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(LIB_DIR)/libhandlegraph.a $(wildcard $(GBWTGRAPH_DIR)/src/*.cpp) $(wildcard $(GBWTGRAPH_DIR)/include/gbwtgraph/*.h) ifeq ($(shell uname -s),Darwin) - +. ./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) + +cp -r $(GBWTGRAPH_DIR)/include/gbwtgraph $(CWD)/$(INC_DIR)/ && cd $(GBWTGRAPH_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && mv lib/libgbwtgraph.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/kff_io.hpp: $(LIB_DIR)/libkff.a $(LIB_DIR)/libkff.a: $(KFF_DIR)/kff_io.cpp $(KFF_DIR)/kff_io.hpp.in ifeq ($(shell uname -s),Darwin) - +. 
./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) else - +. ./source_me.sh && cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/BooPHF.h: $(BBHASH_DIR)/BooPHF.h @@ -627,17 +626,17 @@ $(INC_DIR)/progress_bar.hpp: $(PROGRESS_BAR_DIR)/progress_bar.hpp +cp $(PROGRESS_BAR_DIR)/progress_bar.hpp $(CWD)/$(INC_DIR) $(OBJ_DIR)/progress_bar.o: $(PROGRESS_BAR_DIR)/progress_bar.cpp $(PROGRESS_BAR_DIR)/*.hpp - +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(SHARED_OBJ_DIR)/progress_bar.o: $(PROGRESS_BAR_DIR)/progress_bar.cpp $(PROGRESS_BAR_DIR)/*.hpp - +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(INC_DIR)/Fasta.h: $(FASTAHACK_DIR)/Fasta.h - +. ./source_me.sh && cd $(FASTAHACK_DIR) && cp Fasta.h $(CWD)/$(INC_DIR) + +cd $(FASTAHACK_DIR) && cp Fasta.h $(CWD)/$(INC_DIR) $(OBJ_DIR)/Fasta.o: $(FASTAHACK_DIR)/Fasta.cpp $(INC_DIR)/Fasta.h - +. 
./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(FILTER) + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $< $(FILTER) $(SHARED_OBJ_DIR)/Fasta.o: $(FASTAHACK_DIR)/Fasta.cpp $(INC_DIR)/Fasta.h - +. ./source_me.sh && $(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) + +$(CXX) -I$(FASTAHACK_DIR) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) # We have this target to clean up the old Protobuf we used to have. # We can remove it after we no longer care about building properly on a dirty @@ -663,10 +662,10 @@ $(LIB_DIR)/cleaned_old_elfutils: $(LIB_DIR)/libvgio.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/pkgconfig/htslib.pc $(LIB_DIR)/cleaned_old_protobuf_v003 $(LIBVGIO_DIR)/CMakeLists.txt $(LIBVGIO_DIR)/src/*.cpp $(LIBVGIO_DIR)/include/vg/io/*.hpp $(LIBVGIO_DIR)/deps/vg.proto +rm -f $(CWD)/$(INC_DIR)/vg.pb.h $(CWD)/$(INC_DIR)/vg/vg.pb.h +rm -Rf $(CWD)/$(INC_DIR)/vg/io/ - +. ./source_me.sh && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. 
$(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install + +export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. $(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install $(LIB_DIR)/libhandlegraph.a: $(LIBHANDLEGRAPH_DIR)/src/include/handlegraph/*.hpp $(LIBHANDLEGRAPH_DIR)/src/*.cpp - +. ./source_me.sh && cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. && $(MAKE) $(FILTER) && $(MAKE) install # On Linux, libdeflate builds a .so. @@ -682,7 +681,7 @@ ifeq ($(shell uname -s),Darwin) endif $(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDEFLATE_DIR)/lib/*/*.h $(LIBDEFLATE_DIR)/lib/*.c $(LIBDEFLATE_DIR)/lib/*/*.c - +. ./source_me.sh && cd $(LIBDEFLATE_DIR) && V=1 LDFLAGS="$(LDFLAGS) $(LD_RENAMEABLE_FLAGS)" $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR) + +cd $(LIBDEFLATE_DIR) && V=1 LDFLAGS="$(LDFLAGS) $(LD_RENAMEABLE_FLAGS)" $(MAKE) $(FILTER) && cp libdeflate.a $(CWD)/$(LIB_DIR) && cp libdeflate.h $(CWD)/$(INC_DIR) # We build htslib after libdeflate so it can use libdeflate. 
# We need to do some wizardry to get it to pick up the right build and target system types on modern autotools. @@ -695,13 +694,13 @@ $(LIB_DIR)/libdeflate.a: $(LIBDEFLATE_DIR)/*.h $(LIBDEFLATE_DIR)/lib/*.h $(LIBDE # a system path, in case another htslib is installed on the system. Some HTSlib # headers look for the current HTSlib with <>. $(LIB_DIR)/libhts%a $(LIB_DIR)/pkgconfig/htslib%pc $(LIB_DIR)/libhts%$(SHARED_SUFFIX): $(LIB_DIR)/libdeflate.a $(LIB_DIR)/libdeflate.$(SHARED_SUFFIX) $(HTSLIB_DIR)/*.c $(HTSLIB_DIR)/*.h $(HTSLIB_DIR)/htslib/*.h $(HTSLIB_DIR)/cram/*.c $(HTSLIB_DIR)/cram/*.h - +. ./source_me.sh && cd $(HTSLIB_DIR) && rm -Rf $(CWD)/$(INC_DIR)/htslib $(CWD)/$(LIB_DIR)/libhts* && autoreconf -i && autoheader && autoconf || true - +. ./source_me.sh && cd $(HTSLIB_DIR) && (./configure -n 2>&1 || true) | grep "build system type" | rev | cut -f1 -d' ' | rev >systype.txt - +. ./source_me.sh && cd $(HTSLIB_DIR) && CFLAGS="-I$(CWD)/$(HTSLIB_DIR) -isystem $(CWD)/$(HTSLIB_DIR) -I$(CWD)/$(INC_DIR) $(CFLAGS)" LDFLAGS="$(LDFLAGS) -L$(CWD)/$(LIB_DIR) $(LD_UTIL_RPATH_FLAGS)" ./configure --with-libdeflate --disable-s3 --disable-gcs --disable-libcurl --disable-plugins --prefix=$(CWD) --host=$$(cat systype.txt) $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(HTSLIB_DIR) && rm -Rf $(CWD)/$(INC_DIR)/htslib $(CWD)/$(LIB_DIR)/libhts* && autoreconf -i && autoheader && autoconf || true + +cd $(HTSLIB_DIR) && (./configure -n 2>&1 || true) | grep "build system type" | rev | cut -f1 -d' ' | rev >systype.txt + +cd $(HTSLIB_DIR) && CFLAGS="-I$(CWD)/$(HTSLIB_DIR) -isystem $(CWD)/$(HTSLIB_DIR) -I$(CWD)/$(INC_DIR) $(CFLAGS)" LDFLAGS="$(LDFLAGS) -L$(CWD)/$(LIB_DIR) $(LD_UTIL_RPATH_FLAGS)" ./configure --with-libdeflate --disable-s3 --disable-gcs --disable-libcurl --disable-plugins --prefix=$(CWD) --host=$$(cat systype.txt) $(FILTER) && $(MAKE) clean && $(MAKE) $(FILTER) && $(MAKE) install # Build and install tabixpp for vcflib. 
$(LIB_DIR)/libtabixpp.a: $(LIB_DIR)/libhts.a $(TABIXPP_DIR)/*.cpp $(TABIXPP_DIR)/*.hpp - +. ./source_me.sh && cd $(TABIXPP_DIR) && rm -f tabix.o libtabixpp.a && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" INCLUDES="-I$(CWD)/$(INC_DIR)" HTS_HEADERS="" $(MAKE) tabix.o $(FILTER) && ar rcs libtabixpp.a tabix.o + +cd $(TABIXPP_DIR) && rm -f tabix.o libtabixpp.a && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" INCLUDES="-I$(CWD)/$(INC_DIR)" HTS_HEADERS="" $(MAKE) tabix.o $(FILTER) && ar rcs libtabixpp.a tabix.o +cp $(TABIXPP_DIR)/libtabixpp.a $(LIB_DIR) && cp $(TABIXPP_DIR)/tabix.hpp $(INC_DIR) +echo "Name: tabixpp" > $(LIB_DIR)/pkgconfig/tabixpp.pc +echo "Description: Self-packaged tabixpp" >> $(LIB_DIR)/pkgconfig/tabixpp.pc @@ -717,7 +716,7 @@ $(LIB_DIR)/libtabixpp.a: $(LIB_DIR)/libhts.a $(TABIXPP_DIR)/*.cpp $(TABIXPP_DIR) # We need to use /usr first for CMake search or Ubuntu 22.04 will decide pybind11 is installed in / when actually it is only fully installed in /usr. $(LIB_DIR)/libvcflib.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libtabixpp.a $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.hpp $(VCFLIB_DIR)/contrib/*/*.cpp $(VCFLIB_DIR)/contrib/*/*.h +rm -f $(VCFLIB_DIR)/contrib/WFA2-lib/VERSION - +. ./source_me.sh && cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build . + +cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. 
&& cmake --build . +cp $(VCFLIB_DIR)/contrib/filevercmp/*.h* $(INC_DIR) +cp $(VCFLIB_DIR)/contrib/fastahack/*.h* $(INC_DIR) +cp $(VCFLIB_DIR)/contrib/smithwaterman/*.h* $(INC_DIR) @@ -731,10 +730,10 @@ $(BIN_DIR)/vcf2tsv: $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.h $(LIB_DIR)/lib +cp $(VCFLIB_DIR)/build/vcf2tsv $(BIN_DIR) $(FASTAHACK_DIR)/fastahack: $(FASTAHACK_DIR)/*.c $(FASTAHACK_DIR)/*.h $(FASTAHACK_DIR)/*.cpp - +. ./source_me.sh && cd $(FASTAHACK_DIR) && $(MAKE) $(FILTER) + +cd $(FASTAHACK_DIR) && $(MAKE) $(FILTER) $(LIB_DIR)/libgssw.a: $(GSSW_DIR)/src/gssw.c $(GSSW_DIR)/src/gssw.h - +. ./source_me.sh && cd $(GSSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libgssw.a $(CWD)/$(LIB_DIR)/ && cp src/gssw.h $(CWD)/$(INC_DIR)/ + +cd $(GSSW_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libgssw.a $(CWD)/$(LIB_DIR)/ && cp src/gssw.h $(CWD)/$(INC_DIR)/ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc +cd $(DEP_DIR)/lru_cache && cp *.h* $(CWD)/$(INC_DIR)/ @@ -749,26 +748,26 @@ $(INC_DIR)/dynamic/dynamic.hpp: $(DYNAMIC_DIR)/include/dynamic/*.hpp $(DYNAMIC_D +mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/dynamic/* $(INC_DIR)/dynamic/ $(INC_DIR)/sparsehash/sparse_hash_map: $(wildcard $(SPARSEHASH_DIR)/**/*.cc) $(wildcard $(SPARSEHASH_DIR)/**/*.h) - +. ./source_me.sh && cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(SPARSEHASH_DIR) && ./autogen.sh && LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" ./configure --prefix=$(CWD) $(FILTER) && $(MAKE) $(FILTER) && $(MAKE) install $(INC_DIR)/sparsepp/spp.h: $(wildcard $(SPARSEHASH_DIR)/sparsepp/*.h) +cp -r $(SPARSEPP_DIR)/sparsepp $(INC_DIR)/ #$(INC_DIR)/Variant.h $(LIB_DIR)/libvcfh.a: $(DEP_DIR)/libVCFH/*.cpp $(DEP_DIR)/libVCFH/*.hpp - +. 
./source_me.sh && cd $(DEP_DIR)/libVCFH && $(MAKE) $(FILTER) && cp libvcfh.a $(CWD)/$(LIB_DIR)/ && cp vcfheader.hpp $(CWD)/$(INC_DIR)/ + +cd $(DEP_DIR)/libVCFH && $(MAKE) $(FILTER) && cp libvcfh.a $(CWD)/$(LIB_DIR)/ && cp vcfheader.hpp $(CWD)/$(INC_DIR)/ $(LIB_DIR)/libsonlib.a: $(CWD)/$(DEP_DIR)/sonLib/C/inc/*.h $(CWD)/$(DEP_DIR)/sonLib/C/impl/*.c - +. ./source_me.sh && cd $(DEP_DIR)/sonLib && $(MAKE) clean && kyotoTycoonLib="" CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/sonLib.a $(CWD)/$(LIB_DIR)/libsonlib.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib + +cd $(DEP_DIR)/sonLib && $(MAKE) clean && kyotoTycoonLib="" CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/sonLib.a $(CWD)/$(LIB_DIR)/libsonlib.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib $(LIB_DIR)/libpinchesandcacti.a: $(LIB_DIR)/libsonlib.a $(CWD)/$(DEP_DIR)/pinchesAndCacti/inc/*.h $(CWD)/$(DEP_DIR)/pinchesAndCacti/impl/*.c - +. ./source_me.sh && cd $(DEP_DIR)/pinchesAndCacti && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cd $(CWD)/$(DEP_DIR)/sonLib && cp lib/stPinchesAndCacti.a $(CWD)/$(LIB_DIR)/libpinchesandcacti.a && cp lib/3EdgeConnected.a $(CWD)/$(LIB_DIR)/lib3edgeconnected.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib + +cd $(DEP_DIR)/pinchesAndCacti && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cd $(CWD)/$(DEP_DIR)/sonLib && cp lib/stPinchesAndCacti.a $(CWD)/$(LIB_DIR)/libpinchesandcacti.a && cp lib/3EdgeConnected.a $(CWD)/$(LIB_DIR)/lib3edgeconnected.a && mkdir -p $(CWD)/$(INC_DIR)/sonLib && cp lib/*.h $(CWD)/$(INC_DIR)/sonLib # When building raptor we need to make sure to pre-generate and fix up the lexer # We also need to clear out its cmake stuff in case it found a wrong Bison and cached it. 
$(LIB_DIR)/libraptor2.a: $(RAPTOR_DIR)/src/* $(wildcard $(RAPTOR_DIR)/build/*) which bison - +. ./source_me.sh && cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" cmake .. && rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) + +cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" cmake .. && rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) +touch $(LIB_DIR)/libraptor2.a # We need rapper from Raptor for the tests @@ -782,7 +781,7 @@ $(INC_DIR)/raptor2/raptor2.h: $(LIB_DIR)/libraptor2.a $(RAPTOR_DIR)/build/* +touch $(INC_DIR)/raptor2/raptor2.h $(LIB_DIR)/libstructures.a: $(STRUCTURES_DIR)/src/include/structures/*.hpp $(STRUCTURES_DIR)/src/*.cpp $(STRUCTURES_DIR)/Makefile - +. ./source_me.sh && cd $(STRUCTURES_DIR) && $(MAKE) clean && $(MAKE) lib/libstructures.a $(FILTER) && cp lib/libstructures.a $(CWD)/$(LIB_DIR)/ && cp -r src/include/structures $(CWD)/$(INC_DIR)/ + +cd $(STRUCTURES_DIR) && $(MAKE) clean && $(MAKE) lib/libstructures.a $(FILTER) && cp lib/libstructures.a $(CWD)/$(LIB_DIR)/ && cp -r src/include/structures $(CWD)/$(INC_DIR)/ $(INC_DIR)/sha1.hpp: $(SHA1_DIR)/sha1.hpp +cp $(SHA1_DIR)/*.h* $(CWD)/$(INC_DIR)/ @@ -812,15 +811,15 @@ $(LIB_DIR)/libdwfl.a: $(LIB_DIR)/libelf.a # running on. 
$(LIB_DIR)/libelf.a: $(ELFUTILS_DIR)/libebl/*.c $(ELFUTILS_DIR)/libebl/*.h $(ELFUTILS_DIR)/libdw/*.c $(ELFUTILS_DIR)/libdw/*.h $(ELFUTILS_DIR)/libelf/*.c $(ELFUTILS_DIR)/libelf/*.h $(ELFUTILS_DIR)/src/*.c $(ELFUTILS_DIR)/src/*.h $(LIB_DIR)/cleaned_old_elfutils +cd $(CWD)/$(INC_DIR)/ && rm -Rf elfutils gelf.h libelf.h dwarf.h libdwflP.h libdwfl.h libebl.h libelf.h - +. ./source_me.sh && cd $(ELFUTILS_DIR) && autoreconf -i -f && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" ./configure --enable-maintainer-mode --disable-libdebuginfod --disable-debuginfod --prefix=$(CWD) $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libelf.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libebl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libebl.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libdwfl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwfl.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libdwelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwelf.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/lib && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libeu.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/libcpu && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libcpu.a $(FILTER) - +. ./source_me.sh && cd $(ELFUTILS_DIR)/backends && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libebl_backends.a $(FILTER) - +. 
./source_me.sh && cd $(ELFUTILS_DIR)/libdw && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libdw.a known-dwarf.h $(FILTER) + +cd $(ELFUTILS_DIR) && autoreconf -i -f && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" ./configure --enable-maintainer-mode --disable-libdebuginfod --disable-debuginfod --prefix=$(CWD) $(FILTER) + +cd $(ELFUTILS_DIR)/libelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libelf.a $(FILTER) + +cd $(ELFUTILS_DIR)/libebl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libebl.a $(FILTER) + +cd $(ELFUTILS_DIR)/libdwfl && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwfl.a $(FILTER) + +cd $(ELFUTILS_DIR)/libdwelf && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libdwelf.a $(FILTER) + +cd $(ELFUTILS_DIR)/lib && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libeu.a $(FILTER) + +cd $(ELFUTILS_DIR)/libcpu && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) libcpu.a $(FILTER) + +cd $(ELFUTILS_DIR)/backends && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libebl_backends.a $(FILTER) + +cd $(ELFUTILS_DIR)/libdw && $(MAKE) clean CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" && $(MAKE) libdw.a known-dwarf.h $(FILTER) +cd $(ELFUTILS_DIR) && mkdir -p $(CWD)/$(INC_DIR)/elfutils && cp libdw/known-dwarf.h libdw/libdw.h libebl/libebl.h libelf/elf-knowledge.h version.h libdwfl/libdwfl.h libdwelf/libdwelf.h $(CWD)/$(INC_DIR)/elfutils && cp libelf/gelf.h libelf/libelf.h libdw/dwarf.h $(CWD)/$(INC_DIR) && cp libebl/libebl.a libdw/libdw.a libdwfl/libdwfl.a libdwelf/libdwelf.a libelf/libelf.a $(CWD)/$(LIB_DIR)/ $(OBJ_DIR)/sha1.o: $(SHA1_DIR)/sha1.cpp $(SHA1_DIR)/sha1.hpp @@ -829,30 +828,30 @@ $(SHARED_OBJ_DIR)/sha1.o: $(SHA1_DIR)/sha1.cpp $(SHA1_DIR)/sha1.hpp +$(CXX) $(INCLUDE_FLAGS) 
$(CXXFLAGS) $(CPPFLAGS) -fPIC -c -o $@ $< $(FILTER) $(LIB_DIR)/libfml.a: $(FERMI_DIR)/*.h $(FERMI_DIR)/*.c - . ./source_me.sh && cd $(FERMI_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp *.h $(CWD)/$(INC_DIR)/ && cp libfml.a $(CWD)/$(LIB_DIR)/ + cd $(FERMI_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp *.h $(CWD)/$(INC_DIR)/ && cp libfml.a $(CWD)/$(LIB_DIR)/ # We don't need to hack the build to point at our htslib because sublinearLS gets its htslib from the include flags we set # But we do need to hack out the return type error to work around https://github.com/yoheirosen/sublinear-Li-Stephens/issues/6 # TODO: This probably means actually calling some things in the library is unsafe! $(LIB_DIR)/libsublinearLS.a: $(LINLS_DIR)/src/*.cpp $(LINLS_DIR)/src/*.hpp $(LIB_DIR)/libhts.a - . ./source_me.sh && cd $(LINLS_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(filter-out -Werror=return-type,$(CFLAGS))" CXXFLAGS="-fPIC $(filter-out -Werror=return-type,$(CXXFLAGS))" INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/ + cd $(LINLS_DIR) && $(MAKE) clean && CFLAGS="-fPIC $(filter-out -Werror=return-type,$(CFLAGS))" CXXFLAGS="-fPIC $(filter-out -Werror=return-type,$(CXXFLAGS))" INCLUDE_FLAGS="-I$(CWD)/$(INC_DIR)" $(MAKE) libs $(FILTER) && cp lib/libsublinearLS.a $(CWD)/$(LIB_DIR)/ && mkdir -p $(CWD)/$(INC_DIR)/sublinearLS && cp src/*.hpp $(CWD)/$(INC_DIR)/sublinearLS/ $(LIB_DIR)/libbdsg.a: $(INC_DIR)/BooPHF.h $(LIBBDSG_DIR)/Makefile $(LIBBDSG_DIR)/bdsg/src/*.cpp $(LIBBDSG_DIR)/bdsg/include/bdsg/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/internal/*.hpp $(LIBBDSG_DIR)/bdsg/include/bdsg/overlays/*.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/sparsepp/spp.h 
$(INC_DIR)/dynamic/dynamic.hpp $(INC_DIR)/mio/mmap.hpp - +. ./source_me.sh && rm -Rf $(CWD)/$(INC_DIR)/bdsg && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) CXXFLAGS="$(INCLUDE_FLAGS) -fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && cp -r bdsg/include/* $(CWD)/$(INC_DIR) + +rm -Rf $(CWD)/$(INC_DIR)/bdsg && cd $(LIBBDSG_DIR) && $(MAKE) clean && CPLUS_INCLUDE_PATH=$(CWD)/$(INC_DIR):$(CWD)/$(INC_DIR)/dynamic:$(CPLUS_INCLUDE_PATH) CXXFLAGS="$(INCLUDE_FLAGS) -fPIC $(CXXFLAGS)" $(MAKE) $(FILTER) && cp lib/libbdsg.a $(CWD)/$(LIB_DIR) && cp -r bdsg/include/* $(CWD)/$(INC_DIR) $(INC_DIR)/mio/mmap.hpp: $(MIO_DIR)/include/mio/* - +. ./source_me.sh && cp -r $(MIO_DIR)/include/mio $(CWD)/$(INC_DIR)/ + +cp -r $(MIO_DIR)/include/mio $(CWD)/$(INC_DIR)/ # It would be better to copy the atomic_queue directory rather than its contents, but to avoid re-writing mmmultimap... $(INC_DIR)/atomic_queue.h: $(ATOMIC_QUEUE_DIR)/include/* - +. ./source_me.sh && cp -r $(ATOMIC_QUEUE_DIR)/include/atomic_queue/* $(CWD)/$(INC_DIR)/ + +cp -r $(ATOMIC_QUEUE_DIR)/include/atomic_queue/* $(CWD)/$(INC_DIR)/ $(INC_DIR)/mmmultiset.hpp: $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/mmmultimap.hpp: $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h - +. ./source_me.sh && cp $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(CWD)/$(INC_DIR)/ + +cp $(MMMULTIMAP_DIR)/src/mmmultimap.hpp $(MMMULTIMAP_DIR)/src/mmmultiset.hpp $(CWD)/$(INC_DIR)/ $(INC_DIR)/ips4o.hpp: $(IPS4O_DIR)/ips4o.hpp $(IPS4O_DIR)/ips4o/* - +. ./source_me.sh && cp -r $(IPS4O_DIR)/ips4o* $(CWD)/$(INC_DIR)/ + +cp -r $(IPS4O_DIR)/ips4o* $(CWD)/$(INC_DIR)/ # The xg repo has a cmake build system based all around external projects, and # we need it to use our installed versions of everything instead. 
@@ -860,7 +859,7 @@ $(INC_DIR)/ips4o.hpp: $(IPS4O_DIR)/ips4o.hpp $(IPS4O_DIR)/ips4o/* $(LIB_DIR)/libxg.a: $(XG_DIR)/src/*.hpp $(XG_DIR)/src/*.cpp $(INC_DIR)/mmmultimap.hpp $(INC_DIR)/ips4o.hpp $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/libsdsl.a $(LIB_DIR)/libdivsufsort.a $(LIB_DIR)/libdivsufsort64.a $(INC_DIR)/mio/mmap.hpp $(INC_DIR)/atomic_queue.h +rm -f $@ +cp -r $(XG_DIR)/src/*.hpp $(CWD)/$(INC_DIR) - +. ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -DNO_GFAKLUGE -c -o $(XG_DIR)/xg.o $(XG_DIR)/src/xg.cpp $(FILTER) + +$(CXX) $(INCLUDE_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -fPIC -DNO_GFAKLUGE -c -o $(XG_DIR)/xg.o $(XG_DIR)/src/xg.cpp $(FILTER) +ar rs $@ $(XG_DIR)/xg.o # Auto-git-versioning @@ -922,42 +921,42 @@ $(OBJ_DIR)/version.o: $(SRC_DIR)/version.cpp $(SRC_DIR)/version.hpp $(SRC_DIR)/v # Use static pattern rules so the dependency files will not be ignored if the output exists # See $(OBJ) $(OBJ_DIR)/main.o: $(OBJ_DIR)/%.o : $(SRC_DIR)/%.cpp $(OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(SHARED_OBJ): $(SHARED_OBJ_DIR)/%.o : $(SRC_DIR)/%.cpp $(SHARED_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) @touch $@ $(ALGORITHMS_OBJ): $(ALGORITHMS_OBJ_DIR)/%.o : $(ALGORITHMS_SRC_DIR)/%.cpp $(ALGORITHMS_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(ALGORITHMS_SHARED_OBJ): $(ALGORITHMS_SHARED_OBJ_DIR)/%.o : $(ALGORITHMS_SRC_DIR)/%.cpp $(ALGORITHMS_SHARED_OBJ_DIR)/%.d $(DEPS) - . 
./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) @touch $@ $(IO_OBJ): $(IO_OBJ_DIR)/%.o : $(IO_SRC_DIR)/%.cpp $(IO_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(IO_SHARED_OBJ): $(IO_SHARED_OBJ_DIR)/%.o : $(IO_SRC_DIR)/%.cpp $(IO_SHARED_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -fPIC -c -o $@ $< $(FILTER) @touch $@ $(SUBCOMMAND_OBJ): $(SUBCOMMAND_OBJ_DIR)/%.o : $(SUBCOMMAND_SRC_DIR)/%.cpp $(SUBCOMMAND_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(UNITTEST_OBJ): $(UNITTEST_OBJ_DIR)/%.o : $(UNITTEST_SRC_DIR)/%.cpp $(UNITTEST_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(UNITTEST_SUPPORT_OBJ): $(UNITTEST_SUPPORT_OBJ_DIR)/%.o : $(UNITTEST_SUPPORT_SRC_DIR)/%.cpp $(UNITTEST_SUPPORT_OBJ_DIR)/%.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ # Config objects get individual rules $(CONFIG_OBJ_DIR)/allocator_config_jemalloc.o: $(CONFIG_SRC_DIR)/allocator_config_jemalloc.cpp $(CONFIG_OBJ_DIR)/allocator_config_jemalloc.d $(DEPS) $(LIB_DIR)/libjemalloc.a - . 
./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(CONFIG_OBJ_DIR)/allocator_config_jemalloc_debug.o: $(CONFIG_SRC_DIR)/allocator_config_jemalloc_debug.cpp $(CONFIG_OBJ_DIR)/allocator_config_jemalloc_debug.d $(DEPS) $(LIB_DIR)/libjemalloc_debug.a - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ $(CONFIG_OBJ_DIR)/allocator_config_system.o: $(CONFIG_SRC_DIR)/allocator_config_system.cpp $(CONFIG_OBJ_DIR)/allocator_config_system.d $(DEPS) - . ./source_me.sh && $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) + $(CXX) $(INCLUDE_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(DEPGEN_FLAGS) -c -o $@ $< $(FILTER) @touch $@ # Use a fake rule to build .d files, so we don't complain if they don't exist. diff --git a/README.md b/README.md index 200ecff311c..b80954cadff 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ Note that a 64-bit OS is required. Ubuntu 20.04 should work. #### Linux: Build -When you are ready, build with `. ./source_me.sh && make`. You can use `make -j16` to run 16 build threads at a time, which greatly accelerates the process. If you have more CPU cores, you can use higher numbers. +When you are ready, build with `make`. You can use `make -j16` to run 16 build threads at a time, which greatly accelerates the process. If you have more CPU cores, you can use higher numbers. Note that vg can take anywhere from 10 minutes to more than an hour to compile depending on your machine and the number of threads used. @@ -161,7 +161,7 @@ Homebrew provides another package management solution for OSX, and may be prefer With dependencies installed, VG can now be built: - . 
./source_me.sh && make + make As with Linux, you can add `-j16` or other numbers at the end to run multiple build tasks at once, if your computer can handle them. diff --git a/configure.py b/configure.py deleted file mode 100644 index 1290b07696d..00000000000 --- a/configure.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import sys -import shutil - - -def make_source_me(): - var_dict = { - "LIBRARY_PATH":"lib", - "LD_LIBRARY_PATH":"lib", - "LD_INCLUDE_PATH":"include", - "C_INCLUDE_PATH":"include", - "CPLUS_INCLUDE_PATH":"include", - "INCLUDE_PATH":"include", - "PATH":"bin" } - - my_wd = os.getcwd() - out_file = "source_me.sh" - - with open(out_file, "w") as ofi: - for i in var_dict: - o_line = "export" + " " + i + "=" + my_wd + "/" + var_dict[i] + ":$" + i + "\n" - ofi.write(o_line) - -def set_compiler(): - compiler_dict = { - "CC": "gcc", - "CXX": "g++" - } - - out_file = "source_me.sh" - with open(out_file, "a") as ofi: - for i in compiler_dict: - o_line = "export" + " " + i + "=" + "$(which " + compiler_dict[i] + ")\n" - ofi.write(o_line) - -def check_deps(): - return - -## pulled these out with this line: for i in `cat source_me.sh | cut -f 2 -d " " | cut -f 1 -d "="`; do echo "\"\$$i\":\"\","; done -if __name__ == "__main__": - make_source_me() - set_compiler() -## "$CC":"", - #"$CXX":"", diff --git a/deps/libbdsg b/deps/libbdsg index 88181dbc808..e98cda26c25 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 88181dbc80835e93de681e84e245c891e250f77c +Subproject commit e98cda26c2522bee80292705494dca8dcffa1c5c diff --git a/scripts/setup-server b/scripts/setup-server index e3965b61ca9..794f12d1917 100644 --- a/scripts/setup-server +++ b/scripts/setup-server @@ -36,7 +36,7 @@ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 100 --slave git clone --recursive https://github.com/vgteam/vg.git # build vg -cd vg && source ./source_me.sh && make -j 32 static && sudo cp bin/vg /usr/local/bin/ +cd vg && make -j 32 static && sudo 
cp bin/vg /usr/local/bin/ sudo cp scripts/vg_sim_pos_compare.py /usr/local/bin/ cd ~ diff --git a/source_me.sh b/source_me.sh deleted file mode 100755 index 0bb948c9280..00000000000 --- a/source_me.sh +++ /dev/null @@ -1,23 +0,0 @@ -export LIBRARY_PATH=`pwd`/lib:$LIBRARY_PATH -export LD_LIBRARY_PATH=`pwd`/lib:$LD_LIBRARY_PATH -export DYLD_LIBRARY_PATH=`pwd`/lib:$DYLD_LIBRARY_PATH -export LD_INCLUDE_PATH=`pwd`/include:$LD_INCLUDE_PATH -# Setting include directories via C_INCLUDE_PATH/CPLUS_INCLUDE_PATH will -# automatically get them demoted to the end of the search list even if a -I -# option is passed to try and bump them up earlier, before other -I options. -# We leave the Makefile in charge of finding all the include directories. -export CFLAGS="-I $(pwd)/include ${CFLAGS}" -export CXXFLAGS="-I $(pwd)/include -I$(pwd)/include/dynamic ${CXXFLAGS}" -export PATH=`pwd`/bin:`pwd`/scripts:"$PATH" -export CC=$(which gcc) -export CXX=$(which g++) - -# -# disable until file arguments work as in normal bash :( -# -# add bash autocompletion -#if test -n "$BASH_VERSION" -#then -# -# . ./autocomp.bash -#fi diff --git a/test/Makefile b/test/Makefile index 5ae913873f1..bdf0526ace7 100644 --- a/test/Makefile +++ b/test/Makefile @@ -20,7 +20,7 @@ $(vg): cd .. && $(MAKE) bin/vg build_graph: build_graph.cpp - cd .. && . ./source_me.sh && $(MAKE) test/build_graph + cd .. && $(MAKE) test/build_graph clean: rm -f build_graph diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 4526dac20aa..9a5e53b08f3 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -252,7 +252,6 @@ then if [ "${LOCAL_BUILD}" == "1" ] then # Just build vg here - . ./source_me.sh make -j ${NUM_CORES} if [ "$?" 
-ne 0 ] From dd7157a7d2da845b4e2af991d91bcf8c40682e50 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 11:59:09 -0400 Subject: [PATCH 28/54] Avoid breaking any commands that still want to source source_me.sh --- source_me.sh | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 source_me.sh diff --git a/source_me.sh b/source_me.sh new file mode 100644 index 00000000000..d650cc14afe --- /dev/null +++ b/source_me.sh @@ -0,0 +1,4 @@ +# We used to have a script here to set up all the include and library search +# paths for the vg build. But now the Makefile knows how to do it all for the +# build, and the vg binary knows where to look for its dynamic libraries. +echo 1>&2 "Sourcing source_me.sh is no longer necessary" From 570844ceb311078160dc9d0e810e5a2bfed1376c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 12:46:59 -0400 Subject: [PATCH 29/54] Pass CC and CXX through to SDSL build --- Makefile | 5 ++--- deps/sdsl-lite | 2 +- doc/wiki | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index ff91be32fa5..0e3afd9e14e 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,6 @@ LIB_DIR:=lib # INC_DIR must be a relative path INC_DIR:=include CWD:=$(shell pwd) -CXX ?= g++ PKG_CONFIG ?= pkg-config SFX := @@ -570,9 +569,9 @@ $(LIB_DIR)/libjemalloc_debug.a: $(JEMALLOC_DIR)/src/*.c # See https://stackoverflow.com/a/19822767 $(LIB_DIR)/%sdsl.a $(LIB_DIR)/%divsufsort.a $(LIB_DIR)/%divsufsort64.a : $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp ifeq ($(shell uname -s),Darwin) - +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" CC=$(CC) CXX=$(CXX) ./install.sh $(CWD) $(FILTER) else - +cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && 
BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" CC=$(CC) CXX=$(CXX) ./install.sh $(CWD) $(FILTER) endif $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.cpp $(SSW_DIR)/*.h diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 863d0118cf3..86fa3534c1b 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 863d0118cf303f9c9c55576e0f6b2f70ecd9689a +Subproject commit 86fa3534c1bf02b5468bbe58e2e0b8f6ae2d6fa4 diff --git a/doc/wiki b/doc/wiki index f70ea363837..f28a1e56005 160000 --- a/doc/wiki +++ b/doc/wiki @@ -1 +1 @@ -Subproject commit f70ea36383784fc731454c1b09014f65ad4f74d5 +Subproject commit f28a1e56005c729cf5c2dad6a251447bedba2949 From 66b81ed8202d1db767e6513935bed1cb5ab0a3c0 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 12:52:12 -0400 Subject: [PATCH 30/54] Propagate CC and CXX to all build commands --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 0e3afd9e14e..047e845be3f 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,9 @@ PKG_CONFIG ?= pkg-config SFX := EXE:=vg$(SFX) +# Expose compiler we want to use to all build commands as an environment variable. 
+export CC CXX + all: $(BIN_DIR)/$(EXE) # Magic dependencies (see ) @@ -569,9 +572,9 @@ $(LIB_DIR)/libjemalloc_debug.a: $(JEMALLOC_DIR)/src/*.c # See https://stackoverflow.com/a/19822767 $(LIB_DIR)/%sdsl.a $(LIB_DIR)/%divsufsort.a $(LIB_DIR)/%divsufsort64.a : $(SDSL_DIR)/lib/*.cpp $(SDSL_DIR)/include/sdsl/*.hpp ifeq ($(shell uname -s),Darwin) - +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" CC=$(CC) CXX=$(CXX) ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && AS_INTEGRATED_ASSEMBLER=1 BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) else - +cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" CC=$(CC) CXX=$(CXX) ./install.sh $(CWD) $(FILTER) + +cd $(SDSL_DIR) && BUILD_PORTABLE=1 CXXFLAGS="-fPIC $(CPPFLAGS) $(CXXFLAGS)" ./install.sh $(CWD) $(FILTER) endif $(LIB_DIR)/libssw.a: $(SSW_DIR)/*.c $(SSW_DIR)/*.cpp $(SSW_DIR)/*.h From c82ae3fbb1af426725ef43c019cd8d9ef64c9f70 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 12:55:19 -0400 Subject: [PATCH 31/54] Pass CXX and CC in CMake compiler format to directly invoked CMakes --- Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 047e845be3f..6b939290cc3 100644 --- a/Makefile +++ b/Makefile @@ -616,9 +616,9 @@ $(INC_DIR)/kff_io.hpp: $(LIB_DIR)/libkff.a $(LIB_DIR)/libkff.a: $(KFF_DIR)/kff_io.cpp $(KFF_DIR)/kff_io.hpp.in ifeq ($(shell uname -s),Darwin) - +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. 
&& AS_INTEGRATED_ASSEMBLER=1 $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) else - +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) + +cd $(KFF_DIR) && rm -Rf build && mkdir build && cd build && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_FLAGS="-fPIC -Wall -Ofast -g $(CXXFLAGS)" .. && $(MAKE) $(FILTER) && cp kff_io.hpp $(CWD)/$(INC_DIR) && mv libkff.a $(CWD)/$(LIB_DIR) endif $(INC_DIR)/BooPHF.h: $(BBHASH_DIR)/BooPHF.h @@ -664,10 +664,10 @@ $(LIB_DIR)/cleaned_old_elfutils: $(LIB_DIR)/libvgio.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libhandlegraph.a $(LIB_DIR)/pkgconfig/htslib.pc $(LIB_DIR)/cleaned_old_protobuf_v003 $(LIBVGIO_DIR)/CMakeLists.txt $(LIBVGIO_DIR)/src/*.cpp $(LIBVGIO_DIR)/include/vg/io/*.hpp $(LIBVGIO_DIR)/deps/vg.proto +rm -f $(CWD)/$(INC_DIR)/vg.pb.h $(CWD)/$(INC_DIR)/vg/vg.pb.h +rm -Rf $(CWD)/$(INC_DIR)/vg/io/ - +export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. 
$(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install + +export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && export LDFLAGS="$(LD_LIB_DIR_FLAGS) $(LDFLAGS)" && cd $(LIBVGIO_DIR) && rm -Rf CMakeCache.txt CMakeFiles *.cmake install_manifest.txt *.pb.cc *.pb.h *.a && rm -rf build-vg && mkdir build-vg && cd build-vg && PKG_CONFIG_PATH=$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH) cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_CXX_STANDARD=$(CXX_STANDARD) -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. $(FILTER) && $(MAKE) clean && VERBOSE=1 $(MAKE) $(FILTER) && $(MAKE) install $(LIB_DIR)/libhandlegraph.a: $(LIBHANDLEGRAPH_DIR)/src/include/handlegraph/*.hpp $(LIBHANDLEGRAPH_DIR)/src/*.cpp - +cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. && $(MAKE) $(FILTER) && $(MAKE) install + +cd $(LIBHANDLEGRAPH_DIR) && rm -Rf build CMakeCache.txt CMakeFiles && mkdir build && cd build && CXXFLAGS="$(CXXFLAGS) $(CPPFLAGS)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_INSTALL_LIBDIR=lib .. && $(MAKE) $(FILTER) && $(MAKE) install # On Linux, libdeflate builds a .so. @@ -718,7 +718,7 @@ $(LIB_DIR)/libtabixpp.a: $(LIB_DIR)/libhts.a $(TABIXPP_DIR)/*.cpp $(TABIXPP_DIR) # We need to use /usr first for CMake search or Ubuntu 22.04 will decide pybind11 is installed in / when actually it is only fully installed in /usr. 
$(LIB_DIR)/libvcflib.a: $(LIB_DIR)/libhts.a $(LIB_DIR)/libtabixpp.a $(VCFLIB_DIR)/src/*.cpp $(VCFLIB_DIR)/src/*.hpp $(VCFLIB_DIR)/contrib/*/*.cpp $(VCFLIB_DIR)/contrib/*/*.h +rm -f $(VCFLIB_DIR)/contrib/WFA2-lib/VERSION - +cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build . + +cd $(VCFLIB_DIR) && rm -Rf build && mkdir build && cd build && PKG_CONFIG_PATH="$(CWD)/$(LIB_DIR)/pkgconfig:$(PKG_CONFIG_PATH)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DZIG=OFF -DCMAKE_C_FLAGS="$(CFLAGS)" -DCMAKE_CXX_FLAGS="$(CXXFLAGS) ${CPPFLAGS}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=$(CWD) -DCMAKE_PREFIX_PATH="/usr;$(OMP_PREFIXES)" .. && cmake --build . +cp $(VCFLIB_DIR)/contrib/filevercmp/*.h* $(INC_DIR) +cp $(VCFLIB_DIR)/contrib/fastahack/*.h* $(INC_DIR) +cp $(VCFLIB_DIR)/contrib/smithwaterman/*.h* $(INC_DIR) @@ -744,7 +744,7 @@ $(INC_DIR)/lru_cache.h: $(DEP_DIR)/lru_cache/*.h $(DEP_DIR)/lru_cache/*.cc $(INC_DIR)/dynamic/dynamic.hpp: $(DYNAMIC_DIR)/include/dynamic/*.hpp $(DYNAMIC_DIR)/include/dynamic/*/*.hpp +rm -Rf $(INC_DIR)/dynamic.hpp $(INC_DIR)/dynamic # annoyingly doesn't have an install option on the cmake, so we manually move their external dependency headers - +cd $(CWD)/$(DYNAMIC_DIR) && rm -Rf build && mkdir -p build && cd build && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && cmake -DCMAKE_VERBOSE_MAKEFILE=ON .. 
&& make && cp -r $(CWD)/$(DYNAMIC_DIR)/deps/hopscotch_map/include/* $(CWD)/$(INC_DIR)/ + +cd $(CWD)/$(DYNAMIC_DIR) && rm -Rf build && mkdir -p build && cd build && export CXXFLAGS="$(CPPFLAGS) $(CXXFLAGS)" && cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" -DCMAKE_VERBOSE_MAKEFILE=ON .. && make && cp -r $(CWD)/$(DYNAMIC_DIR)/deps/hopscotch_map/include/* $(CWD)/$(INC_DIR)/ # Do the copy of the main file last so we can tell if this recipe failed and redo it. # Otherwise we get dynamic.hpp without its deps +mkdir -p $(INC_DIR)/dynamic && cp -r $(CWD)/$(DYNAMIC_DIR)/include/dynamic/* $(INC_DIR)/dynamic/ @@ -769,7 +769,7 @@ $(LIB_DIR)/libpinchesandcacti.a: $(LIB_DIR)/libsonlib.a $(CWD)/$(DEP_DIR)/pinche # We also need to clear out its cmake stuff in case it found a wrong Bison and cached it. $(LIB_DIR)/libraptor2.a: $(RAPTOR_DIR)/src/* $(wildcard $(RAPTOR_DIR)/build/*) which bison - +cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" cmake .. && rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) + +cd $(RAPTOR_DIR)/build && rm -Rf CMakeCache.txt CMakeFiles CTestTestfile.cmake Makefile cmake_install.cmake src tests utils && CFLAGS="-fPIC $(CFLAGS)" CXXFLAGS="-fPIC $(CXXFLAGS)" cmake -DCMAKE_C_COMPILER="$(CC)" -DCMAKE_CXX_COMPILER="$(CXX)" .. 
&& rm -f src/turtle_parser.c && rm -f src/turtle_lexer.c && make turtle_lexer_tgt && make -f src/CMakeFiles/raptor2.dir/build.make src/turtle_lexer.c && sed -i.bak '/yycleanup/d' src/turtle_lexer.c && $(MAKE) $(FILTER) && cp src/libraptor2.a $(CWD)/$(LIB_DIR) +touch $(LIB_DIR)/libraptor2.a # We need rapper from Raptor for the tests From 0c75c1f4e86330e355627fe9bb4c7d0639fdf23b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 14:01:21 -0400 Subject: [PATCH 32/54] Drop removed files from CI build script --- .gitlab-ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 65d745c2e2b..62ac9ac0517 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -86,8 +86,6 @@ local-build-test-job: script: - THREADS=8 - nvm version - - python3 ./configure.py - - source ./source_me.sh - make get-deps - make -j${THREADS} - echo Testing From 89beb2f6a7557e17221da457b0d9ab0d4f8c8296 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 28 Oct 2024 16:01:51 -0400 Subject: [PATCH 33/54] Expose vg to tube map for test --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 62ac9ac0517..3fbdc26907a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -94,6 +94,8 @@ local-build-test-job: - make test - make static -j${THREADS} # Also test as a backend for the tube map + # Tube map expects vg on PATH + - export PATH="$(pwd)/bin:${PATH}" - git clone https://github.com/vgteam/sequenceTubeMap.git - cd sequenceTubeMap # Tube map expects local IPv6 but Kubernetes won't let us have it From 2e16f837fbb8bdd2741dfbc5b841fe829a0502f6 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Mon, 28 Oct 2024 13:50:45 -0700 Subject: [PATCH 34/54] add option to output haplotypes afte vg rna --- src/subcommand/rna_main.cpp | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/subcommand/rna_main.cpp b/src/subcommand/rna_main.cpp index 0e59220fae4..515c2112e16 
100644 --- a/src/subcommand/rna_main.cpp +++ b/src/subcommand/rna_main.cpp @@ -54,6 +54,7 @@ void help_rna(char** argv) { << "\nOutput options:" << endl << " -b, --write-gbwt FILE write pantranscriptome transcript paths as GBWT index file" << endl + << " -v, --write-hap-gbwt FILE write input haplotypes as a GBWT with node IDs matching the output graph" << endl << " -f, --write-fasta FILE write pantranscriptome transcript sequences as fasta file" << endl << " -i, --write-info FILE write pantranscriptome transcript info table as tsv file" << endl << " -q, --out-exclude-ref exclude reference transcripts from pantranscriptome output" << endl @@ -88,6 +89,7 @@ int32_t main_rna(int32_t argc, char** argv) { bool gbwt_add_bidirectional = false; string fasta_out_filename = ""; string info_out_filename = ""; + string hap_gbwt_out_filename = ""; int32_t num_threads = 1; bool show_progress = false; @@ -110,8 +112,9 @@ int32_t main_rna(int32_t argc, char** argv) { {"remove-non-gene", no_argument, 0, 'd'}, {"do-not-sort", no_argument, 0, 'o'}, {"add-ref-paths", no_argument, 0, 'r'}, - {"add-hap-paths", no_argument, 0, 'a'}, + {"add-hap-paths", no_argument, 0, 'a'}, {"write-gbwt", required_argument, 0, 'b'}, + {"write-hap-gbwt", required_argument, 0, 'v'}, {"write-fasta", required_argument, 0, 'f'}, {"write-info", required_argument, 0, 'i'}, {"out-ref-paths", no_argument, 0, 'u'}, @@ -124,7 +127,7 @@ int32_t main_rna(int32_t argc, char** argv) { }; int32_t option_index = 0; - c = getopt_long(argc, argv, "n:m:y:s:l:zjec:k:dorab:f:i:uqgt:ph?", long_options, &option_index); + c = getopt_long(argc, argv, "n:m:y:s:l:zjec:k:dorab:v:f:i:uqgt:ph?", long_options, &option_index); /* Detect the end of the options. 
*/ if (c == -1) @@ -188,10 +191,14 @@ int32_t main_rna(int32_t argc, char** argv) { case 'a': add_projected_transcript_paths = true; break; - + case 'b': gbwt_out_filename = optarg; break; + + case 'v': + hap_gbwt_out_filename = optarg; + break; case 'f': fasta_out_filename = optarg; @@ -486,6 +493,19 @@ int32_t main_rna(int32_t argc, char** argv) { gbwt_builder.finish(); save_gbwt(gbwt_builder.index, gbwt_out_filename); } + + // Write a haplotype GBWT with node IDs updated to match the spliced graph. + if (!hap_gbwt_out_filename.empty()) { + if (!haplotype_index.get()) { + cerr << "[vg rna] Warning: not saving updated haplotypes to " << hap_gbwt_out_filename << " because haplotypes were not provided as input" << endl; + } + else { + ofstream hap_gbwt_ostream; + hap_gbwt_ostream.open(hap_gbwt_out_filename); + + haplotype_index->serialize(hap_gbwt_ostream); + } + } // Write transcript sequences in transcriptome as fasta file. if (!fasta_out_filename.empty()) { @@ -496,7 +516,7 @@ int32_t main_rna(int32_t argc, char** argv) { transcriptome.write_transcript_sequences(&fasta_ostream, exclude_reference_transcripts); fasta_ostream.close(); - } + } // Write transcript info in transcriptome as tsv file. 
if (!info_out_filename.empty()) { From 4e6c34d0b290b3fd5d8849fb297f7b96bc1fa9a8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 30 Oct 2024 10:35:55 -0400 Subject: [PATCH 35/54] Use sublinear-Li-Stephens that can use CXX --- deps/sublinear-Li-Stephens | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/sublinear-Li-Stephens b/deps/sublinear-Li-Stephens index 241001025d7..ef54e081fae 160000 --- a/deps/sublinear-Li-Stephens +++ b/deps/sublinear-Li-Stephens @@ -1 +1 @@ -Subproject commit 241001025d7cf516c0c9577f94e858e76560269f +Subproject commit ef54e081faebb3fef7ff54cf5d4f8068836951f7 From a57b473efb79f6986b03eb5d1db394df38309c3f Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 20:59:09 -0700 Subject: [PATCH 36/54] Start getting rid of the old path/thread terminology --- src/subcommand/gbwt_main.cpp | 75 ++++++++++++++++++------------------ test/t/37_vg_gbwt.t | 62 ++++++++++++++--------------- 2 files changed, 69 insertions(+), 68 deletions(-) diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index b22f0c0ce8e..af88d10f2e9 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -37,7 +37,7 @@ struct GBWTConfig { build_mode build = build_none; merge_mode merge = merge_none; path_cover_mode path_cover = path_cover_none; - bool metadata_mode = false, thread_mode = false; + bool metadata_mode = false, path_mode = false; // Input GBWT construction. HaplotypeIndexer haplotype_indexer; @@ -55,8 +55,8 @@ struct GBWTConfig { // Other parameters and flags. 
bool show_progress = false; - bool count_threads = false; - bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, thread_names = false, tags = false; + bool count_paths = false; + bool metadata = false, contigs = false, haplotypes = false, samples = false, list_names = false, path_names = false, tags = false; bool include_named_paths = false; size_t num_paths = default_num_paths(), context_length = default_context_length(); bool num_paths_set = false; @@ -70,7 +70,7 @@ struct GBWTConfig { // File names. std::string gbwt_output; // Output GBWT. - std::string thread_output; // Threads in SDSL format. + std::string path_output; // Paths in SDSL format. std::string graph_output; // Output GBWTGraph. std::string segment_translation; // Segment to node translation output. std::string r_index_name; // Output r-index. @@ -163,7 +163,7 @@ void step_4_path_cover(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& con void step_5_gbwtgraph(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& config); void step_6_r_index(GBWTHandler& gbwts, GBWTConfig& config); void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config); -void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config); +void step_8_paths(GBWTHandler& gbwts, GBWTConfig& config); void report_time_memory(const std::string& what, double start_time, const GBWTConfig& config); void print_metadata(std::ostream& out, const GBWTHandler& gbwts); @@ -233,9 +233,9 @@ int main_gbwt(int argc, char** argv) { step_7_metadata(gbwts, config); } - // Thread options. - if (config.thread_mode) { - step_8_threads(gbwts, config); + // Path options. 
+ if (config.path_mode) { + step_8_paths(gbwts, config); } return 0; @@ -260,7 +260,7 @@ void help_gbwt(char** argv) { std::cerr << " --id-interval N store path ids at one out of N positions (default " << gbwt::DynamicGBWT::SAMPLE_INTERVAL << ")" << std::endl; std::cerr << std::endl; std::cerr << "Multithreading:" << std::endl; - std::cerr << " --num-jobs N use at most N parallel build jobs (for -v, -G, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; + std::cerr << " --num-jobs N use at most N parallel build jobs (for -v, -G, -A, -l, -P; default " << GBWTConfig::default_build_jobs() << ")" << std::endl; std::cerr << " --num-threads N use N parallel search threads (for -b and -r; default " << omp_get_max_threads() << ")" << std::endl; std::cerr << std::endl; std::cerr << "Step 1: GBWT construction (requires -o and one of { -v, -G, -Z, -E, A }):" << std::endl; @@ -330,12 +330,12 @@ void help_gbwt(char** argv) { std::cerr << " -H, --haplotypes print the number of haplotypes" << std::endl; std::cerr << " -S, --samples print the number of samples" << std::endl; std::cerr << " -L, --list-names list contig/sample names (use with -C or -S)" << std::endl; - std::cerr << " -T, --thread-names list thread names" << std::endl; + std::cerr << " -T, --thread-names list path names" << std::endl; // FIXME change to --path-names; make --thread-names a deprecated alias std::cerr << " --tags list GBWT tags" << std::endl; std::cerr << std::endl; - std::cerr << "Step 8: Threads (one input GBWT):" << std::endl; - std::cerr << " -c, --count-threads print the number of threads" << std::endl; - std::cerr << " -e, --extract FILE extract threads in SDSL format to FILE" << std::endl; + std::cerr << "Step 8: Paths (one input GBWT):" << std::endl; + std::cerr << " -c, --count-threads print the number of paths" << std::endl; // FIXME also here, change to --count-paths + std::cerr << " -e, --extract FILE extract paths in SDSL format to FILE" << std::endl; std::cerr << 
std::endl; } @@ -519,11 +519,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { { "haplotypes", no_argument, 0, 'H' }, { "samples", no_argument, 0, 'S' }, { "list-names", no_argument, 0, 'L' }, - { "thread-names", no_argument, 0, 'T' }, + { "thread-names", no_argument, 0, 'T' }, // FIXME { "tags", no_argument, 0, OPT_TAGS }, - // Threads - { "count-threads", no_argument, 0, 'c' }, + // Paths + { "count-threads", no_argument, 0, 'c' }, // FIXME { "extract", required_argument, 0, 'e' }, { "help", no_argument, 0, 'h' }, @@ -864,7 +864,7 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.metadata_mode = true; break; case 'T': - config.thread_names = true; + config.path_names = true; config.metadata_mode = true; break; case OPT_TAGS: @@ -872,14 +872,14 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.metadata_mode = true; break; - // Threads + // Paths case 'c': - config.count_threads = true; - config.thread_mode = true; + config.count_paths = true; + config.path_mode = true; break; case 'e': - config.thread_output = optarg; - config.thread_mode = true; + config.path_output = optarg; + config.path_mode = true; break; case 'h': @@ -1077,9 +1077,9 @@ void validate_gbwt_config(GBWTConfig& config) { } } - if (config.thread_mode) { + if (config.path_mode) { if (!one_input_gbwt) { - std::cerr << "error: [vg gbwt] thread operations require one input GBWT" << std::endl; + std::cerr << "error: [vg gbwt] path operations require one input GBWT" << std::endl; std::exit(EXIT_FAILURE); } } @@ -1373,6 +1373,7 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co if (config.show_progress) { std::cerr << "Input type: " << (config.gam_format ? "GAM" : "GAF") << std::endl; } + // FIXME: Parallelize this. std::unique_ptr temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? 
"GAM" : "GAF")); gbwts.use(*temp); } @@ -1458,7 +1459,7 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { gbwts.use_dynamic(); if (!(gbwts.dynamic.hasMetadata() && gbwts.dynamic.metadata.hasPathNames() && gbwts.dynamic.metadata.hasSampleNames())) { - std::cerr << "error: [vg gbwt] the index does not contain metadata with thread and sample names" << std::endl; + std::cerr << "error: [vg gbwt] the index does not contain metadata with path and sample names" << std::endl; std::exit(EXIT_FAILURE); } @@ -1471,11 +1472,11 @@ void remove_samples(GBWTHandler& gbwts, GBWTConfig& config) { } std::vector path_ids = gbwts.dynamic.metadata.removeSample(sample_id); if (path_ids.empty()) { - std::cerr << "warning: [vg gbwt] no threads associated with sample " << sample_name << std::endl; + std::cerr << "warning: [vg gbwt] no paths associated with sample " << sample_name << std::endl; continue; } if (config.show_progress) { - std::cerr << "Removing " << path_ids.size() << " threads for sample " << sample_name << std::endl; + std::cerr << "Removing " << path_ids.size() << " paths for sample " << sample_name << std::endl; } gbwts.dynamic.remove(path_ids); } @@ -1666,7 +1667,7 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { } } - if (config.thread_names) { + if (config.path_names) { auto& metadata = get_metadata(); if (metadata.hasPathNames()) { // Precompute some metadata @@ -1676,7 +1677,7 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { std::cout << gbwtgraph::compose_path_name(gbwts.compressed, i, sense) << std::endl; } } else { - std::cerr << "error: [vg gbwt] the metadata does not contain thread names" << std::endl; + std::cerr << "error: [vg gbwt] the metadata does not contain path names" << std::endl; } } @@ -1692,19 +1693,19 @@ void step_7_metadata(GBWTHandler& gbwts, GBWTConfig& config) { //---------------------------------------------------------------------------- -void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) 
{ - // Extract threads in SDSL format. - if (!config.thread_output.empty()) { +void step_8_paths(GBWTHandler& gbwts, GBWTConfig& config) { + // Extract paths in SDSL format. + if (!config.path_output.empty()) { double start = gbwt::readTimer(); if (config.show_progress) { - std::cerr << "Extracting threads to " << config.thread_output << std::endl; + std::cerr << "Extracting paths to " << config.path_output << std::endl; } gbwts.use_compressed(); if (config.show_progress) { std::cerr << "Starting the extraction" << std::endl; } gbwt::size_type node_width = gbwt::bit_length(gbwts.compressed.sigma() - 1); - gbwt::text_buffer_type out(config.thread_output, std::ios::out, gbwt::MEGABYTE, node_width); + gbwt::text_buffer_type out(config.path_output, std::ios::out, gbwt::MEGABYTE, node_width); for (gbwt::size_type id = 0; id < gbwts.compressed.sequences(); id += 2) { // Ignore reverse complements. gbwt::vector_type sequence = gbwts.compressed.extract(id); for (auto node : sequence) { @@ -1713,11 +1714,11 @@ void step_8_threads(GBWTHandler& gbwts, GBWTConfig& config) { out.push_back(gbwt::ENDMARKER); } out.close(); - report_time_memory("Threads extracted", start, config); + report_time_memory("Paths extracted", start, config); } - // There are two sequences for each thread. - if (config.count_threads) { + // There are two sequences for each path. + if (config.count_paths) { gbwts.use_compressed(); std::cout << (gbwts.compressed.sequences() / 2) << std::endl; } diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 6c11196fe7a..9fdc0c8a4fb 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -33,11 +33,11 @@ cmp x.gbwt parse_x.gbwt is $? 
0 "identical construction results with vg gbwt and from VCF parse" # Single chromosome: metadata for haplotypes -is $(vg gbwt -c x.gbwt) 2 "chromosome X: 2 threads" +is $(vg gbwt -c x.gbwt) 2 "chromosome X: 2 paths" is $(vg gbwt -C x.gbwt) 1 "chromosome X: 1 contig" is $(vg gbwt -H x.gbwt) 2 "chromosome X: 2 haplotypes" is $(vg gbwt -S x.gbwt) 1 "chromosome X: 1 sample" -is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 thread names" +is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 path names" is $(vg gbwt -C -L x.gbwt | wc -l) 1 "chromosome X: 1 contig name" is $(vg gbwt -S -L x.gbwt | wc -l) 1 "chromosome X: 1 sample name" @@ -52,7 +52,7 @@ vg index -G x2.ref.gbwt -T x.vg is $? 0 "chromosome X reference GBWT with vg index" cmp x.ref.gbwt x2.ref.gbwt is $? 0 "identical construction results with vg gbwt and vg index" -is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 thread" +is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 path" rm -f x.ref.gbwt x2.ref.gbwt @@ -115,7 +115,7 @@ vg gbwt -x xy-alt.xg -o xy.1000gp.gbwt --preset 1000gp -v small/xy2.vcf.gz is $? 0 "construction preset: 1000gp" # Multiple chromosomes: metadata for haplotypes -is $(vg gbwt -c xy.merge.gbwt) 4 "multiple chromosomes: 4 threads" +is $(vg gbwt -c xy.merge.gbwt) 4 "multiple chromosomes: 4 paths" is $(vg gbwt -C xy.merge.gbwt) 2 "multiple chromosomes: 2 contigs" is $(vg gbwt -H xy.merge.gbwt) 2 "multiple chromosomes: 2 haplotypes" is $(vg gbwt -S xy.merge.gbwt) 1 "multiple chromosomes: 1 sample" @@ -131,7 +131,7 @@ vg index -G xy2.contigs.gbwt -T xy.xg is $? 0 "paths as contigs with vg index" cmp xy.contigs.gbwt xy2.contigs.gbwt is $? 
0 "identical construction results with vg gbwt and vg index" -is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 threads" +is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 paths" is $(vg gbwt -C xy.contigs.gbwt) 2 "paths as contigs: 2 contigs" is $(vg gbwt -H xy.contigs.gbwt) 1 "paths as contigs: 1 haplotype" is $(vg gbwt -S xy.contigs.gbwt) 1 "paths as contigs: 1 sample" @@ -194,35 +194,35 @@ rm -f x.gbwt empty.gbwt x2.gbwt vg gbwt -x xy-alt.xg -o xy.gbwt -v small/xy2.vcf.gz vg gbwt -E -o xy.ref.gbwt -x xy.xg vg gbwt -m -o xy.both.gbwt xy.gbwt xy.ref.gbwt -is $(vg gbwt -c xy.both.gbwt) 6 "haplotypes and paths: 6 threads" +is $(vg gbwt -c xy.both.gbwt) 6 "haplotypes and paths: 6 paths" # Remove the reference sample that GBWTs use for paths vg gbwt -R _gbwt_ref -o xy.removed.gbwt xy.both.gbwt is $? 0 "samples can be removed from a GBWT index" -is $(vg gbwt -c xy.removed.gbwt) 4 "haplotypes only: 4 threads" +is $(vg gbwt -c xy.removed.gbwt) 4 "haplotypes only: 4 paths" rm -f xy.gbwt xy.ref.gbwt xy.both.gbwt xy.removed.gbwt # Build a three-sample GBWT from a simple GFA vg gbwt -o all.gbwt -G graphs/three_samples.gfa -is $(vg gbwt -c all.gbwt) 12 "all samples: 12 threads" +is $(vg gbwt -c all.gbwt) 12 "all samples: 12 paths" is $(vg gbwt -H all.gbwt) 6 "all samples: 6 haplotypes" # Remove samples 1 and 3 vg gbwt -R sample1 -R sample3 -o removed.gbwt all.gbwt is $? 0 "multiple samples can be removed from a GBWT index" -is $(vg gbwt -c removed.gbwt) 4 "sample 2: 4 threads" +is $(vg gbwt -c removed.gbwt) 4 "sample 2: 4 paths" is $(vg gbwt -H removed.gbwt) 2 "sample 2: 2 haplotypes" rm -f all.gbwt removed.gbwt -# Extract threads from GBWT +# Extract paths from GBWT vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz vg gbwt -e x.extract x.gbwt -is $? 0 "threads can be extracted from GBWT" -is $(cat x.extract | wc -c) 121 "correct size for the thread file" +is $? 
0 "paths can be extracted from GBWT" +is $(cat x.extract | wc -c) 121 "correct size for the paths file" rm -f x.gbwt x.extract @@ -271,7 +271,7 @@ rm -f extracted.gbwt extracted2.gbwt extracted2.gg vg gbwt -P -n 16 -x xy.xg -g xy.cover.gg -o xy.cover.gbwt is $? 0 "Path cover GBWTGraph construction" is $(md5sum xy.cover.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.cover.gbwt) 32 "path cover: 32 threads" +is $(vg gbwt -c xy.cover.gbwt) 32 "path cover: 32 paths" is $(vg gbwt -C xy.cover.gbwt) 2 "path cover: 2 contigs" is $(vg gbwt -H xy.cover.gbwt) 16 "path cover: 16 haplotypes" is $(vg gbwt -S xy.cover.gbwt) 16 "path cover: 16 samples" @@ -282,10 +282,10 @@ rm -f xy.cover.gg xy.cover.gbwt vg gbwt -P -n 16 -x xy.xg -g xy.cover.gg -o xy.cover.gbwt --pass-paths is $? 0 "Path cover GBWTGraph construction" is $(md5sum xy.cover.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.cover.gbwt) 34 "path cover w/ paths: 34 threads" -is $(vg gbwt -C xy.cover.gbwt) 2 "path cover w/ paths: 2 contigs" -is $(vg gbwt -H xy.cover.gbwt) 17 "path cover w/ paths: 17 haplotypes" -is $(vg gbwt -S xy.cover.gbwt) 17 "path cover w/ paths: 17 samples" +is $(vg gbwt -c xy.cover.gbwt) 34 "path cover w/ reference paths: 34 paths" +is $(vg gbwt -C xy.cover.gbwt) 2 "path cover w/ reference paths: 2 contigs" +is $(vg gbwt -H xy.cover.gbwt) 17 "path cover w/ reference paths: 17 haplotypes" +is $(vg gbwt -S xy.cover.gbwt) 17 "path cover w/ reference paths: 17 samples" rm -f xy.cover.gg xy.cover.gbwt @@ -293,7 +293,7 @@ rm -f xy.cover.gg xy.cover.gbwt vg gbwt -x xy-alt.xg -g xy.local.gg -l -n 16 -o xy.local.gbwt -v small/xy2.vcf.gz is $? 
0 "Local haplotypes GBWTGraph construction" is $(md5sum xy.local.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.local.gbwt) 32 "local haplotypes: 32 threads" +is $(vg gbwt -c xy.local.gbwt) 32 "local haplotypes: 32 paths" is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes: 2 contigs" is $(vg gbwt -H xy.local.gbwt) 16 "local haplotypes: 16 haplotypes" is $(vg gbwt -S xy.local.gbwt) 16 "local haplotypes: 16 samples" @@ -314,10 +314,10 @@ rm -f xy.local.gg xy.local.gbwt xy.local.gbz vg gbwt -x xy-alt.xg -g xy.local.gg -l -n 16 -o xy.local.gbwt -v small/xy2.vcf.gz --pass-paths is $? 0 "Local haplotypes GBWTGraph construction" is $(md5sum xy.local.gg | cut -f 1 -d\ ) 6a2738f51472e0ba1553a815a005b157 "GBWTGraph was serialized correctly" -is $(vg gbwt -c xy.local.gbwt) 34 "local haplotypes w/ paths: 34 threads" -is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs" -is $(vg gbwt -H xy.local.gbwt) 17 "local haplotypes w/ paths: 17 haplotypes" -is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ paths: 17 samples" +is $(vg gbwt -c xy.local.gbwt) 34 "local haplotypes w/ reference paths: 34 paths" +is $(vg gbwt -C xy.local.gbwt) 2 "local haplotypes w/ reference paths: 2 contigs" +is $(vg gbwt -H xy.local.gbwt) 17 "local haplotypes w/ reference paths: 17 haplotypes" +is $(vg gbwt -S xy.local.gbwt) 17 "local haplotypes w/ reference paths: 17 samples" rm -f xy.local.gg xy.local.gbwt @@ -325,11 +325,11 @@ rm -f xy.local.gg xy.local.gbwt vg gbwt -G haplotype-sampling/micb-kir3dl1.gfa -g large.gbz --gbz-format vg gbwt -Z large.gbz -l -n 16 --pass-paths -o large.local.gbwt is $? 
0 "Local haplotypes with reference paths from a larger GBZ" -is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ paths: 36 threads" -is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ paths: 2 contigs" -is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ paths: 18 haplotypes" -is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ paths: 18 samples" -is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ paths: reference_samples set" +is $(vg gbwt -c large.local.gbwt) 36 "local haplotypes w/ reference paths: 36 paths" +is $(vg gbwt -C large.local.gbwt) 2 "local haplotypes w/ reference paths: 2 contigs" +is $(vg gbwt -H large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 haplotypes" +is $(vg gbwt -S large.local.gbwt) 18 "local haplotypes w/ reference paths: 18 samples" +is $(vg gbwt --tags large.local.gbwt | grep -c reference_samples) 1 "local haplotypes w/ reference paths: reference_samples set" rm -f large.gbz large.local.gbwt @@ -339,7 +339,7 @@ vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz vg gbwt -a -n 16 -x xy.xg -g augmented.gg -o augmented.gbwt x.gbwt is $? 0 "Augmented GBWTGraph construction" is $(md5sum augmented.gg | cut -f 1 -d\ ) 00429586246711abcf1367a97d3c468c "GBWTGraph was serialized correctly" -is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 threads" +is $(vg gbwt -c augmented.gbwt) 18 "augmented: 18 paths" is $(vg gbwt -C augmented.gbwt) 2 "augmented: 2 contigs" is $(vg gbwt -H augmented.gbwt) 18 "augmented: 18 haplotypes" is $(vg gbwt -S augmented.gbwt) 17 "augmented: 17 samples" @@ -355,7 +355,7 @@ rm -f x.vg y.vg x.xg xy.xg xy-alt.xg vg gbwt -o gfa.gbwt -G graphs/components_walks.gfa is $? 
0 "GBWT construction from GFA" is $(md5sum gfa.gbwt | cut -f 1 -d\ ) 44c27c37c7af6911c26aea2a41008460 "GBWT was serialized correctly" -is $(vg gbwt -c gfa.gbwt) 4 "gfa: 4 threads" +is $(vg gbwt -c gfa.gbwt) 4 "gfa: 4 paths" is $(vg gbwt -C gfa.gbwt) 2 "gfa: 2 contigs" is $(vg gbwt -H gfa.gbwt) 2 "gfa: 2 haplotypes" is $(vg gbwt -S gfa.gbwt) 1 "gfa: 1 sample" @@ -376,7 +376,7 @@ is $(md5sum gfa2.gbz | cut -f 1 -d\ ) ab241a3f79a781a367b701cb8888bf01 "GBZ was # Build GBWT and GBWTGraph from GFA with node chopping vg gbwt -o chopping.gbwt -g chopping.gg --translation chopping.trans --max-node 2 -G graphs/chopping_walks.gfa is $? 0 "GBWT+GBWTGraph construction from GFA with chopping" -is $(vg gbwt -c chopping.gbwt) 3 "chopping: 3 threads" +is $(vg gbwt -c chopping.gbwt) 3 "chopping: 3 paths" is $(vg gbwt -C chopping.gbwt) 1 "chopping: 1 contig" is $(vg gbwt -H chopping.gbwt) 3 "chopping: 3 haplotypes" is $(vg gbwt -S chopping.gbwt) 2 "chopping: 2 samples" @@ -391,7 +391,7 @@ is $(wc -l < from_gbz.trans) 8 "from GBZ: 8 translations" # Build GBWT and GBWTGraph from GFA with both paths and walks vg gbwt -o ref_paths.gbwt -g ref_paths.gg --translation ref_paths.trans -G graphs/components_paths_walks.gfa is $? 
0 "GBWT+GBWTGraph construction from GFA with reference paths" -is $(vg gbwt -c ref_paths.gbwt) 6 "ref paths: 6 threads" +is $(vg gbwt -c ref_paths.gbwt) 6 "ref paths: 6 paths" is $(vg gbwt -C ref_paths.gbwt) 2 "ref paths: 2 contigs" is $(vg gbwt -H ref_paths.gbwt) 3 "ref paths: 3 haplotypes" is $(vg gbwt -S ref_paths.gbwt) 2 "ref paths: 2 samples" From 6377996ef9f148b02e6e890e023d32dbf01ad96c Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 21:13:18 -0700 Subject: [PATCH 37/54] Rename some long options --- src/subcommand/gbwt_main.cpp | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp index af88d10f2e9..5e55928a4fe 100644 --- a/src/subcommand/gbwt_main.cpp +++ b/src/subcommand/gbwt_main.cpp @@ -330,11 +330,11 @@ void help_gbwt(char** argv) { std::cerr << " -H, --haplotypes print the number of haplotypes" << std::endl; std::cerr << " -S, --samples print the number of samples" << std::endl; std::cerr << " -L, --list-names list contig/sample names (use with -C or -S)" << std::endl; - std::cerr << " -T, --thread-names list path names" << std::endl; // FIXME change to --path-names; make --thread-names a deprecated alias + std::cerr << " -T, --path-names list path names" << std::endl; std::cerr << " --tags list GBWT tags" << std::endl; std::cerr << std::endl; std::cerr << "Step 8: Paths (one input GBWT):" << std::endl; - std::cerr << " -c, --count-threads print the number of paths" << std::endl; // FIXME also here, change to --count-paths + std::cerr << " -c, --count-paths print the number of paths" << std::endl; std::cerr << " -e, --extract FILE extract paths in SDSL format to FILE" << std::endl; std::cerr << std::endl; } @@ -424,7 +424,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { constexpr int OPT_PASS_PATHS = 1400; constexpr int OPT_GBZ_FORMAT = 1500; constexpr int OPT_TAGS = 1700; - + + // Deprecated options. 
+ constexpr int OPT_THREAD_NAMES = 2000; + constexpr int OPT_COUNT_THREADS = 2001; + // Make a collection of all the known tags and their descriptions. Use an ordered map so that we can do some typo guessing. // Values are description and list of prohibited characters. const std::map>> KNOWN_TAGS = { @@ -519,11 +523,13 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { { "haplotypes", no_argument, 0, 'H' }, { "samples", no_argument, 0, 'S' }, { "list-names", no_argument, 0, 'L' }, - { "thread-names", no_argument, 0, 'T' }, // FIXME + { "path-names", no_argument, 0, 'T' }, + { "thread-names", no_argument, 0, OPT_THREAD_NAMES }, { "tags", no_argument, 0, OPT_TAGS }, // Paths - { "count-threads", no_argument, 0, 'c' }, // FIXME + { "count-paths", no_argument, 0, 'c' }, + { "count-threads", no_argument, 0, OPT_COUNT_THREADS }, { "extract", required_argument, 0, 'e' }, { "help", no_argument, 0, 'h' }, @@ -867,6 +873,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.path_names = true; config.metadata_mode = true; break; + case OPT_THREAD_NAMES: + std::cerr << "warning: [vg gbwt] option --thread-names is deprecated; use --path-names instead" << std::endl; + config.path_names = true; + config.metadata_mode = true; + break; case OPT_TAGS: config.tags = true; config.metadata_mode = true; @@ -877,6 +888,11 @@ GBWTConfig parse_gbwt_config(int argc, char** argv) { config.count_paths = true; config.path_mode = true; break; + case OPT_COUNT_THREADS: + std::cerr << "warning: [vg gbwt] option --count-threads is deprecated; use --count-paths instead" << std::endl; + config.count_paths = true; + config.path_mode = true; + break; case 'e': config.path_output = optarg; config.path_mode = true; From 7c6b977c989b90e538cd7b1ab1ff422b718473d2 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 22:28:58 -0700 Subject: [PATCH 38/54] Remove GBWT options from vg index --- src/subcommand/index_main.cpp | 218 ++++------------------------------ 
test/t/05_vg_find.t | 4 +- test/t/06_vg_index.t | 73 ++---------- test/t/07_vg_map.t | 3 +- test/t/11_vg_paths.t | 3 +- test/t/13_vg_sim.t | 4 +- test/t/18_vg_call.t | 2 +- test/t/26_deconstruct.t | 3 +- test/t/33_vg_mpmap.t | 3 +- test/t/37_vg_gbwt.t | 32 +---- test/t/38_vg_prune.t | 6 +- test/t/46_vg_minimizer.t | 6 +- 12 files changed, 62 insertions(+), 295 deletions(-) diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 26a761162f2..731fdcff26d 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -1,4 +1,4 @@ -// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and GBWT indexes +// index.cpp: define the "vg index" subcommand, which makes xg, GCSA2, and distance indexes #include #include @@ -11,7 +11,6 @@ #include "subcommand.hpp" #include "../vg.hpp" -#include "../haplotype_indexer.hpp" #include "xg.hpp" #include #include @@ -23,12 +22,10 @@ #include "../integrated_snarl_finder.hpp" #include "../snarl_distance_index.hpp" #include "../source_sink_overlay.hpp" -#include "../gbwt_helper.hpp" #include "../gbwtgraph_helper.hpp" #include "../gcsa_helper.hpp" #include -#include #include #include @@ -47,24 +44,6 @@ void help_index(char** argv) { << "xg options:" << endl << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl << " -L, --xg-alts include alt paths in xg" << endl - << "gbwt options (more in vg gbwt):" << endl - << " -v, --vcf-phasing FILE generate threads from the haplotypes in the VCF file FILE" << endl - << " -W, --ignore-missing don't warn when variants in the VCF are missing from the graph; silently skip them" << endl - << " -T, --store-threads generate threads from the embedded paths" << endl - << " -M, --store-gam FILE generate threads from the alignments in gam FILE (many allowed)" << endl - << " -F, --store-gaf FILE generate threads from the alignments in gaf FILE (many allowed)" << endl - 
<< " -G, --gbwt-name FILE store the threads as GBWT in FILE" << endl - << " -z, --actual-phasing do not make unphased homozygous genotypes phased"<< endl - << " -P, --force-phasing replace unphased genotypes with randomly phased ones" << endl - << " -o, --discard-overlaps skip overlapping alternate alleles if the overlap cannot be resolved" << endl - << " -B, --batch-size N number of samples per batch (default 200)" << endl - << " -u, --buffer-size N GBWT construction buffer size in millions of nodes (default 100)" << endl - << " -n, --id-interval N store haplotype ids at one out of N positions (default 1024)" << endl - << " -R, --range X..Y process samples X to Y (inclusive)" << endl - << " -r, --rename V=P rename contig V in the VCFs to path P in the graph (may repeat)" << endl - << " --rename-variants when renaming contigs, find variants in the graph based on the new name" << endl - << " -I, --region C:S-E operate on only the given 1-based region of the given VCF contig (may repeat)" << endl - << " -E, --exclude SAMPLE exclude any samples with the given name from haplotype indexing" << endl << "gcsa options:" << endl << " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl //<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl @@ -83,12 +62,6 @@ void help_index(char** argv) { << " if N is 0 then don't store distances, only the snarl tree" << endl; } -void multiple_thread_sources() { - std::cerr << "error: [vg index] cannot generate threads from multiple sources (VCF, GAM, GAF, paths)" << std::endl; - std::cerr << "error: [vg index] GBWT indexes can be built separately and merged with vg gbwt" << std::endl; - std::exit(EXIT_FAILURE); -} - int main_index(int argc, char** argv) { if (argc == 2) { @@ -101,25 +74,18 @@ int main_index(int argc, char** argv) { #define OPT_DISTANCE_SNARL_LIMIT 1002 // Which indexes to build. 
- bool build_xg = false, build_gbwt = false, build_gcsa = false, build_dist = false; + bool build_xg = false, build_gcsa = false, build_dist = false; // Files we should read. string vcf_name, mapping_name; vector dbg_names; // Files we should write. - string xg_name, gbwt_name, gcsa_name, dist_name; - + string xg_name, gcsa_name, dist_name; // General bool show_progress = false; - // GBWT - HaplotypeIndexer haplotype_indexer; - enum thread_source_type { thread_source_none, thread_source_vcf, thread_source_paths, thread_source_gam, thread_source_gaf }; - thread_source_type thread_source = thread_source_none; - vector aln_file_names; - // GCSA gcsa::size_type kmer_size = gcsa::Key::MAX_LENGTH; gcsa::ConstructionParameters params; @@ -152,7 +118,7 @@ int main_index(int argc, char** argv) { {"thread-db", required_argument, 0, 'F'}, {"xg-alts", no_argument, 0, 'L'}, - // GBWT + // GBWT. These have been removed and will return an error. {"vcf-phasing", required_argument, 0, 'v'}, {"ignore-missing", no_argument, 0, 'W'}, {"store-threads", no_argument, 0, 'T'}, @@ -211,7 +177,6 @@ int main_index(int argc, char** argv) { break; case 'p': show_progress = true; - haplotype_indexer.show_progress = true; break; // XG @@ -223,111 +188,26 @@ int main_index(int argc, char** argv) { xg_alts = true; break; - // GBWT - case 'v': - if (thread_source != thread_source_none) { - multiple_thread_sources(); - } - thread_source = thread_source_vcf; - vcf_name = optarg; - break; - case 'W': - haplotype_indexer.warn_on_missing_variants = false; - break; - case 'T': - if (thread_source != thread_source_none) { - multiple_thread_sources(); - } - thread_source = thread_source_paths; - break; - case 'M': - if (thread_source != thread_source_none && thread_source != thread_source_gam) { - multiple_thread_sources(); - } - thread_source = thread_source_gam; - build_gbwt = true; - aln_file_names.push_back(optarg); - break; - case 'F': - if (thread_source != thread_source_none && thread_source != 
thread_source_gaf) { - multiple_thread_sources(); - } - thread_source = thread_source_gaf; - build_gbwt = true; - aln_file_names.push_back(optarg); - break; - case 'G': - build_gbwt = true; - gbwt_name = optarg; - break; - case 'z': - haplotype_indexer.phase_homozygous = false; - break; - case 'P': - haplotype_indexer.force_phasing = true; - break; - case 'o': - haplotype_indexer.discard_overlaps = true; - break; - case 'B': - haplotype_indexer.samples_in_batch = std::max(parse(optarg), 1ul); - break; - case 'u': - haplotype_indexer.gbwt_buffer_size = std::max(parse(optarg), 1ul); - break; - case 'n': - haplotype_indexer.id_interval = parse(optarg); - break; - case 'R': - { - // Parse first..last - string temp(optarg); - size_t found = temp.find(".."); - if(found == string::npos || found == 0 || found + 2 == temp.size()) { - cerr << "error: [vg index] could not parse range " << temp << endl; - exit(1); - } - haplotype_indexer.sample_range.first = parse(temp.substr(0, found)); - haplotype_indexer.sample_range.second = parse(temp.substr(found + 2)) + 1; - } - break; - case 'r': - { - // Parse the rename old=new - string key_value(optarg); - auto found = key_value.find('='); - if (found == string::npos || found == 0 || found + 1 == key_value.size()) { - cerr << "error: [vg index] could not parse rename " << key_value << endl; - exit(1); - } - // Parse out the two parts - string vcf_contig = key_value.substr(0, found); - string graph_contig = key_value.substr(found + 1); - // Add the name mapping - haplotype_indexer.path_to_vcf[graph_contig] = vcf_contig; - } - break; - case OPT_RENAME_VARIANTS: - haplotype_indexer.rename_variants = true; - break; - case 'I': - { - // We want to parse this region specifier - string region(optarg); - - Region parsed; - parse_region(region, parsed); - if (parsed.start <= 0 || parsed.end <= 0) { - // We need both range bounds, and we can't accept 0 since input is 1-based. 
- cerr << "error: [vg index] could not parse 1-based region " << optarg << endl; - } - - // Make sure to correct the coordinates to 0-based exclusive-end, from 1-based inclusive-end - haplotype_indexer.regions[parsed.seq] = make_pair((size_t) (parsed.start - 1), (size_t) parsed.end); - } - break; + // GBWT. The options remain, but they are no longer supported. + case 'v': // Fall through + case 'W': // Fall through + case 'T': // Fall through + case 'M': // Fall through + case 'F': // Fall through + case 'G': // Fall through + case 'z': // Fall through + case 'P': // Fall through + case 'o': // Fall through + case 'B': // Fall through + case 'u': // Fall through + case 'n': // Fall through + case 'R': // Fall through + case 'r': // Fall through + case OPT_RENAME_VARIANTS: // Fall through + case 'I': // Fall through case 'E': - haplotype_indexer.excluded_samples.insert(optarg); + std::cerr << "error: [vg index] GBWT construction options have been removed; use vg gbwt instead" << std::endl; + std::exit(EXIT_FAILURE); break; // GCSA @@ -391,37 +271,11 @@ int main_index(int argc, char** argv) { } - if (xg_name.empty() && gbwt_name.empty() && - gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) { + if (xg_name.empty() && gcsa_name.empty() && !build_gai_index && !build_vgi_index && dist_name.empty()) { cerr << "error: [vg index] index type not specified" << endl; return 1; } - if (build_gbwt && thread_source == thread_source_none) { - cerr << "error: [vg index] cannot build GBWT without threads" << endl; - return 1; - } - - if (thread_source != thread_source_none && !build_gbwt) { - cerr << "error: [vg index] no GBWT output specified for the threads" << endl; - return 1; - } - - if (thread_source == thread_source_gam || thread_source == thread_source_gaf) { - for (const auto& name : aln_file_names) { - if (name == "-") { - cerr << "error: [vg index] GAM (-M) and GAF (-F) input files cannot be read from stdin (-)" << endl; - return 1; - } - } 
- } - - if (thread_source != thread_source_none && file_names.size() != 1) { - cerr << "error: [vg index] exactly one graph required for generating threads" << std::endl; - cerr << "error: [vg index] you may combine the graphs with vg index -x combined.xg --xg-alts" << std::endl; - return 1; - } - if (file_names.size() <= 0 && dbg_names.empty()){ //cerr << "No graph provided for indexing. Please provide a .vg file or GCSA2-format deBruijn graph to index." << endl; //return 1; @@ -481,30 +335,6 @@ int main_index(int argc, char** argv) { vg::io::save_handle_graph(&xg_index, xg_name); } - // Generate threads - if (thread_source != thread_source_none) { - - // Load the only input graph. - unique_ptr path_handle_graph; - path_handle_graph = vg::io::VPKG::load_one(file_names[0]); - - std::unique_ptr gbwt_index(nullptr); - if (thread_source == thread_source_vcf) { - std::vector parse_files = haplotype_indexer.parse_vcf(vcf_name, *path_handle_graph); - path_handle_graph.reset(); // Save memory by deleting the graph. - gbwt_index = haplotype_indexer.build_gbwt(parse_files); - } else if (thread_source == thread_source_paths) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph); - } else if (thread_source == thread_source_gam) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAM"); - } else if (thread_source == thread_source_gaf) { - gbwt_index = haplotype_indexer.build_gbwt(*path_handle_graph, aln_file_names, "GAF"); - } - if (build_gbwt && gbwt_index.get() != nullptr) { - save_gbwt(*gbwt_index, gbwt_name, show_progress); - } - } // End of thread indexing. 
- // Build GCSA if (build_gcsa) { diff --git a/test/t/05_vg_find.t b/test/t/05_vg_find.t index f7c54e36eee..67a51741da1 100644 --- a/test/t/05_vg_find.t +++ b/test/t/05_vg_find.t @@ -112,7 +112,7 @@ rm -f t.xg t.vg t.x:30:35.vg t.x:10:20.vg q.x:30:35.vg q.x:10:20.vg t.bed vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg index -x x.xg x.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -v small/xy2.vcf.gz -o x.gbwt -x x.vg is $(vg find -p x -x x.xg -K 16 -H x.gbwt | cut -f 5 | sort | uniq -c | tail -n 1 | awk '{ print $1 }') 1510 "we find the expected number of kmers with haplotype frequency equal to 2" rm -f x.vg x.xg x.gbwt @@ -124,7 +124,7 @@ is $? 0 "GFA i/o for find -n consistent with converting both ways" # Find nodes that map to the provided ids vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -v small/xy2.vcf.gz -o x.gbwt -x x.vg vg prune -u -m x.mapping -g x.gbwt -e 1 x.vg > x.unfolded.vg rm -f expected.gfa diff --git a/test/t/06_vg_index.t b/test/t/06_vg_index.t index a8ac6add8dc..0e3edf78dbc 100644 --- a/test/t/06_vg_index.t +++ b/test/t/06_vg_index.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="en_US.utf8" # force ekg's favorite sort order -plan tests 54 +plan tests 45 # Single graph without haplotypes vg construct -r small/x.fa -v small/x.vcf.gz > x.vg @@ -39,9 +39,6 @@ rm -f x3.gcsa x3.gcsa.lcp # Single graph with haplotypes vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg -vg index -G x.gbwt -v small/x.vcf.gz x.vg -is $? 0 "building a GBWT index of a graph with haplotypes" - vg index -x x.xg x.vg is $? 0 "building an XG index of a graph with haplotypes" @@ -55,51 +52,25 @@ is $(vg paths -x x-ap.xg -L | wc -l) $(vg paths -v x.vg -L | wc -l) "xg index do vg index -g x.gcsa x.vg is $? 
0 "building a GCSA index of a graph with haplotypes" -vg index -x x2.xg -G x2.gbwt -v small/x.vcf.gz -g x2.gcsa x.vg +vg index -x x2.xg -g x2.gcsa x.vg is $? 0 "building all indexes at once" -cmp x.xg x2.xg && cmp x.gbwt x2.gbwt && cmp x.gcsa x2.gcsa && cmp x.gcsa.lcp x2.gcsa.lcp +cmp x.xg x2.xg && cmp x.gcsa x2.gcsa && cmp x.gcsa.lcp x2.gcsa.lcp is $? 0 "the indexes are identical" -vg index -x x2-ap.xg -G x2-ap.gbwt -v small/x.vcf.gz -g x2-ap.gcsa x.vg -L +vg index -x x2-ap.xg -g x2-ap.gcsa x.vg -L is $? 0 "building all indexes at once, while leaving alt paths in xg" -cmp x.gbwt x2-ap.gbwt && cmp x.gcsa x2-ap.gcsa && cmp x.gcsa.lcp x2-ap.gcsa.lcp +cmp x.gcsa x2-ap.gcsa && cmp x.gcsa.lcp x2-ap.gcsa.lcp is $? 0 "the indexes are identical with -L" is $(vg paths -x x2-ap.xg -L | wc -l) $(vg paths -v x.vg -L | wc -l) "xg index does contains alt paths with index -L all at once" -# Exclude a sample from the GBWT index -vg index -G empty.gbwt -v small/x.vcf.gz --exclude 1 x.vg -is $? 0 "samples can be excluded from haplotype indexing" -is $(vg gbwt -c empty.gbwt) 0 "excluded samples were not included in the GBWT index" - -# Make GBWT from GAM -vg paths -v x.vg -X -Q _alt > x-alts.gam -vg index x.vg -M x-alts.gam -G x-gam.gbwt -# Make GBWT from GAF -vg convert x.vg -G x-alts.gam > x-alts.gaf -vg index x.vg -F x-alts.gaf -G x-gaf.gbwt -cmp x-gaf.gbwt x-gam.gbwt -is $? 0 "GBWT from GAF same as from GAM" - rm -f x.vg -rm -f x.xg x-ap.xg x.gbwtx.gcsa x.gcsa.lcp -rm -f x2.xg x2.gbwt x2.gcsa x2.gcsa.lcp -rm -f x2-ap.xg x2-ap.gbwt x2-ap.gcsa x2-ap.gcsa.lcp -rm -f empty.gbwt -rm -f x-alts.gam x-alts.gaf x-gam.gbwt x-gaf.gbwt - - -# Subregion graph with haplotypes -vg construct -r small/x.fa -v small/x.vcf.gz -a --region x:100-200 > x.part.vg - -vg index -x x.part.xg -G x.part.gbwt --region x:100-200 -v small/x.vcf.gz x.part.vg 2>log.txt -is $? 
0 "building GBWT index for a regional graph" - -is "$(cat log.txt | wc -c)" "0" "no warnings about missing variants produced" - -rm -f x.part.vg x.part.xg x.part.gbwt log.txt +rm -f x.xg x-ap.xg x.gcsa x.gcsa.lcp +rm -f x2.xg x2.gcsa x2.gcsa.lcp +rm -f x2-ap.xg x2-ap.gcsa x2-ap.gcsa.lcp +rm -f x-alts.gam x-alts.gaf # Multiple graphs without haplotypes @@ -129,9 +100,6 @@ vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j x.vg y.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg && vg index -G y.gbwt -v small/xy2.vcf.gz y.vg && vg gbwt -m -f -o xy.gbwt x.gbwt y.gbwt -is $? 0 "building a GBWT index of multiple graphs with haplotypes" - vg index -x xy.xg x.vg y.vg is $? 0 "building an XG index of multiple graphs with haplotypes" @@ -144,31 +112,14 @@ is $? 0 "building XG and GCSA indexes at once" vg index -x xy-alt.xg -L x.vg y.vg is $? 0 "building an XG index with alt paths" -vg index -G xy2.gbwt -v small/xy2.vcf.gz xy-alt.xg -is $? 0 "building a GBWT index from an XG index" - -cmp xy.xg xy2.xg && cmp xy.gcsa xy2.gcsa && cmp xy.gcsa.lcp xy2.gcsa.lcp && cmp xy.gbwt xy2.gbwt +cmp xy.xg xy2.xg && cmp xy.gcsa xy2.gcsa && cmp xy.gcsa.lcp xy2.gcsa.lcp is $? 0 "the indexes are identical" rm -f x.vg y.vg -rm -f x.gbwt y.gbwt -rm -f xy.xg xy.gbwt xy.gcsa xy.gcsa.lcp -rm -f xy2.xg xy2.gbwt xy2.gcsa xy2.gcsa.lcp +rm -f xy.xg xy.gcsa xy.gcsa.lcp +rm -f xy2.xg xy2.gcsa xy2.gcsa.lcp rm -f xy-alt.xg - -# GBWT construction options -vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null - -vg index -G x_ref.gbwt -T x.vg -is $? 0 "GBWT can be built for paths" - -rm -f x_ref.gbwt - -# We do not test GBWT construction parameters (-B, -u, -n) because they matter only for large inputs. -# We do not test chromosome-length path generation (-P, -o) for the same reason. 
- - # Other tests vg construct -m 1000 -r small/x.fa -v small/x.vcf.gz >x.vg vg index -x x.xg x.vg bogus123.vg 2>/dev/null diff --git a/test/t/07_vg_map.t b/test/t/07_vg_map.t index 08a99edc398..a7c1e41c9a8 100644 --- a/test/t/07_vg_map.t +++ b/test/t/07_vg_map.t @@ -152,7 +152,8 @@ is $? 1 "error on vg map -f (paired, RHS)" # Now do the GBWT vg construct -a -r small/x.fa -v small/x.vcf.gz >x.vg -vg index -x x.xg -g x.gcsa -v small/x.vcf.gz --gbwt-name x.gbwt -k 16 x.vg +vg index -x x.xg -g x.gcsa -k 16 x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg # This read is all ref which matches no haplotype in x.vcf.gz and visits some unused nodes is "$(vg map -x x.xg -g x.gcsa --gbwt-name x.gbwt --hap-exp 1 --full-l-bonus 0 -f reads/x.unvisited.fq -j | jq -r '.score')" "36" "mapping a read that touches unused nodes gets the base score" diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index e21a8fe2c09..e83fa8de659 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -11,7 +11,8 @@ plan tests 26 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg -vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg +vg index -x x.xg x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg # List path/thread names from various input formats is "$(vg paths --list -v x2.vg)" "x" "path listing works from vg" diff --git a/test/t/13_vg_sim.t b/test/t/13_vg_sim.t index 0481a82e463..ecc0799eeda 100644 --- a/test/t/13_vg_sim.t +++ b/test/t/13_vg_sim.t @@ -11,7 +11,7 @@ plan tests 36 vg construct -r small/x.fa -v small/x.vcf.gz >x.vg vg construct -r small/x.fa -v small/x.vcf.gz -a >x2.vg vg index -x x.xg x.vg -vg index -G x.gbwt -v small/x.vcf.gz x2.vg +vg gbwt -o x.gbwt -v small/x.vcf.gz -x x2.vg is $(vg sim -l 100 -n 100 -x x.xg | wc -l) 100 \ "vg sim creates the correct number of reads" @@ -58,7 +58,7 @@ rm -f x.vg x2.vg x.xg x.gbwt n.vg n.fa n.xg vg construct -r small/xy.fa -v small/x.vcf.gz -a >xy.vg vg index -x xy.xg xy.vg -vg 
index -G xy.gbwt -v small/x.vcf.gz xy.vg +vg gbwt -o xy.gbwt -v small/x.vcf.gz -x xy.vg vg sim -s 12345 -n 1000 -l 2 -e 0.1 -x xy.xg -g xy.gbwt --sample-name 1 --any-path >/dev/null is $? "0" "Sample simulation works along with --any-path" diff --git a/test/t/18_vg_call.t b/test/t/18_vg_call.t index 6a457f4eb9f..b97cee3fb08 100644 --- a/test/t/18_vg_call.t +++ b/test/t/18_vg_call.t @@ -89,7 +89,7 @@ is "${REF_COUNT_V}" "${REF_COUNT_A}" "Same number of reference calls with -a as # Output snarl traversals into a GBWT then genotype that vg call HGSVC_alts.xg -k HGSVC_alts.pack -s HG00514 -T | gzip > HGSVC_travs.gaf.gz -vg index HGSVC_alts.xg -F HGSVC_travs.gaf.gz -G HGSVC_travs.gbwt +vg gbwt -o HGSVC_travs.gbwt -x HGSVC_alts.xg -A HGSVC_travs.gaf.gz vg call HGSVC_alts.xg -k HGSVC_alts.pack -g HGSVC_travs.gbwt -s HG00514 > HGSVC_travs.vcf vg call HGSVC_alts.xg -k HGSVC_alts.pack -s HG00514 > HGSVC_direct.vcf # extract the called genotypes diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 03c7f8f3a87..47a28c4ae35 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -121,7 +121,8 @@ is "$?" 
0 "deconstructing vg graph gives same output as xg graph" rm -f tiny_names.gfa tiny_names.vg tiny_names.xg tiny_names_decon.vcf tiny_names_decon_vg.vcf vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg -vg index -x x.xg -G x.gbwt -v small/x.vcf.gz x.vg +vg index -x x.xg x.vg +vg gbwt -v small/x.vcf.gz -o x.gbwt -x x.vg vg deconstruct x.xg -g x.gbwt | bgzip > x.decon.vcf.gz tabix -f -p vcf x.decon.vcf.gz cat small/x.fa | bcftools consensus small/x.vcf.gz -s 1 -H 1 > small.s1.h1.fa diff --git a/test/t/33_vg_mpmap.t b/test/t/33_vg_mpmap.t index 16af8c4cef9..e569b2b9ed6 100644 --- a/test/t/33_vg_mpmap.t +++ b/test/t/33_vg_mpmap.t @@ -11,7 +11,8 @@ plan tests 21 # Exercise the GBWT # Index a couple of nearly identical contigs vg construct -m 1000 -a -r small/xy.fa -v small/xy2.vcf.gz >xy2.vg -vg index -x xy2.xg -g xy2.gcsa -v small/xy2.vcf.gz --gbwt-name xy2.gbwt -k 16 xy2.vg +vg index -x xy2.xg -g xy2.gcsa -k 16 xy2.vg +vg gbwt -v small/xy2.vcf.gz -o xy2.gbwt -x xy2.vg # We turn off the background model calibration with -B and ignore it with -P 1 diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 9fdc0c8a4fb..870b5d9674a 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 159 +plan tests 149 # Build vg graphs for two chromosomes @@ -21,10 +21,6 @@ vg index -x xy-alt.xg -L x.vg y.vg # Single chromosome: haplotypes vg gbwt -x x.vg -o x.gbwt -v small/xy2.vcf.gz is $? 0 "chromosome X GBWT with vg gbwt" -vg index -G x2.gbwt -v small/xy2.vcf.gz x.vg -is $? 0 "chromosome X GBWT with vg index" -cmp x.gbwt x2.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" vg gbwt -x x.vg -o parse --parse-only -v small/xy2.vcf.gz is $? 
0 "chromosome X VCF parse" ../deps/gbwt/bin/build_gbwt -p -r parse_x > /dev/null 2> /dev/null @@ -41,20 +37,16 @@ is $(vg gbwt -T x.gbwt | wc -l) 2 "chromosome X: 2 path names" is $(vg gbwt -C -L x.gbwt | wc -l) 1 "chromosome X: 1 contig name" is $(vg gbwt -S -L x.gbwt | wc -l) 1 "chromosome X: 1 sample name" -rm -f x.gbwt x2.gbwt parse_x.gbwt +rm -f x.gbwt parse_x.gbwt rm -f parse_x parse_x_0_1 # Single chromosome: paths vg gbwt -E -o x.ref.gbwt -x x.vg is $? 0 "chromosome X reference GBWT with vg gbwt" -vg index -G x2.ref.gbwt -T x.vg -is $? 0 "chromosome X reference GBWT with vg index" -cmp x.ref.gbwt x2.ref.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" is $(vg gbwt -c x.ref.gbwt) 1 "chromosome X reference: 1 path" -rm -f x.ref.gbwt x2.ref.gbwt +rm -f x.ref.gbwt # Single chromosome: alignments @@ -62,17 +54,13 @@ vg paths -v x.vg -X -Q _alt > x.alts.gam vg convert -G x.alts.gam x.vg > x.alts.gaf vg gbwt -A -o x.alts.gaf.gbwt -x x.vg x.alts.gaf is $? 0 "chromosome X GAF with vg gbwt" -vg index -F x.alts.gaf -G x2.alts.gaf.gbwt x.vg -is $? 0 "chromosome X GAF with vg index" -cmp x.alts.gaf.gbwt x2.alts.gaf.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" vg gbwt -A --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam is $? 0 "chromosome X GAM with vg gbwt" cmp x.alts.gaf.gbwt x.alts.gaf.gbwt is $? 0 "identical construction results from GAF and GAM" rm -f x.alts.gam x.alts.gaf -rm -f x.alts.gaf.gbwt x2.alts.gaf.gbwt x.alts.gam.gbwt +rm -f x.alts.gaf.gbwt x.alts.gam.gbwt # Graph region: haplotypes @@ -80,12 +68,8 @@ vg construct -r small/x.fa -v small/x.vcf.gz -a --region x:100-200 > x.part.vg vg gbwt -x x.part.vg -o x.part.gbwt --vcf-region x:100-200 -v small/x.vcf.gz 2> log.txt is $? 0 "chromosome X subgraph GBWT with vg gbwt" is "$(cat log.txt | wc -c)" 0 "no warnings about missing variants" -vg index -G x2.part.gbwt --region x:100-200 -v small/x.vcf.gz x.part.vg 2> log.txt -is $? 
0 "chromosome X subgraph GBWT with vg index" -cmp x.part.gbwt x2.part.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" -rm -f x.part.vg x.part.gbwt x2.part.gbwt log.txt +rm -f x.part.vg x.part.gbwt log.txt # Multiple chromosomes: haplotypes @@ -127,16 +111,12 @@ rm -f xy.1000gp.gbwt # Multiple chromosomes: paths as contigs vg gbwt -E -o xy.contigs.gbwt -x xy.xg is $? 0 "paths as contigs with vg gbwt" -vg index -G xy2.contigs.gbwt -T xy.xg -is $? 0 "paths as contigs with vg index" -cmp xy.contigs.gbwt xy2.contigs.gbwt -is $? 0 "identical construction results with vg gbwt and vg index" is $(vg gbwt -c xy.contigs.gbwt) 2 "paths as contigs: 2 paths" is $(vg gbwt -C xy.contigs.gbwt) 2 "paths as contigs: 2 contigs" is $(vg gbwt -H xy.contigs.gbwt) 1 "paths as contigs: 1 haplotype" is $(vg gbwt -S xy.contigs.gbwt) 1 "paths as contigs: 1 sample" -rm -f xy.contigs.gbwt xy2.contigs.gbwt +rm -f xy.contigs.gbwt # Build an r-index diff --git a/test/t/38_vg_prune.t b/test/t/38_vg_prune.t index 3cd5db9cde0..1aa7533c260 100644 --- a/test/t/38_vg_prune.t +++ b/test/t/38_vg_prune.t @@ -10,7 +10,7 @@ plan tests 21 # Build a graph with one path and two threads vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg +vg gbwt -o x.gbwt -v small/xy2.vcf.gz -x x.vg # Basic pruning: 5 components, 51 nodes, 51 edges vg prune -e 1 x.vg > y.vg @@ -54,8 +54,8 @@ rm -f x.vg x.gbwt vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -m 32 -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j -m xy.mapping x.vg y.vg -vg index -G x.gbwt -v small/xy2.vcf.gz x.vg -vg index -G y.gbwt -v small/xy2.vcf.gz y.vg +vg gbwt -o x.gbwt -v small/xy2.vcf.gz -x x.vg +vg gbwt -o y.gbwt -v small/xy2.vcf.gz -x y.vg # Prune a single-chromosome graph using multi-chromosome GBWT vg gbwt -m -o xy.gbwt x.gbwt y.gbwt diff --git a/test/t/46_vg_minimizer.t 
b/test/t/46_vg_minimizer.t index 06f9133f09d..e4090b54c9d 100644 --- a/test/t/46_vg_minimizer.t +++ b/test/t/46_vg_minimizer.t @@ -62,8 +62,10 @@ rm -f x.vg x.xg x.gbwt x.snarls x.dist x.mi x.gg x.gbz vg construct -r small/xy.fa -v small/xy2.vcf.gz -R x -C -a > x.vg 2> /dev/null vg construct -r small/xy.fa -v small/xy2.vcf.gz -R y -C -a > y.vg 2> /dev/null vg ids -j x.vg y.vg -vg index -x x.xg -G x.gbwt -v small/xy2.vcf.gz x.vg -vg index -x y.xg -G y.gbwt -v small/xy2.vcf.gz y.vg +vg index -x x.xg x.vg +vg gbwt -o x.gbwt -x x.vg -v small/xy2.vcf.gz +vg index -x y.xg y.vg +vg gbwt -o y.gbwt -x y.vg -v small/xy2.vcf.gz # Appending to the index vg minimizer --no-dist -t 1 -o x.mi -g x.gbwt x.xg From 5a2c3af1fda989c2e31ad9da5c1897ec999ac91a Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Thu, 31 Oct 2024 23:01:56 -0700 Subject: [PATCH 39/54] Update SDSL, GCSA, GBWT, GBWTGraph --- deps/gbwt | 2 +- deps/gbwtgraph | 2 +- deps/gcsa2 | 2 +- deps/sdsl-lite | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deps/gbwt b/deps/gbwt index 029c2b45667..2aab62f0664 160000 --- a/deps/gbwt +++ b/deps/gbwt @@ -1 +1 @@ -Subproject commit 029c2b456675eab0fbb55de21c8002e1660cb0d3 +Subproject commit 2aab62f0664b2ce8eb4cffd6b3872c89b85fdfa6 diff --git a/deps/gbwtgraph b/deps/gbwtgraph index de61d340695..d2a423b5eb5 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit de61d340695f64b8aa978816b195d6687e7fda5a +Subproject commit d2a423b5eb54c3c2f0676cb2925b46ad0ba7e189 diff --git a/deps/gcsa2 b/deps/gcsa2 index 7dbf9305817..8b6b049ab64 160000 --- a/deps/gcsa2 +++ b/deps/gcsa2 @@ -1 +1 @@ -Subproject commit 7dbf93058171b5af420f1b6166d4d2bf54cd8748 +Subproject commit 8b6b049ab6444e891bb7c38ec8f38169ce62e5fb diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 86fa3534c1b..115dfd5b737 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 86fa3534c1bf02b5468bbe58e2e0b8f6ae2d6fa4 +Subproject commit 
115dfd5b7371ef99a1e0611389b956bb88bf9fc4 From ba0ed68ebff2d05dc23e6190b572284d7f750527 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Sat, 2 Nov 2024 04:09:56 -0700 Subject: [PATCH 40/54] Multithreaded GAF/GAM to GBWT --- src/gbwt_helper.cpp | 20 ----- src/gbwt_helper.hpp | 7 -- src/haplotype_indexer.cpp | 146 +++++++++++++++++++++++++++-------- src/haplotype_indexer.hpp | 11 ++- src/subcommand/gbwt_main.cpp | 4 +- test/t/37_vg_gbwt.t | 4 +- 6 files changed, 126 insertions(+), 66 deletions(-) diff --git a/src/gbwt_helper.cpp b/src/gbwt_helper.cpp index 0958ba45533..19306a7dfe3 100644 --- a/src/gbwt_helper.cpp +++ b/src/gbwt_helper.cpp @@ -88,26 +88,6 @@ gbwt::size_type gbwt_node_width(const HandleGraph& graph) { return gbwt::bit_length(gbwt::Node::encode(graph.max_node_id(), true)); } -void finish_gbwt_constuction(gbwt::GBWTBuilder& builder, - const std::vector& sample_names, - const std::vector& contig_names, - size_t haplotype_count, bool print_metadata, - const std::string& header) { - - builder.finish(); - builder.index.metadata.setSamples(sample_names); - builder.index.metadata.setHaplotypes(haplotype_count); - builder.index.metadata.setContigs(contig_names); - if (print_metadata) { - #pragma omp critical - { - std::cerr << header << ": "; - gbwt::operator<<(std::cerr, builder.index.metadata); - std::cerr << std::endl; - } - } -} - //------------------------------------------------------------------------------ void load_gbwt(gbwt::GBWT& index, const std::string& filename, bool show_progress) { diff --git a/src/gbwt_helper.hpp b/src/gbwt_helper.hpp index 33e1545066c..57b819a579b 100644 --- a/src/gbwt_helper.hpp +++ b/src/gbwt_helper.hpp @@ -67,13 +67,6 @@ gbwt::vector_type path_predecessors(const PathHandleGraph& graph, const std::str /// Determine the node width in bits for the GBWT nodes based on the given graph. gbwt::size_type gbwt_node_width(const HandleGraph& graph); -/// Finish GBWT construction and optionally print the metadata. 
-void finish_gbwt_constuction(gbwt::GBWTBuilder& builder, - const std::vector& sample_names, - const std::vector& contig_names, - size_t haplotype_count, bool print_metadata, - const std::string& header = "GBWT"); - //------------------------------------------------------------------------------ /* diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp index 528160485cd..b7c935d7e05 100644 --- a/src/haplotype_indexer.cpp +++ b/src/haplotype_indexer.cpp @@ -4,10 +4,12 @@ #include #include +#include #include #include #include +#include #include #include "gbwt_helper.hpp" @@ -394,7 +396,7 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const std::vecto // And add every path that passes the filter (including haplotype paths) from the source graph. gbwtgraph::store_paths(builder, *graph, {PathSense::GENERIC, PathSense::REFERENCE, PathSense::HAPLOTYPE}, &path_filter); - // Finish the construction for this set of threads and put the index back. + // Finish the construction for this set of paths and put the index back. builder.finish(); builder.swapIndex(*index); } @@ -413,24 +415,43 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandle return build_gbwt({}, "GBWT", &graph); } -std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandleGraph& graph, - const std::vector& aln_filenames, const std::string& aln_format) const { +std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& graph, + const std::vector& aln_filenames, const std::string& aln_format, size_t parallel_jobs) const { - // GBWT metadata. - std::vector sample_names, contig_names; - std::map> sample_info; // name -> (id, count) - contig_names.push_back("0"); // An artificial contig. - size_t haplotype_count = 0; + // Handle multithreading and parallel jobs. + parallel_jobs = std::max(1, parallel_jobs); + int old_threads = omp_get_max_threads(); + omp_set_num_threads(parallel_jobs); + + // Partition the graph into construction jobs. 
+ if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Partitioning the graph into GBWT construction jobs" << std::endl; + } + } + size_t target_size = graph.get_node_count() / parallel_jobs; + gbwtgraph::ConstructionJobs jobs = gbwtgraph::gbwt_construction_jobs(graph, target_size); + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Created " << jobs.size() << " parallel construction jobs" << std::endl; + } + } // GBWT construction. - gbwt::GBWTBuilder builder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval); - builder.index.addMetadata(); + std::vector builder_mutexes(jobs.size()); + std::vector> builders(jobs.size()); + std::vector> read_names(jobs.size()); // TODO: Concatenated strings + starting offsets may save space. + for (size_t i = 0; i < jobs.size(); i++) { + builders[i].reset(new gbwt::GBWTBuilder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval)); + } // Actual work. if (this->show_progress) { #pragma omp critical { - std::cerr << "Converting " << aln_format << " to threads" << std::endl; + std::cerr << "Converting " << aln_format << " to GBWT paths" << std::endl; } } std::function lambda = [&](Alignment& aln) { @@ -438,37 +459,98 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const PathHandle for (auto& m : aln.path().mapping()) { buffer.push_back(mapping_to_gbwt(m)); } - builder.insert(buffer, true); // Insert in both orientations. 
- size_t sample_id = 0, sample_count = 0; - auto iter = sample_info.find(aln.name()); - if (iter == sample_info.end()) { - sample_id = sample_names.size(); - sample_names.push_back(aln.name()); - sample_info[aln.name()] = std::pair(sample_id, sample_count); - haplotype_count++; - } else { - sample_id = iter->second.first; - sample_count = iter->second.second; - iter->second.second++; + size_t job_id = 0; + if (buffer.size() > 0) { + job_id = jobs.job(gbwt::Node::id(buffer.front())); + if (job_id >= jobs.size()) { + job_id = 0; + } + } + { + // Insert the path into the appropriate builder and record the read name. + std::lock_guard lock(builder_mutexes[job_id]); + builders[job_id]->insert(buffer, true); + read_names[job_id].push_back(aln.name()); } - builder.index.metadata.addPath(sample_id, 0, 0, sample_count); }; for (auto& file_name : aln_filenames) { if (aln_format == "GAM") { get_input_file(file_name, [&](istream& in) { - vg::io::for_each(in, lambda); + vg::io::for_each_parallel(in, lambda); }); } else { assert(aln_format == "GAF"); - vg::io::gaf_unpaired_for_each(graph, file_name, lambda); + vg::io::gaf_unpaired_for_each_parallel(graph, file_name, lambda); } } - - // Finish the construction and extract the index. - finish_gbwt_constuction(builder, sample_names, contig_names, haplotype_count, this->show_progress); - std::unique_ptr built(new gbwt::DynamicGBWT()); - builder.swapIndex(*built); - return built; + + // Finish the construction and convert to compressed GBWT. + std::vector partial_indexes(jobs.size()); + #pragma omp parallel for schedule(dynamic, 1) + for (size_t i = 0; i < jobs.size(); i++) { + builders[i]->finish(); + partial_indexes[i] = gbwt::GBWT(builders[i]->index); + builders[i].reset(); + } + + // Merge the partial indexes. 
+ if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Merging the partial GBWTs" << std::endl; + } + } + std::unique_ptr result(new gbwt::GBWT(partial_indexes)); + partial_indexes.clear(); + + // Create the metadata. + // TODO: This is quite slow. + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "Creating metadata" << std::endl; + } + } + result->addMetadata(); + result->metadata.setContigs({ "unknown" }); + { + std::map> read_info; // name -> (sample id, fragment count) + for (auto& names : read_names) { + for (const std::string& name : names) { + size_t sample_id = 0, fragment_count = 0; + auto iter = read_info.find(name); + if (iter == read_info.end()) { + sample_id = read_info.size(); + read_info[name] = std::pair(sample_id, fragment_count); + } else { + sample_id = iter->second.first; + fragment_count = iter->second.second; + iter->second.second++; + } + result->metadata.addPath(sample_id, 0, 0, fragment_count); + } + names = std::vector(); + } + std::vector sample_names(read_info.size()); + for (auto& p : read_info) { + sample_names[p.second.first] = p.first; + } + read_info.clear(); + result->metadata.setSamples(sample_names); + result->metadata.setHaplotypes(sample_names.size()); + } + if (this->show_progress) { + #pragma omp critical + { + std::cerr << "GBWT: "; + gbwt::operator<<(std::cerr, result->metadata); + std::cerr << std::endl; + } + } + + // Restore the number of threads. + omp_set_num_threads(old_threads); + return result; } } diff --git a/src/haplotype_indexer.hpp b/src/haplotype_indexer.hpp index f555aec947b..ca212a94267 100644 --- a/src/haplotype_indexer.hpp +++ b/src/haplotype_indexer.hpp @@ -135,10 +135,15 @@ class HaplotypeIndexer : public Progressive { * the same name, the corresponding GBWT path names will have the same * sample identifier but different values in the count field. * - * aln_format can be "GAM" or "GAF" + * aln_format can be "GAM" or "GAF". 
+ *
+ * Runs approximately the given number of jobs in parallel. The exact
+ * number depends on the sizes of weakly connected components in the graph.
+ * Each job uses at most 2 threads.
  */
-    std::unique_ptr build_gbwt(const PathHandleGraph& graph,
-        const std::vector& aln_filenames, const std::string& aln_format) const;
+    std::unique_ptr build_gbwt(const HandleGraph& graph,
+        const std::vector& aln_filenames, const std::string& aln_format,
+        size_t parallel_jobs = 1) const;
 };
 
 }
diff --git a/src/subcommand/gbwt_main.cpp b/src/subcommand/gbwt_main.cpp
index 5e55928a4fe..4eca4e47a81 100644
--- a/src/subcommand/gbwt_main.cpp
+++ b/src/subcommand/gbwt_main.cpp
@@ -1389,8 +1389,8 @@ void step_1_build_gbwts(GBWTHandler& gbwts, GraphHandler& graphs, GBWTConfig& co
     if (config.show_progress) {
         std::cerr << "Input type: " << (config.gam_format ? "GAM" : "GAF") << std::endl;
     }
-    // FIXME: Parallelize this.
-    std::unique_ptr temp = config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF"));
+    std::unique_ptr temp =
+        config.haplotype_indexer.build_gbwt(*(graphs.path_graph), config.input_filenames, (config.gam_format ? "GAM" : "GAF"), config.build_jobs);
     gbwts.use(*temp);
 }
 
diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t
index 870b5d9674a..1046da788d3 100644
--- a/test/t/37_vg_gbwt.t
+++ b/test/t/37_vg_gbwt.t
@@ -52,9 +52,9 @@ rm -f x.ref.gbwt
 
 # Single chromosome: alignments
 vg paths -v x.vg -X -Q _alt > x.alts.gam
 vg convert -G x.alts.gam x.vg > x.alts.gaf
-vg gbwt -A -o x.alts.gaf.gbwt -x x.vg x.alts.gaf
+vg gbwt -A --num-jobs 1 -o x.alts.gaf.gbwt -x x.vg x.alts.gaf
 is $? 0 "chromosome X GAF with vg gbwt"
-vg gbwt -A --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam
+vg gbwt -A --num-jobs 1 --gam-format -o x.alts.gam.gbwt -x x.vg x.alts.gam
 is $? 0 "chromosome X GAM with vg gbwt"
 cmp x.alts.gaf.gbwt x.alts.gam.gbwt
 is $?
0 "identical construction results from GAF and GAM" From a585bbcffa162f08dbe2ae9a745e37dde8bb5595 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Sun, 3 Nov 2024 01:31:20 -0800 Subject: [PATCH 41/54] Faster metadata construction for GAM/GAF GBWT --- src/haplotype_indexer.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/haplotype_indexer.cpp b/src/haplotype_indexer.cpp index b7c935d7e05..75399a86e64 100644 --- a/src/haplotype_indexer.cpp +++ b/src/haplotype_indexer.cpp @@ -2,8 +2,9 @@ * \file haplotype_indexer.cpp: implementations of haplotype indexing with the GBWT */ +#include "haplotype_indexer.hpp" + #include -#include #include #include #include @@ -14,10 +15,9 @@ #include "gbwt_helper.hpp" -#include "haplotype_indexer.hpp" - -#include "path.hpp" #include "alignment.hpp" +#include "hash_map.hpp" +#include "path.hpp" using namespace std; @@ -442,7 +442,9 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap // GBWT construction. std::vector builder_mutexes(jobs.size()); std::vector> builders(jobs.size()); - std::vector> read_names(jobs.size()); // TODO: Concatenated strings + starting offsets may save space. + // This is a bit inefficient, as read names are often longer than the SSO threshold for GCC (but not for Clang). + // TODO: Maybe use concatenated 0-terminated names? + std::vector> read_names(jobs.size()); for (size_t i = 0; i < jobs.size(); i++) { builders[i].reset(new gbwt::GBWTBuilder(gbwt_node_width(graph), this->gbwt_buffer_size * gbwt::MILLION, this->id_interval)); } @@ -504,7 +506,6 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap partial_indexes.clear(); // Create the metadata. - // TODO: This is quite slow. 
if (this->show_progress) { #pragma omp critical { @@ -514,14 +515,15 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap result->addMetadata(); result->metadata.setContigs({ "unknown" }); { - std::map> read_info; // name -> (sample id, fragment count) + // We can use 32-bit values, as GBWT metadata uses them as well. + string_hash_map> read_info; // name -> (sample id, fragment count) for (auto& names : read_names) { for (const std::string& name : names) { - size_t sample_id = 0, fragment_count = 0; + std::uint32_t sample_id = 0, fragment_count = 0; auto iter = read_info.find(name); if (iter == read_info.end()) { sample_id = read_info.size(); - read_info[name] = std::pair(sample_id, fragment_count); + read_info[name] = std::make_pair(sample_id, fragment_count); } else { sample_id = iter->second.first; fragment_count = iter->second.second; @@ -535,7 +537,7 @@ std::unique_ptr HaplotypeIndexer::build_gbwt(const HandleGraph& grap for (auto& p : read_info) { sample_names[p.second.first] = p.first; } - read_info.clear(); + read_info = string_hash_map>(); result->metadata.setSamples(sample_names); result->metadata.setHaplotypes(sample_names.size()); } From c1b58030cd2bc4079ff12ea6ec441e3d19cc791f Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Mon, 4 Nov 2024 00:12:53 -0800 Subject: [PATCH 42/54] Tests for multi-chromosome GAM/GAF GBWT --- test/t/37_vg_gbwt.t | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/test/t/37_vg_gbwt.t b/test/t/37_vg_gbwt.t index 1046da788d3..a5849bc47a5 100644 --- a/test/t/37_vg_gbwt.t +++ b/test/t/37_vg_gbwt.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 149 +plan tests 155 # Build vg graphs for two chromosomes @@ -50,7 +50,7 @@ rm -f x.ref.gbwt # Single chromosome: alignments -vg paths -v x.vg -X -Q _alt > x.alts.gam +vg paths -x x.vg -X -Q _alt > x.alts.gam vg convert -G x.alts.gam x.vg > x.alts.gaf vg gbwt -A --num-jobs 1 -o x.alts.gaf.gbwt 
-x x.vg x.alts.gaf
 is $? 0 "chromosome X GAF with vg gbwt"
 
@@ -108,6 +108,25 @@ rm -f x.gbwt y.gbwt xy.merge.gbwt xy.fast.gbwt xy.parallel.gbwt xy.direct.gbwt x
 rm -f xy.1000gp.gbwt
 
 
+# Multiple chromosomes: alignments
+vg paths -x xy-alt.xg -X -Q _alt > xy.alts.gam
+vg convert -G xy.alts.gam xy.xg > xy.alts.gaf
+vg gbwt -A --num-jobs 1 -o xy.alts.gaf.gbwt -x xy.xg xy.alts.gaf
+is $? 0 "multi-chromosome GAF with vg gbwt"
+vg gbwt -A --num-jobs 1 --gam-format -o xy.alts.gam.gbwt -x xy.xg xy.alts.gam
+is $? 0 "multi-chromosome GAM with vg gbwt"
+cmp xy.alts.gaf.gbwt xy.alts.gam.gbwt
+is $? 0 "identical construction results from GAF and GAM"
+
+vg gbwt -A --num-jobs 2 -o multi.gbwt -x xy.xg xy.alts.gaf
+is $? 0 "multi-chromosome GAF with vg gbwt using multiple jobs"
+is $(vg gbwt -c xy.alts.gaf.gbwt) 58 "single job: 58 paths"
+is $(vg gbwt -c multi.gbwt) 58 "multiple jobs: 58 paths"
+
+rm -f xy.alts.gam xy.alts.gaf
+rm -f xy.alts.gaf.gbwt xy.alts.gam.gbwt multi.gbwt
+
+
 # Multiple chromosomes: paths as contigs
 vg gbwt -E -o xy.contigs.gbwt -x xy.xg
 is $?
0 "paths as contigs with vg gbwt" From 6957377dbfbe6a1a65567b964269994b663f14c2 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Mon, 4 Nov 2024 00:35:38 -0800 Subject: [PATCH 43/54] Maybe SDSL will now point to the right hash --- deps/sdsl-lite | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 115dfd5b737..4b4b1b8951b 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 115dfd5b7371ef99a1e0611389b956bb88bf9fc4 +Subproject commit 4b4b1b8951bf7fbdb5e050f98b5e61453964c167 From a4e7ddd41774d8431160490db26670514c3fa921 Mon Sep 17 00:00:00 2001 From: Jouni Siren Date: Mon, 4 Nov 2024 02:39:48 -0800 Subject: [PATCH 44/54] vg was apparently using another branch of SDSL --- deps/sdsl-lite | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/sdsl-lite b/deps/sdsl-lite index 4b4b1b8951b..ef23c5fe989 160000 --- a/deps/sdsl-lite +++ b/deps/sdsl-lite @@ -1 +1 @@ -Subproject commit 4b4b1b8951bf7fbdb5e050f98b5e61453964c167 +Subproject commit ef23c5fe9899f2b0afa53a32162ed0b06aff0e89 From 612d446061ce17ec14807cb883f33ba9f8895942 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 4 Nov 2024 17:49:53 -0500 Subject: [PATCH 45/54] Use toil-vg that uses vg gbwt to make gbwt --- vgci/vgci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vgci/vgci.sh b/vgci/vgci.sh index 9a5e53b08f3..ca04cfbe7b4 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0 # Should we show stdout and stderr from tests? If so, set to "-s". SHOW_OPT="" # What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@c9bd6414f935e6095574a41a34addbb8d87b41a6" +TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@d16da00b92c491f90433e151cb4f5a89a44395b8" # What toil should we install? 
# Could be something like "toil[aws,mesos]==3.20.0" # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]" From 8a12486abcefc7644f00180f2c4b29916c22384c Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 5 Nov 2024 10:51:24 -0500 Subject: [PATCH 46/54] Use toil-vg that knows about DI2 and uses outfile correctly --- vgci/vgci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vgci/vgci.sh b/vgci/vgci.sh index ca04cfbe7b4..c48ddbcf553 100755 --- a/vgci/vgci.sh +++ b/vgci/vgci.sh @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0 # Should we show stdout and stderr from tests? If so, set to "-s". SHOW_OPT="" # What toil-vg should we install? -TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@d16da00b92c491f90433e151cb4f5a89a44395b8" +TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@45782c7ba5a372e7c3587ac1c63f4895176fc828" # What toil should we install? # Could be something like "toil[aws,mesos]==3.20.0" # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]" From f846f72b6035922daf18e1f4f4b64cb38b55e2e3 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 5 Nov 2024 12:03:57 -0500 Subject: [PATCH 47/54] Use a gbwtgraph that can store subranges of haplotype fragment 0 --- deps/gbwtgraph | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/gbwtgraph b/deps/gbwtgraph index de61d340695..8819a8cf8f6 160000 --- a/deps/gbwtgraph +++ b/deps/gbwtgraph @@ -1 +1 @@ -Subproject commit de61d340695f64b8aa978816b195d6687e7fda5a +Subproject commit 8819a8cf8f66c22910a1167f548b944b0ac07ecf From 3da4e03b827444b86f65885fdf9622d8c9dc67c1 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 5 Nov 2024 12:42:56 -0500 Subject: [PATCH 48/54] Start rewriting subpath copying in terms of metadata --- src/algorithms/subgraph.cpp | 40 ++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git 
a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp index 2ea619ea952..b72d7f4c3f2 100644 --- a/src/algorithms/subgraph.cpp +++ b/src/algorithms/subgraph.cpp @@ -297,20 +297,54 @@ void extract_path_range(const PathPositionHandleGraph& source, path_handle_t pat /// in the source graph (unless the subpath was not cut up at all) void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph, bool subpath_naming) { - std::unordered_map > subpaths; + + // We want to organize all visits by base path. This key type holds the + // sense, sample and locus names, haplotype, and phase block. + using base_metadata_t = std::tuple; + + // This stores, for each source graph base path, for each start offset, the handle at that offset on the path. + std::unordered_map > subpaths; + + // This stores information about base paths that don't have subranges, and + // their full lengths, so we can avoid generating new subrange metadata + // when we just have all of a path. + std::unordered_map full_path_lengths; + subgraph.for_each_handle([&](const handle_t& h) { handlegraph::nid_t id = subgraph.get_id(h); if (source.has_node(id)) { handle_t handle = source.get_handle(id); source.for_each_step_position_on_handle(handle, [&](const step_handle_t& step, const bool& is_rev, const uint64_t& pos) { path_handle_t path = source.get_path_handle_of_step(step); - std::string path_name = source.get_path_name(path); - subpaths[path_name][pos] = is_rev ? 
subgraph.flip(h) : h; + // Figure out the base path this visit is on + base_metadata_t key = {source.get_sense(path), source.get_sample_name(path), source.get_contig_name(path), source.get_haplotype(path), source.get_phase_block(path)}; + // Figure out the subrange of the base path it is relative to + subrange_t path_subrange = source.get_subrange(path); + uint64_t visit_offset = pos; + if (path_subrange != PathMetadata:NO_SUBRANGE) { + // If we have the position relative to a subrange, adjust by that subrange's offset. + visit_offset += path_subrange.first; + } + subpaths[key][visit_offset] = is_rev ? subgraph.flip(h) : h; + + if (path_subrange == PathMetadata:NO_SUBRANGE) { + // There's no subrange set, so this path is full-length in the source graph. + // See if we know of this path as a full-length path or not + auto it = full_path_lengths.find(key); + if (it == full_path_lengths.end()) { + // We haven't recorded its length yet, so do it. + full_path_lengths.emplace_hint(it, key, source.get_path_length(path)); + } + } return true; }); } }); + // TODO: Rewrite to find continuous subpath pieces and copy into the new graph with MutablePathMetadata::create_path + // But only use the subpaths if we aren't a complete full-length span of a base path. + // We will do this by extracting each contiguous subrange into a vector of handles, then computing metadata for it based on its length and whether it is a full-length span of a path that was a full path in the base graph, and then creating it and filling it in. 
+ function new_subpath = [&subgraph](const string& path_name, bool is_circular, size_t subpath_offset) { PathSense sense; From a7c3d074397f96fc2d8de8eae08f3c4b3c35f27b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 5 Nov 2024 18:45:13 -0500 Subject: [PATCH 49/54] Rewrite path chunking to use subpaths all the time and remove the old reference splicing code --- src/algorithms/subgraph.cpp | 146 ++++++++--------- src/algorithms/subgraph.hpp | 16 +- src/chunker.cpp | 295 +++++++--------------------------- src/chunker.hpp | 6 +- src/subcommand/chunk_main.cpp | 2 +- 5 files changed, 146 insertions(+), 319 deletions(-) diff --git a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp index b72d7f4c3f2..86fac693ba1 100644 --- a/src/algorithms/subgraph.cpp +++ b/src/algorithms/subgraph.cpp @@ -1,5 +1,6 @@ #include "subgraph.hpp" #include "../path.hpp" +#include "../crash.hpp" namespace vg { namespace algorithms { @@ -290,13 +291,7 @@ void extract_path_range(const PathPositionHandleGraph& source, path_handle_t pat } } -/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph -/// based on their order in the path position index provided by the source graph -/// will clear any path found in both graphs before writing the new steps into it -/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset -/// in the source graph (unless the subpath was not cut up at all) -void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph, - bool subpath_naming) { +void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph) { // We want to organize all visits by base path. This key type holds the // sense, sample and locus names, haplotype, and phase block. 
@@ -306,9 +301,9 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath std::unordered_map > subpaths; // This stores information about base paths that don't have subranges, and - // their full lengths, so we can avoid generating new subrange metadata - // when we just have all of a path. - std::unordered_map full_path_lengths; + // their full lengths and circularity flags, so we can avoid generating new + // subrange metadata when we just have all of a path. + std::unordered_map> full_path_info; subgraph.for_each_handle([&](const handle_t& h) { handlegraph::nid_t id = subgraph.get_id(h); @@ -317,23 +312,23 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath source.for_each_step_position_on_handle(handle, [&](const step_handle_t& step, const bool& is_rev, const uint64_t& pos) { path_handle_t path = source.get_path_handle_of_step(step); // Figure out the base path this visit is on - base_metadata_t key = {source.get_sense(path), source.get_sample_name(path), source.get_contig_name(path), source.get_haplotype(path), source.get_phase_block(path)}; + base_metadata_t key = {source.get_sense(path), source.get_sample_name(path), source.get_locus_name(path), source.get_haplotype(path), source.get_phase_block(path)}; // Figure out the subrange of the base path it is relative to subrange_t path_subrange = source.get_subrange(path); uint64_t visit_offset = pos; - if (path_subrange != PathMetadata:NO_SUBRANGE) { + if (path_subrange != PathMetadata::NO_SUBRANGE) { // If we have the position relative to a subrange, adjust by that subrange's offset. visit_offset += path_subrange.first; } subpaths[key][visit_offset] = is_rev ? subgraph.flip(h) : h; - if (path_subrange == PathMetadata:NO_SUBRANGE) { + if (path_subrange == PathMetadata::NO_SUBRANGE) { // There's no subrange set, so this path is full-length in the source graph. 
// See if we know of this path as a full-length path or not - auto it = full_path_lengths.find(key); - if (it == full_path_lengths.end()) { - // We haven't recorded its length yet, so do it. - full_path_lengths.emplace_hint(it, key, source.get_path_length(path)); + auto it = full_path_info.find(key); + if (it == full_path_info.end()) { + // We haven't recorded its length and circularity yet, so do it. + full_path_info.emplace_hint(it, key, std::make_pair(source.get_path_length(path), source.get_is_circular(path))); } } return true; @@ -341,66 +336,71 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath } }); - // TODO: Rewrite to find continuous subpath pieces and copy into the new graph with MutablePathMetadata::create_path - // But only use the subpaths if we aren't a complete full-length span of a base path. - // We will do this by extracting each contiguous subrange into a vector of handles, then computing metadata for it based on its length and whether it is a full-length span of a path that was a full path in the base graph, and then creating it and filling it in. + for (auto& base_and_visits : subpaths) { + // For each base path + const base_metadata_t& base_path_metadata = base_and_visits.first; + const auto& start_to_handle = base_and_visits.second; + // If we didn't put anything in the visit collection, it shouldn't be here. 
+ crash_unless(!start_to_handle.empty()); - function new_subpath = - [&subgraph](const string& path_name, bool is_circular, size_t subpath_offset) { - PathSense sense; - string sample; - string locus; - size_t haplotype; - size_t phase_block; - subrange_t subrange; - PathMetadata::parse_path_name(path_name, sense, sample, locus, haplotype, phase_block, subrange); - if (subrange == PathMetadata::NO_SUBRANGE) { - subrange.first = subpath_offset; - } else { - subrange.first += subpath_offset; - } - subrange.first = subpath_offset; - subrange.second = PathMetadata::NO_END_POSITION; - string subpath_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, subrange); - if (subgraph.has_path(subpath_name)) { - subgraph.destroy_path(subgraph.get_path_handle(subpath_name)); - } - return subgraph.create_path_handle(subpath_name, is_circular); - }; + // We're going to walk over all the visits and find contiguous runs + auto run_start = start_to_handle.begin(); + auto run_end = run_start; + size_t start_coordinate = run_start->first; + while (run_end != start_to_handle.end()) { + // Until we run out of runs + // Figure out where this node ends on the path + size_t stop_coordinate = run_end->first + subgraph.get_length(run_end->second); + + // Look ahead + ++run_end; - for (auto& subpath : subpaths) { - const std::string& path_name = subpath.first; - path_handle_t source_path_handle = source.get_path_handle(path_name); - // destroy the path if it exists - if (subgraph.has_path(path_name)) { - subgraph.destroy_path(subgraph.get_path_handle(path_name)); - } - // create a new path. 
give it a subpath name if the flag's on and its smaller than original - path_handle_t path; - if (!subpath_naming || subpath.second.size() == source.get_step_count(source_path_handle) || - subpath.second.empty()) { - path = subgraph.create_path_handle(path_name, source.get_is_circular(source_path_handle)); - } else { - path = new_subpath(path_name, source.get_is_circular(source_path_handle), subpath.second.begin()->first); - } - for (auto p = subpath.second.begin(); p != subpath.second.end(); ++p) { - const handle_t& handle = p->second; - if (p != subpath.second.begin() && subpath_naming) { - auto prev = p; - --prev; - const handle_t& prev_handle = prev->second; - // distance from map - size_t delta = p->first - prev->first; - // what the distance should be if they're contiguous depends on relative orienations - size_t cont_delta = subgraph.get_length(prev_handle); - if (delta != cont_delta) { - // we have a discontinuity! we'll make a new path can continue from there - assert(subgraph.get_step_count(path) > 0); - path = new_subpath(path_name, subgraph.get_is_circular(path), p->first); + if (run_end != start_to_handle.end() && run_end->first == stop_coordinate) { + // The next visit is still contiguous, so advance. + continue; + } + + // Otherwise we've reached a break in continuity. We have a + // contiguous run from run_start to run_end, visiting the subrange + // start_coordinate to stop_coordinate. + + // Find out if we cover a full source graph path. + subrange_t run_subrange = {start_coordinate, stop_coordinate}; + bool is_circular = false; + if (start_coordinate == 0) { + // We might be a full path + auto found_length_and_circularity = full_path_info.find(base_path_metadata); + if (found_length_and_circularity != full_path_info.end() && found_length_and_circularity->second.first == stop_coordinate) { + // We are a full path + run_subrange = PathMetadata::NO_SUBRANGE; + // We can be circular. 
+ is_circular = found_length_and_circularity->second.second; } } - //fill in the path information - subgraph.append_step(path, handle); + + // Make a path with all the metadata + path_handle_t new_path = subgraph.create_path( + std::get<0>(base_path_metadata), + std::get<1>(base_path_metadata), + std::get<2>(base_path_metadata), + std::get<3>(base_path_metadata), + std::get<4>(base_path_metadata), + run_subrange, + is_circular + ); + + for (auto it = run_start; it != run_end; ++it) { + // Copy the path's visits + subgraph.append_step(new_path, it->second); + } + + // Set up the next subpath. + // Set where it starts. + run_start = run_end; + if (run_start != start_to_handle.end()) { + // And if it will exist, set its start coordinate. + start_coordinate = run_start->first; + } } } } diff --git a/src/algorithms/subgraph.hpp b/src/algorithms/subgraph.hpp index fef9e174235..321baf20e40 100644 --- a/src/algorithms/subgraph.hpp +++ b/src/algorithms/subgraph.hpp @@ -34,13 +34,15 @@ void extract_id_range(const HandleGraph& source, const nid_t& id1, const nid_t& /// if end < 0, then it will walk to the end of the path void extract_path_range(const PathPositionHandleGraph& source, path_handle_t path_handle, int64_t start, int64_t end, MutableHandleGraph& subgraph); -/// add subpaths to the subgraph, providing a concatenation of subpaths that are discontiguous over the subgraph -/// based on their order in the path position index provided by the source graph -/// will clear any path found in both graphs before writing the new steps into it -/// if subpath_naming is true, a suffix will be added to each path in the subgraph denoting its offset -/// in the source graph (unless the subpath was not cut up at all) -void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph, - bool subpath_naming = false); +/// Add subpaths to the subgraph for all paths visiting its nodes in the source +/// graph. 
+/// +/// Always generates correct path metadata, and a path for each contiguous +/// fragment of any base path. Assumes the source graph does not contain any +/// overlapping path fragments on a given base path, and that the subgraph does +/// not already contain any paths on a base path also present in the source +/// graph. +void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePathHandleGraph& subgraph); /// We can accumulate a subgraph without accumulating all the edges between its nodes /// this helper ensures that we get the full set diff --git a/src/chunker.cpp b/src/chunker.cpp index 00fb6f6a3e9..5b1dad389a3 100644 --- a/src/chunker.cpp +++ b/src/chunker.cpp @@ -5,6 +5,7 @@ #include "algorithms/subgraph.hpp" #include "vg.hpp" #include "clip.hpp" +#include "crash.hpp" //#define debug @@ -22,16 +23,9 @@ PathChunker::~PathChunker() { void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_t length, bool forward_only, MutablePathMutableHandleGraph& subgraph, Region& out_region) { - // This method still depends on VG - // (not a super high priority to port, as calling can now be done at genome scale and we no longer - // have to chunk up paths) - VG* vg_subgraph = dynamic_cast(&subgraph); - if (vg_subgraph == nullptr) { - vg_subgraph = new VG(); - assert(subgraph.get_node_count() == 0); - } // extract our path range into the graph + // TODO: Handle incoming names with subranges. 
path_handle_t path_handle = graph->get_path_handle(region.seq); step_handle_t start_step = graph->get_step_at_position(path_handle, region.start); handle_t start_handle = graph->get_handle_of_step(start_step); @@ -53,250 +47,81 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_ if (graph->get_is_reverse(step_handle)) { step_handle = graph->flip(step_handle); } - if (!vg_subgraph->has_node(graph->get_id(step_handle))) { - vg_subgraph->create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); + if (!subgraph.has_node(graph->get_id(step_handle))) { + subgraph.create_handle(graph->get_sequence(step_handle), graph->get_id(step_handle)); } }; // expand the context and get path information // if forward_only true, then we only go forward. if (context > 0) { - algorithms::expand_subgraph_by_steps(*graph, *vg_subgraph, context, forward_only); + algorithms::expand_subgraph_by_steps(*graph, subgraph, context, forward_only); } if (length > 0) { - algorithms::expand_subgraph_by_length(*graph, *vg_subgraph, context, forward_only); + algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only); } else if (context == 0 && length == 0) { - algorithms::add_connecting_edges_to_subgraph(*graph, *vg_subgraph); + algorithms::add_connecting_edges_to_subgraph(*graph, subgraph); } - algorithms::add_subpaths_to_subgraph(*graph, *vg_subgraph, true); - - // merge back our reference path to use the old chopping code - // todo: work with subpaths somehow? 
- if (!vg_subgraph->has_path(region.seq)) { - map ref_subpaths; - vg_subgraph->for_each_path_handle([&](path_handle_t path_handle) { - string path_name = vg_subgraph->get_path_name(path_handle); - subrange_t subrange; - path_name = Paths::strip_subrange(path_name, &subrange); - if (subrange != PathMetadata::NO_SUBRANGE && path_name == region.seq) { - ref_subpaths[subrange.first] = path_handle; - } - }); - path_handle_t new_ref_path = vg_subgraph->create_path_handle(region.seq, graph->get_is_circular(path_handle)); - for (auto& ref_subpath : ref_subpaths) { - vg_subgraph->for_each_step_in_path(ref_subpath.second, [&] (step_handle_t subpath_step) { - vg_subgraph->append_step(new_ref_path, vg_subgraph->get_handle_of_step(subpath_step)); - }); - vg_subgraph->destroy_path(ref_subpath.second); - } - } - - // build the vg of the subgraph - vg_subgraph->remove_orphan_edges(); - - // get our range endpoints before context expansion - list& mappings = vg_subgraph->paths.get_path(region.seq); - assert(!mappings.empty()); - size_t mappings_size = mappings.size(); - int64_t input_start_node = graph->get_id(start_handle); - int64_t input_end_node = graph->get_id(end_handle); - -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "Path range in expanded subgraph is " << *mappings.begin() << "-" << *mappings.rbegin() << endl; - } -#endif - - // replaces old xg position_in_path() to check node counts in path - function(const PathHandleGraph&, handle_t, path_handle_t)> path_steps_of_handle = - [] (const PathHandleGraph& graph, handle_t handle, path_handle_t path_handle) { - vector node_steps = graph.steps_of_handle(handle); - vector node_path_steps; - for (auto step : node_steps) { - if (graph.get_path_handle_of_step(step) == path_handle) { - node_path_steps.push_back(step); - } - } - return node_path_steps; - }; - - // we have no direct way of getting our steps out of the subgraph, so we - // go through node ids. the problem is that cycles can introduce - // ambiguity. 
we check for that here (only to punt on it later) - vector start_node_path_steps = path_steps_of_handle(*graph, start_handle, path_handle); - vector end_node_path_steps = path_steps_of_handle(*graph, end_handle, path_handle); - bool end_points_on_cycle = start_node_path_steps.size() > 1 || end_node_path_steps.size() > 1; - - // keep track of the edges in our original path - set, pair>> path_edge_set = - // walking out with the context length (as supported below) won't always work as expansion - // can grab an arbitrary amount of path regardless of context. so we load up the entire path: - // (todo: could sniff out limits from subgraph...) - get_path_edge_index(graph->path_begin(path_handle), graph->path_back(path_handle), std::max(context, length)); - - // the distance between them and the nodes in our input range - size_t left_padding = 0; - size_t right_padding = 0; - // do we need to rewrite back to our graph? - bool rewrite_paths = false; + algorithms::add_subpaths_to_subgraph(*graph, subgraph); + + // Find the lowest and highest offsets visited by any subpath of the target path we extracted on. + PathSense sense = graph->get_sense(path_handle); + std::string sample = graph->get_sample_name(path_handle); + std::string locus = graph->get_locus_name(path_handle); + size_t haplotype = graph->get_haplotype(path_handle); + size_t phase_block = graph->get_phase_block(path_handle); - if (!end_points_on_cycle) { - // start and end of our expanded chunk - auto start_it = mappings.begin(); - auto end_it = --mappings.end(); - - // find our input range in the expanded path. we know these nodes only appear once. - for (; start_it != mappings.end() && start_it->node_id() != input_start_node; ++start_it); - for (; end_it != mappings.begin() && end_it->node_id() != input_end_node; --end_it); - - // walk back our start point as we can without rank discontinuities. 
doesn't matter - // if we encounter cycles here, because we keep a running path length - auto cur_it = start_it; - auto prev_it = cur_it; - if (prev_it != mappings.begin()) { - for (; prev_it != mappings.begin(); --prev_it) { - cur_it = prev_it; - --cur_it; - handle_t prev_handle = vg_subgraph->get_handle(prev_it->node_id(), - prev_it->is_reverse()); - handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), - cur_it->is_reverse()); - edge_t edge = vg_subgraph->edge_handle(cur_handle, prev_handle); - if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), - make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "found discontinuity between when left scanning path in subgraph: " << *cur_it << " and " << *prev_it << endl; - - } -#endif - break; - } - left_padding += cur_it->length; - } - } - start_it = prev_it; - // walk forward the end point - cur_it = end_it; - prev_it = cur_it; - for (++cur_it; cur_it != mappings.end(); ++prev_it, ++cur_it) { - handle_t prev_handle = vg_subgraph->get_handle(prev_it->node_id(), - prev_it->is_reverse()); - handle_t cur_handle = vg_subgraph->get_handle(cur_it->node_id(), - cur_it->is_reverse()); - edge_t edge = vg_subgraph->edge_handle(prev_handle, cur_handle); - if (!path_edge_set.count(make_pair(make_pair(vg_subgraph->get_id(edge.first), vg_subgraph->get_is_reverse(edge.first)), - make_pair(vg_subgraph->get_id(edge.second), vg_subgraph->get_is_reverse(edge.second))))) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "found discontinuity between when right scanning path in subgraph: " << *prev_it << " and " << *cur_it << endl; - } -#endif - break; - } - right_padding += cur_it->length; - } - end_it = prev_it; + // Find the outer bounds of selected subpaths of the target path + size_t min_start = std::numeric_limits::max(); + size_t max_end = 0; - rewrite_paths = 
start_it != mappings.begin() || end_it != --mappings.end(); - - // cut out nodes before and after discontinuity - mappings.erase(mappings.begin(), start_it); - mappings.erase(++end_it, mappings.end()); - } - // We're clipping at a cycle in the reference path. Just preserve the path as-is from the - // input region. - else { - mappings.clear(); - for (step_handle_t step = start_step; step != end_plus_one_step; step = graph->get_next_step(step)) { - handle_t step_handle = graph->get_handle_of_step(step); - mapping_t mapping; - mapping.set_node_id(graph->get_id(step_handle)); - mapping.set_is_reverse(graph->get_is_reverse(step_handle)); - mappings.push_back(mapping); + subgraph.for_each_path_matching({sense}, {sample}, {locus}, [&](const path_handle_t subpath) { + if (subgraph.get_haplotype(subpath) != haplotype || subgraph.get_phase_block(subpath) != phase_block) { + // Skip this subpath since it's not the right phase/fragment + return true; } - rewrite_paths = true; - } - // Cut our graph so that our reference path end points are graph tips. This will let the - // snarl finder use the path to find telomeres. 
- path_handle_t sg_path_handle = vg_subgraph->get_path_handle(region.seq); - Node* start_node = vg_subgraph->get_node(mappings.begin()->node_id()); - auto sg_start_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(start_node->id()), sg_path_handle); - if (rewrite_paths && sg_start_steps.size() == 1) { - if (!mappings.begin()->is_reverse() && vg_subgraph->start_degree(start_node) != 0) { - for (auto edge : vg_subgraph->edges_to(start_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); - } - } else if (mappings.begin()->is_reverse() && vg_subgraph->end_degree(start_node) != 0) { - for (auto edge : vg_subgraph->edges_from(start_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in order to make path start a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); - } + subrange_t subpath_subrange = subgraph.get_subrange(subpath); + if (subpath_subrange == PathMetadata::NO_SUBRANGE) { + // Fill in a 0 start + subpath_subrange.first = 0; } - } - Node* end_node = vg_subgraph->get_node(mappings.rbegin()->node_id()); - auto sg_end_steps = path_steps_of_handle(*vg_subgraph, vg_subgraph->get_handle(end_node->id()), sg_path_handle); - if (rewrite_paths && sg_end_steps.size() == 1) { - if (!mappings.rbegin()->is_reverse() && vg_subgraph->end_degree(end_node) != 0) { - for (auto edge : vg_subgraph->edges_from(end_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in order to make path end a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); - } - } else if (mappings.rbegin()->is_reverse() && vg_subgraph->start_degree(end_node) != 0) { - for (auto edge : vg_subgraph->edges_to(end_node)) { -#ifdef debug -#pragma omp critical(cerr) - { - cerr << "clipping out edge " << pb2json(*edge) << " in 
order to make path end a tip" << endl; - } -#endif - vg_subgraph->destroy_edge(edge); + if (subpath_subrange.second == PathMetadata::NO_END_POSITION) { + // Compute a length and use that to get the end. + // TODO: Sniff for an efficient/available get_path_length. + size_t path_length = 0; + for (handle_t handle : subgraph.scan_path(subpath)) { + path_length += subgraph.get_length(handle); } + subpath_subrange.second = subpath_subrange.first + path_length; } - } - - // Sync our updated paths lists back into the Graph protobuf - if (rewrite_paths) { - vg_subgraph->paths.rebuild_node_mapping(); - vg_subgraph->paths.rebuild_mapping_aux(); - vg_subgraph->graph.clear_path(); - vg_subgraph->paths.to_graph(vg_subgraph->graph); - } - // copy back out of vg if necessary - if (dynamic_cast(&subgraph) == nullptr) { - handlealgs::copy_path_handle_graph(vg_subgraph, &subgraph); - delete vg_subgraph; + // Min/max in the subrange bounds + min_start = std::min(min_start, subpath_subrange.first); + max_end = std::max(max_end, subpath_subrange.second); + + return true; + }); + + // TODO: We assume we actually found some of the target path + crash_unless(min_start != std::numeric_limits::max()); + + // Hackily remove source path subrange offsets if any + subrange_t source_subrange = graph->get_subrange(path_handle); + if (source_subrange != PathMetadata::NO_SUBRANGE) { + // If we requested something on this path region, we can't handle + // finding part of an earlier path region. + // TODO: Handle it. + crash_unless(min_start <= source_subrange.first); + min_start -= source_subrange.first; + max_end -= source_subrange.first; } - // start could fall inside a node. we find out where in the path the - // 0-offset point of the node is. - int64_t input_start_pos = graph->get_position_of_step(start_step); - int64_t input_end_pos = graph->get_position_of_step(end_step); + // Produce the output region. 
out_region.seq = region.seq; - out_region.start = input_start_pos - left_padding; - out_region.end = input_end_pos + graph->get_length(end_handle) + right_padding - 1; + out_region.start = min_start; + out_region.end = max_end; } void PathChunker::extract_snarls(const Region& region, SnarlManager& snarl_manager, MutablePathMutableHandleGraph& subgraph) { @@ -350,7 +175,7 @@ void PathChunker::extract_snarls(const Region& region, SnarlManager& snarl_manag algorithms::add_connecting_edges_to_subgraph(*graph, subgraph); // now fill in the paths - algorithms::add_subpaths_to_subgraph(*graph, subgraph, true); + algorithms::add_subpaths_to_subgraph(*graph, subgraph); } void PathChunker::extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region) { @@ -361,18 +186,18 @@ void PathChunker::extract_path_component(const string& path_name, MutablePathMut path_ids.insert(graph->get_id(handle)); } - extract_component(path_ids, subgraph, true); + extract_component(path_ids, subgraph); out_region.seq = path_name; } -void PathChunker::extract_component(const unordered_set& node_ids, MutablePathMutableHandleGraph& subgraph, bool subpath_naming) { +void PathChunker::extract_component(const unordered_set& node_ids, MutablePathMutableHandleGraph& subgraph) { for (nid_t node_id : node_ids) { subgraph.create_handle(graph->get_sequence(graph->get_handle(node_id)), node_id); } algorithms::expand_subgraph_by_steps(*graph, subgraph, numeric_limits::max()); - algorithms::add_subpaths_to_subgraph(*graph, subgraph, subpath_naming); + algorithms::add_subpaths_to_subgraph(*graph, subgraph); } void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int64_t context, int64_t length, @@ -389,7 +214,7 @@ void PathChunker::extract_id_range(vg::id_t start, vg::id_t end, int64_t context if (length) { algorithms::expand_subgraph_by_length(*graph, subgraph, context, forward_only); } - algorithms::add_subpaths_to_subgraph(*graph, subgraph, true); + 
algorithms::add_subpaths_to_subgraph(*graph, subgraph); // build the vg out_region.start = subgraph.min_node_id(); diff --git a/src/chunker.hpp b/src/chunker.hpp index b9d88aeb60d..75923cd91aa 100644 --- a/src/chunker.hpp +++ b/src/chunker.hpp @@ -49,14 +49,14 @@ class PathChunker { MutablePathMutableHandleGraph& subgraph); /** - * Extract a connected component containing a given path + * Extract a connected component containing a given path. Processes path metadata and creates subpaths. */ void extract_path_component(const string& path_name, MutablePathMutableHandleGraph& subgraph, Region& out_region); /** - * Extract a connected component starting from an id set + * Extract a connected component starting from an id set. Processes path metadata and creates subpaths. */ - void extract_component(const unordered_set& node_ids, MutablePathMutableHandleGraph& subgraph, bool subpath_naming); + void extract_component(const unordered_set& node_ids, MutablePathMutableHandleGraph& subgraph); /** * Like above, but use (inclusive) id range instead of region on path. 
diff --git a/src/subcommand/chunk_main.cpp b/src/subcommand/chunk_main.cpp index 57397ed6713..970b75d35b6 100644 --- a/src/subcommand/chunk_main.cpp +++ b/src/subcommand/chunk_main.cpp @@ -726,7 +726,7 @@ int main_chunk(int argc, char** argv) { map trace_thread_frequencies; if (!component_ids.empty()) { subgraph = vg::io::new_output_graph(output_format); - chunker.extract_component(component_ids[i], *subgraph, false); + chunker.extract_component(component_ids[i], *subgraph); output_regions[i] = region; } else if (id_range == false) { From 5f5514b69df54a5be9eece03f1d88ac6c8620283 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 6 Nov 2024 12:08:37 -0500 Subject: [PATCH 50/54] Adjust unit tests to match idea of noting the expanded extracted visit to a cycle --- src/chunker.cpp | 21 +++++++++++++++++++-- src/chunker.hpp | 9 ++++++--- src/unittest/chunker.cpp | 18 ++++++++++++++++++ 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/chunker.cpp b/src/chunker.cpp index 5b1dad389a3..eb76a3cd09d 100644 --- a/src/chunker.cpp +++ b/src/chunker.cpp @@ -64,6 +64,13 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_ } algorithms::add_subpaths_to_subgraph(*graph, subgraph); + // Now we need to figure out how we expanded the target path region we + // asked for. + // + // We don't just want the lowest and highest bounds of any subpath, we want + // the lowest and highest bound of the subpaths that actually overlap the + // targeted region. + // Find the lowest and highest offsets visited by any subpath of the target path we extracted on. 
PathSense sense = graph->get_sense(path_handle); std::string sample = graph->get_sample_name(path_handle); @@ -97,6 +104,13 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_ subpath_subrange.second = subpath_subrange.first + path_length; } + if (subpath_subrange.first > region.end || subpath_subrange.second <= region.start) { + // This subpath doesn't actually overlap the selected target path + // region (which is 0-based, end-inclusive), and so shouldn't count + // for extending the selected region along the target path. + return true; + } + // Min/max in the subrange bounds min_start = std::min(min_start, subpath_subrange.first); max_end = std::max(max_end, subpath_subrange.second); @@ -118,10 +132,13 @@ void PathChunker::extract_subgraph(const Region& region, int64_t context, int64_ max_end -= source_subrange.first; } - // Produce the output region. + // We can't represent a region with a 0 end-exclusive coordinate. + crash_unless(max_end != 0); + + // Produce the output region. Convert coordinates to be 0-based, end-inclusive. out_region.seq = region.seq; out_region.start = min_start; - out_region.end = max_end; + out_region.end = max_end - 1; } void PathChunker::extract_snarls(const Region& region, SnarlManager& snarl_manager, MutablePathMutableHandleGraph& subgraph) { diff --git a/src/chunker.hpp b/src/chunker.hpp index 75923cd91aa..56922100613 100644 --- a/src/chunker.hpp +++ b/src/chunker.hpp @@ -31,9 +31,12 @@ class PathChunker { /** Extract subgraph corresponding to given path region into its * own vg graph, and send it to out_stream. The boundaries of the - * extracted graph (which can be different because we expand context and don't - * cut nodes) are written to out_region. If forward_only set, context - * is only expanded in the forward direction + * extracted region of the target path (which can be different because we + * expand context and don't cut nodes) are written to out_region. 
If the + * target path goes through the extracted region multiple times, only the + * extended bounds of the visit containing the target region are produced. + * + * If forward_only set, context is only expanded in the forward direction * * NOTE: we follow convention of Region coordinates being 0-based * inclusive. diff --git a/src/unittest/chunker.cpp b/src/unittest/chunker.cpp index 91326aa1ca9..24f7d3b645a 100644 --- a/src/unittest/chunker.cpp +++ b/src/unittest/chunker.cpp @@ -186,10 +186,28 @@ TEST_CASE("basic graph chunking", "[chunk]") { VG subgraph; Region out_region; chunker.extract_subgraph(region, 1, 0, false, subgraph, out_region); + + // We include node 4 in the cycle, and also node 3 which is 1 away, so + // we include all the loops arounf the cycle and need to start the + // extracted path region where it enters node 3 at base 6. REQUIRE(subgraph.node_count() == 7); REQUIRE(subgraph.edge_count() == 9); + REQUIRE(out_region.start == 6); + + } + + SECTION("Partial graph via cyclic path, 0 expansion") { + + Region region = {"z", 35, 36}; + VG subgraph; + Region out_region; + chunker.extract_subgraph(region, 0, 0, false, subgraph, out_region); + + REQUIRE(subgraph.node_count() == 1); + REQUIRE(subgraph.edge_count() == 0); REQUIRE(out_region.start == 31); + REQUIRE(out_region.end == 36); // End is inclusive } From 2d04e6a0354ddf097c425c2efeac5ec05ba7e28a Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 6 Nov 2024 12:36:43 -0500 Subject: [PATCH 51/54] Stop retaining paths when clearing vg::VG --- src/algorithms/subgraph.cpp | 3 +++ src/subcommand/find_main.cpp | 6 ++++-- src/vg.cpp | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp index 86fac693ba1..5ff7d275a94 100644 --- a/src/algorithms/subgraph.cpp +++ b/src/algorithms/subgraph.cpp @@ -305,7 +305,10 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath // subrange metadata when we just have all 
of a path. std::unordered_map> full_path_info; + size_t subgraph_handles = 0; + subgraph.for_each_handle([&](const handle_t& h) { + subgraph_handles++; handlegraph::nid_t id = subgraph.get_id(h); if (source.has_node(id)) { handle_t handle = source.get_handle(id); diff --git a/src/subcommand/find_main.cpp b/src/subcommand/find_main.cpp index 48ebda09a6a..2e1fa0355d2 100644 --- a/src/subcommand/find_main.cpp +++ b/src/subcommand/find_main.cpp @@ -1,4 +1,5 @@ #include "subcommand.hpp" +#include "../crash.hpp" #include "../utility.hpp" #include "../mapper.hpp" #include @@ -683,8 +684,9 @@ int main_find(int argc, char** argv) { ofstream out(s.str().c_str()); vg::io::save_handle_graph(&graph, out); out.close(); - // reset our graph - dynamic_cast(graph).clear(); + // reset our graph so it has no nodes or paths anymore + graph.clear(); + crash_unless(graph.get_path_count() == 0); } if (subgraph_k) { prep_graph(); // don't forget to prep the graph, or the kmer set will be wrong[ diff --git a/src/vg.cpp b/src/vg.cpp index bdd81778d4e..cb43cf7e26d 100644 --- a/src/vg.cpp +++ b/src/vg.cpp @@ -631,6 +631,7 @@ void VG::destroy_edge(const handle_t& left, const handle_t& right) { } void VG::clear() { + clear_paths(); graph.mutable_node()->Clear(); graph.mutable_edge()->Clear(); clear_indexes(); From 68d2e7394038fe7bb58988a19192615536011146 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 6 Nov 2024 12:37:38 -0500 Subject: [PATCH 52/54] Stop counting subgraph handles --- src/algorithms/subgraph.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/algorithms/subgraph.cpp b/src/algorithms/subgraph.cpp index 5ff7d275a94..86fac693ba1 100644 --- a/src/algorithms/subgraph.cpp +++ b/src/algorithms/subgraph.cpp @@ -305,10 +305,7 @@ void add_subpaths_to_subgraph(const PathPositionHandleGraph& source, MutablePath // subrange metadata when we just have all of a path. 
std::unordered_map> full_path_info; - size_t subgraph_handles = 0; - subgraph.for_each_handle([&](const handle_t& h) { - subgraph_handles++; handlegraph::nid_t id = subgraph.get_id(h); if (source.has_node(id)) { handle_t handle = source.get_handle(id); From 3e5857c14b41be22e200d7f2f8b0d5bd778e1ca4 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 6 Nov 2024 12:44:48 -0500 Subject: [PATCH 53/54] Use the same subpath-free input graph for deconstruct as before --- test/t/26_deconstruct.t | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/t/26_deconstruct.t b/test/t/26_deconstruct.t index 03c7f8f3a87..492c07a81c2 100644 --- a/test/t/26_deconstruct.t +++ b/test/t/26_deconstruct.t @@ -90,7 +90,9 @@ printf "P\ty\t1+,3+,5+,6+,8+,9+,11+,12+,9+,10+,12+,14+,15+\t8M,1M,1M,3M,1M,19M,1 vg view -Fv cyclic_tiny.gfa > cyclic_tiny.vg vg index cyclic_tiny.vg -x cyclic_tiny.xg vg find -x cyclic_tiny.xg -n 10 -n 11 -n 12 -n 13 -n 14 -n 15 -c 1 > cycle.vg -vg index cycle.vg -x cycle.xg +# TODO: Make deconstruct see through subpaths to the base path +vg view cycle.vg | sed 's/\([xy]\)\[[-0-9]*\]/\1/g' >cycle-asfullpaths.gfa +vg index cycle-asfullpaths.gfa -x cycle.xg vg deconstruct cycle.xg -p y -e -t 1 > cycle_decon.vcf is $(grep -v "#" cycle_decon.vcf | wc -l) 1 "cyclic reference deconstruction has correct number of variants" grep -v "#" cycle_decon.vcf | grep 20 | awk '{print $1 "\t" $2 "\t" $4 "\t" $5 "\t" $10}' > cycle_decon.tsv From 184019446fb1cebf28602f8c1b4a350080f28b12 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 6 Nov 2024 12:48:09 -0500 Subject: [PATCH 54/54] Make chunk tests tolerate a subpath end on the extracted x subpath --- test/t/30_vg_chunk.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/t/30_vg_chunk.t b/test/t/30_vg_chunk.t index 142de98066a..48e52ffdb93 100644 --- a/test/t/30_vg_chunk.t +++ b/test/t/30_vg_chunk.t @@ -66,8 +66,8 @@ is $(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq .node | grep id | 
wc -l # Check that traces work on a GBWT is $(vg chunk -x x.xg -G x.gbwt -r 1:1 -c 2 -T | vg view - -j | jq .node | grep id | wc -l) 5 "id chunker traces correct chunk size" -is "$(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name != "x[0]")' | wc -l)" 0 "chunker extracts no threads from an empty gPBWT" -is "$(vg chunk -x x.xg -G x.haps.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name != "x[0]")' | wc -l)" 2 "chunker extracts 2 local threads from a gBWT with 2 locally distinct threads in it" +is "$(vg chunk -x x.xg -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name | startswith("x[0") | not)' | wc -l)" 0 "chunker extracts no threads from an empty gPBWT" +is "$(vg chunk -x x.xg -G x.haps.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -c '.path[] | select(.name | startswith("x[0") | not)' | wc -l)" 2 "chunker extracts 2 local threads from a gBWT with 2 locally distinct threads in it" is "$(vg chunk -x x.xg -G x.gbwt -r 1:1 -c 2 -T | vg view - -j | jq -r '.path[] | select(.name == "thread_0") | .mapping | length')" 3 "chunker can extract a partial haplotype from a GBWT" is "$(vg chunk -x x.gbz -r 1:1 -c 2 -T | vg view - -j | jq -r '.path[] | select(.name == "thread_0") | .mapping | length')" 3 "chunker can extract a partial haplotype from a GBZ" is "$(vg chunk -x x.gbz -r 1:1 -c 2 -T --no-embedded-haplotypes | vg view - -j | jq -r '.path[] | select(.name == "thread_0") | .mapping | length')" "" "chunker doesn't see haplotypes in the GBZ if asked not to"