From bc051d26718047d3960f283d2730bea2c70302ed Mon Sep 17 00:00:00 2001 From: Ekaterina Kazantseva Date: Wed, 11 Sep 2024 17:15:20 +0400 Subject: [PATCH] ref --- strainy/graph_operations/asm_graph_ops.py | 59 ++++++++- strainy/graph_operations/gfa_ops.py | 10 +- strainy/graph_operations/overlap_graph_ops.py | 124 ++++++++++++------ 3 files changed, 146 insertions(+), 47 deletions(-) diff --git a/strainy/graph_operations/asm_graph_ops.py b/strainy/graph_operations/asm_graph_ops.py index b1ccb3d..50a62fd 100644 --- a/strainy/graph_operations/asm_graph_ops.py +++ b/strainy/graph_operations/asm_graph_ops.py @@ -6,11 +6,11 @@ """ This contains functions for operation with assembly graph: -1. add_child_edge: adds a child unitig with the same sequence as the father unitig or with the given sequence -(aff full path and othr clusters" -2. add_path_links: Adds link ("full path) -3.change_cov: recalculate coverage -4.change_sec" recalculate sequence +1. add_child_edge: adds a child unitig with the same sequence as the father unitig or with the given sequence #TODO +2. add_path_links: Adds GFA links between newly created unitigs that form a "full path." +3. change_cov: recalculate coverage of parent unitig #TODO change coverage +4. change_sec: recalculate sequence of parent unitig +5. strong_tail: determines whether a cluster has strong coverage at both the start and end of a segment """ @@ -23,6 +23,15 @@ def add_child_edge(edge, clN, g, cl, left, right, cons, flye_consensus, change_seq=True, insertmain=True): """ Adds a child unitig with the same sequence as the parental unitig or with the given sequence + This function uses `flye_consensus` to compute the consensus sequence for the unitig and constructs + the new unitig based on the provided `left` and `right` coordinates. The function then creates a new edge + in the graph using `gfa_ops.add_edge` and assigns the appropriate sequence (either the computed consensus + or the parental sequence) to the new unitig. + Returns: new_line (Edge): The newly created edge representing the child unitig in the graph. + Notes: + - If `change_seq` is set to `True`, the sequence of the child unitig is replaced by the consensus + sequence computed by `flye_consensus`. If `False`, the parent unitig's sequence is retained. + - This function is designed to work with the `gfa_ops.add_edge` function to handle edge insertion in the GFA graph. """ ##TODO if cons provided change_seq=True (provide seq not consensus) ##TODO make separare function to add gfa edge and move to gfa_ops @@ -57,9 +66,15 @@ def add_child_edge(edge, clN, g, cl, left, right, cons, flye_consensus, change_s ) + + def add_path_links(graph, edge, paths): """ - Add gfa links between newly created unitigs forming "full path" + Adds GFA links between newly created unitigs that form a "full path." + This function iterates through the provided paths (each representing a series of unitigs) and creates + links between consecutive unitigs. These links are added to the graph using the `gfa_ops.add_link` + function, which represents the connections between unitigs in a GFA. + Returns: None: This function modifies the graph in place by adding links between unitigs. """ for path in paths: for i in range(0, len(path) - 1): @@ -67,7 +82,17 @@ def add_path_links(graph, edge, paths): + def change_cov(g, edge, cons, ln, clusters, othercl, remove_clusters): + """ + Updates the coverage of a parent segment in the GFA graph based on the consensus of specified clusters + (that weren't created into new unitigs). + This function recalculates the coverage of a segment (`edge`) by considering the coverage of the specified + clusters (`othercl`). If the proportion of the segment covered by these clusters is below a threshold + (`parental_min_len`), the segment is marked for removal. The segment's coverage is then updated in the GFA + graph. + """ + #TODO change coverage, not return it cov = 0 len_cl = [] for i in othercl: @@ -82,7 +107,18 @@ def change_cov(g, edge, cons, ln, clusters, othercl, remove_clusters): return cov + + def change_sec(g, edge, othercl, cl,SNP_pos, data, cut = True): + """ + Updates the sequence of a parent unitig in the GFA graph based on the consensus from specified clusters + (that weren't created into new unitigs). + This function modifies the sequence of a segment (`edge`) in the GFA graph by applying the consensus + sequence of other clusters (`othercl`). The consensus is computed using SNP positions and read data, + and the segment sequence is updated accordingly. + Returns: + None: The function modifies the graph in place, updating the sequence of the specified segment. + """ temp = {} other_cl = cl for cluster in othercl: @@ -101,7 +137,18 @@ def change_sec(g, edge, othercl, cl,SNP_pos, data, cut = True): i.sequence=''.join(seq) + + def strong_tail(cluster, cl, ln, data): + """ + Determines whether a cluster has strong coverage at both the start and end of a segment. + This function checks the reads associated with a cluster to determine if there is strong coverage + near the start and end of a segment (unitig). The coverage is considered "strong" if the number of reads + beginning near the start or ending near the end exceeds a predefined threshold (`strong_cluster_min_reads`). + Returns:Tuple[bool, bool]: A tuple with two boolean values: + - The first value is `True` if the cluster has strong coverage at the start of the segment. + - The second value is `True` if the cluster has strong coverage at the end of the segment. + """ count_start = None count_stop = None res = [False,False] diff --git a/strainy/graph_operations/gfa_ops.py b/strainy/graph_operations/gfa_ops.py index bdfdac6..3f2acd1 100644 --- a/strainy/graph_operations/gfa_ops.py +++ b/strainy/graph_operations/gfa_ops.py @@ -7,7 +7,7 @@ """ This contains functions for operation with graph of gfa format: 1. add_link: Adds a link between specified segments in the graph -2. add_edge: Adds an empty(no sequence) segment with the specified name and coverage to the graph +2. add_edge: Adds an empty(no sequence) segment with the specified name and coverage to the graph #TODO add sequence 3. gfa_to_nx: Сonverts the graph from the gfa format to nx (networkx) format 4. from_pandas_adjacency_notinplace: Workaround for networkx.from_pandas_adjacency issue https://github.com/networkx/networkx/issues/7407 5. clean_graph: Cleans graph from selflinks, and add "A" sequence to 0-length edges @@ -25,6 +25,7 @@ def add_link(graph, fr, fr_or, to, to_or, w): fr, to (string): names of segments to be linked (from and to) fr_or,fr_or (string): orientation of segments to be linked (from and to) w: weight of the link + Returns: None: The function modifies the graph in place by adding a link between the specified segments. """ #check if segments exist before connecting if graph.segment(fr) is None or graph.segment(to) is None: @@ -37,6 +38,8 @@ def add_link(graph, fr, fr_or, to, to_or, w): pass + + def add_edge(graph,edge, clN, cov): #TODO remove edge,clN from parameters, use name instead #TODO add seq @@ -58,6 +61,7 @@ def add_edge(graph,edge, clN, cov): + def gfa_to_nx(g): """ Сonverts the graph from the gfa format to nx (networkx) format @@ -74,6 +78,8 @@ def gfa_to_nx(g): return G + + def from_pandas_adjacency_notinplace(df, create_using=None): """ This function is exactly the same as networkx.from_pandas_adjacency, @@ -96,6 +102,8 @@ def from_pandas_adjacency_notinplace(df, create_using=None): return G + + def clean_graph(g): """ Сlears gfa graph - deletes edges with zero length, and self links diff --git a/strainy/graph_operations/overlap_graph_ops.py b/strainy/graph_operations/overlap_graph_ops.py index 3927e4d..9990d39 100644 --- a/strainy/graph_operations/overlap_graph_ops.py +++ b/strainy/graph_operations/overlap_graph_ops.py @@ -12,9 +12,12 @@ """ This contains functions for operation with overlap graph: 1. build_paths_graph: creates overlap graph #TODO rename to overlap -2. find_full_paths: finds full paths in overlap graph -3.remove_nested(G, cons): removes nested clusters ( -4.add_path_edges : calc cluster boundaries and creates unitigs using asm.add_child_edge +2. find_full_paths: finds full paths in overlap graph #TODO: we need to increase cutoff for longer unitigs +3. remove_transitive(G): removes transitive edges from the graph. +4. remove_nested(G, cons): removes nested clusters +5. remove_leaf_root_subnodes: removes leaf and root subnodes from the graph #TODO explain and simplify +6. remove_bubbles: removes bubbles from the graph +7.add_path_edges : calculates cluster boundaries and creates unitigs using asm.add_child_edge #TODO split and move """ def build_paths_graph(cons, full_paths_roots, full_paths_leafs, cluster_distances): @@ -37,7 +40,38 @@ def build_paths_graph(cons, full_paths_roots, full_paths_leafs, cluster_distance +def find_full_paths(G, paths_roots, paths_leafs): + """ + This function identifies all simple paths between nodes in `paths_roots` (source clusters) and + `paths_leafs` (sink clusters) in the graph `G`. The paths between roots and leaves represent + valid full paths in the unitig. + Returns: list: A list of all simple paths found between the root and leaf nodes. Each path is represented + as a list of nodes (clusters) in the path. + """ + paths = [] + for root in paths_roots: + try: + #TODO: we need to increase cutoff for longer unitigs with more clusters. + #But this will result in the exponential number of paths. Instead, we should be + #looking at all nodes that are reachable from both source and sink, which is linear + paths_nx = nx.algorithms.all_simple_paths(G, root, paths_leafs, cutoff = 10) + except: + pass + for path in list(paths_nx): + paths.append(path) + return paths + + + + def remove_transitive(G): + """ + Removes transitive edges from the graph. + This function identifies and removes transitive edges in the graph `G`. A transitive edge is an edge + where there exists a simple path of length 2 between two nodes, making a direct edge between them + redundant. If such transitive paths are found, the direct edge is removed to simplify the graph structure. + Returns: nx.Graph: The updated graph with transitive edges removed. + """ path_remove = [] for node in G.nodes(): neighbors = nx.all_neighbors(G, node) @@ -55,7 +89,48 @@ def remove_transitive(G): +def remove_nested(G, cons): + """ + Disconnect "nested" clusters from the parent cluster. + This function iterates through the nodes (clusters) in the graph `G` and removes edges between clusters + where one cluster is "nested" inside another. A cluster is considered nested if its start and end + coordinates fall within the boundaries of another (parent) cluster. Edges between the nested and parent + clusters are removed to avoid redundancy. + Returns:nx.Graph: The updated graph with nested clusters disconnected from their parent clusters. + """ + nodes = list(G.nodes()) + for node in nodes: + try: + neighbors = nx.all_neighbors(G, node) + for neighbor in list(neighbors): + if cons[node]["Start"] < cons[neighbor]["Start"] and cons[node]["End"] > cons[neighbor]["End"]: + try: + G.remove_edge(node, neighbor) + G.remove_edge(neighbor,node) + logger.debug("REMOVE NESTED" + str(neighbor)) + + except: + continue + except: + continue + return G + + + + def remove_leaf_root_subnodes(G,full_paths_roots,full_paths_leafs): + """ + Removes leaf and root subnodes from the graph if they are directly connected to other leaf or root nodes. + This function identifies nodes in `full_paths_roots` and `full_paths_leafs` that are directly connected + to other root or leaf nodes in the graph. These connections are identified by finding simple paths of + length 2 between the nodes. Any node that is found to have such a connection is removed from the graph + as well as from the `full_paths_roots` and `full_paths_leafs` lists. + Returns: + Tuple[nx.Graph, list, list]: A tuple containing: + - The updated graph `G` with the leaf and root subnodes removed. + - The updated `full_paths_roots` list, with removed nodes excluded. + - The updated `full_paths_leafs` list, with removed nodes excluded. + """ node_remove = [] for node in full_paths_leafs: neighbors = list(full_paths_leafs) @@ -83,7 +158,11 @@ def remove_leaf_root_subnodes(G,full_paths_roots,full_paths_leafs): + def remove_bubbles(graph, source_nodes): + """ + Removes leaf and root subnodes from the graph + """ for node in source_nodes: neighbors = list(source_nodes) for neighbor in list(neighbors): @@ -94,43 +173,6 @@ def remove_bubbles(graph, source_nodes): -def find_full_paths(G, paths_roots, paths_leafs): - paths = [] - for root in paths_roots: - try: - #TODO: we need to increas cutoff for longer unitigs with more clusters. - #But this will result in the exponential number of paths. Instead, we should be - #looking at all nodes that are reachable from both source and sink, which is linear - paths_nx = nx.algorithms.all_simple_paths(G, root, paths_leafs, cutoff = 10) - except: - pass - for path in list(paths_nx): - paths.append(path) - return paths - - - -def remove_nested(G, cons): - """ - Disconnect "nested" clusters from the parent cluster - """ - nodes = list(G.nodes()) - for node in nodes: - try: - neighbors = nx.all_neighbors(G, node) - for neighbor in list(neighbors): - if cons[node]["Start"] < cons[neighbor]["Start"] and cons[node]["End"] > cons[neighbor]["End"]: - try: - G.remove_edge(node, neighbor) - G.remove_edge(neighbor,node) - logger.debug("REMOVE NESTED" + str(neighbor)) - - except: - continue - except: - continue - return G - def add_path_edges(edge, g, cl, ln, full_paths, G, paths_roots, paths_leafs, full_clusters, cons, flye_consensus): """ Add gfa nodes (unitigs) forming "full path", calculating cluster boundaries @@ -265,6 +307,8 @@ def add_path_edges(edge, g, cl, ln, full_paths, G, paths_roots, paths_leafs, ful return path_cl + + def paths_graph_add_vis(edge, cons, cl, full_paths_roots, full_paths_leafs, full_clusters, cluster_distances): """