Skip to content

Commit

Permalink
ref
Browse files Browse the repository at this point in the history
  • Loading branch information
katerinakazantseva committed Sep 11, 2024
1 parent 73eb40a commit bc051d2
Show file tree
Hide file tree
Showing 3 changed files with 146 additions and 47 deletions.
59 changes: 53 additions & 6 deletions strainy/graph_operations/asm_graph_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@

"""
This contains functions for operation with assembly graph:
1. add_child_edge: adds a child unitig with the same sequence as the father unitig or with the given sequence
(aff full path and othr clusters"
2. add_path_links: Adds link ("full path)
3.change_cov: recalculate coverage
4.change_sec" recalculate sequence
1. add_child_edge: adds a child unitig with the same sequence as the father unitig or with the given sequence #TODO
2. add_path_links: Adds GFA links between newly created unitigs that form a "full path."
3. change_cov: recalculate coverage of parent unitig #TODO change coverage
4. change_sec: recalculate sequence of parent unitig
5. strong_tail: determines whether a cluster has strong coverage at both the start and end of a segment
"""


Expand All @@ -23,6 +23,15 @@
def add_child_edge(edge, clN, g, cl, left, right, cons, flye_consensus, change_seq=True, insertmain=True):
"""
Adds a child unitig with the same sequence as the parental unitig or with the given sequence
This function uses `flye_consensus` to compute the consensus sequence for the unitig and constructs
the new unitig based on the provided `left` and `right` coordinates. The function then creates a new edge
in the graph using `gfa_ops.add_edge` and assigns the appropriate sequence (either the computed consensus
or the parental sequence) to the new unitig.
Returns: new_line (Edge): The newly created edge representing the child unitig in the graph.
Notes:
- If `change_seq` is set to `True`, the sequence of the child unitig is replaced by the consensus
sequence computed by `flye_consensus`. If `False`, the parent unitig's sequence is retained.
- This function is designed to work with the `gfa_ops.add_edge` function to handle edge insertion in the GFA graph.
"""
##TODO if cons provided change_seq=True (provide seq not consensus)
##TODO make separare function to add gfa edge and move to gfa_ops
Expand Down Expand Up @@ -57,17 +66,33 @@ def add_child_edge(edge, clN, g, cl, left, right, cons, flye_consensus, change_s
)




def add_path_links(graph, edge, paths):
"""
Add gfa links between newly created unitigs forming "full path"
Adds GFA links between newly created unitigs that form a "full path."
This function iterates through the provided paths (each representing a series of unitigs) and creates
links between consecutive unitigs. These links are added to the graph using the `gfa_ops.add_link`
function, which represents the connections between unitigs in a GFA.
Returns: None: This function modifies the graph in place by adding links between unitigs.
"""
for path in paths:
for i in range(0, len(path) - 1):
gfa_ops.add_link(graph, f"{edge}_{path[i]}", "+", f"{edge}_{path[i + 1]}", "+", 1)




def change_cov(g, edge, cons, ln, clusters, othercl, remove_clusters):
"""
Updates the coverage of a parent segment in the GFA graph based on the consensus of specified clusters
(that weren't created into new unitigs).
This function recalculates the coverage of a segment (`edge`) by considering the coverage of the specified
clusters (`othercl`). If the proportion of the segment covered by these clusters is below a threshold
(`parental_min_len`), the segment is marked for removal. The segment's coverage is then updated in the GFA
graph.
"""
#TODO change coverage, not return it
cov = 0
len_cl = []
for i in othercl:
Expand All @@ -82,7 +107,18 @@ def change_cov(g, edge, cons, ln, clusters, othercl, remove_clusters):
return cov




def change_sec(g, edge, othercl, cl,SNP_pos, data, cut = True):
"""
Updates the sequence of a parent unitig in the GFA graph based on the consensus from specified clusters
(that weren't created into new unitigs).
This function modifies the sequence of a segment (`edge`) in the GFA graph by applying the consensus
sequence of other clusters (`othercl`). The consensus is computed using SNP positions and read data,
and the segment sequence is updated accordingly.
Returns:
None: The function modifies the graph in place, updating the sequence of the specified segment.
"""
temp = {}
other_cl = cl
for cluster in othercl:
Expand All @@ -101,7 +137,18 @@ def change_sec(g, edge, othercl, cl,SNP_pos, data, cut = True):
i.sequence=''.join(seq)




def strong_tail(cluster, cl, ln, data):
"""
Determines whether a cluster has strong coverage at both the start and end of a segment.
This function checks the reads associated with a cluster to determine if there is strong coverage
near the start and end of a segment (unitig). The coverage is considered "strong" if the number of reads
beginning near the start or ending near the end exceeds a predefined threshold (`strong_cluster_min_reads`).
Returns:Tuple[bool, bool]: A tuple with two boolean values:
- The first value is `True` if the cluster has strong coverage at the start of the segment.
- The second value is `True` if the cluster has strong coverage at the end of the segment.
"""
count_start = None
count_stop = None
res = [False,False]
Expand Down
10 changes: 9 additions & 1 deletion strainy/graph_operations/gfa_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"""
This contains functions for operation with graph of gfa format:
1. add_link: Adds a link between specified segments in the graph
2. add_edge: Adds an empty(no sequence) segment with the specified name and coverage to the graph
2. add_edge: Adds an empty(no sequence) segment with the specified name and coverage to the graph #TODO add sequence
3. gfa_to_nx: Сonverts the graph from the gfa format to nx (networkx) format
4. from_pandas_adjacency_notinplace: Workaround for networkx.from_pandas_adjacency issue https://github.com/networkx/networkx/issues/7407
5. clean_graph: Cleans graph from selflinks, and add "A" sequence to 0-length edges
Expand All @@ -25,6 +25,7 @@ def add_link(graph, fr, fr_or, to, to_or, w):
fr, to (string): names of segments to be linked (from and to)
fr_or,fr_or (string): orientation of segments to be linked (from and to)
w: weight of the link
Returns: None: The function modifies the graph in place by adding a link between the specified segments.
"""
#check if segments exist before connecting
if graph.segment(fr) is None or graph.segment(to) is None:
Expand All @@ -37,6 +38,8 @@ def add_link(graph, fr, fr_or, to, to_or, w):
pass




def add_edge(graph,edge, clN, cov):
#TODO remove edge,clN from parameters, use name instead
#TODO add seq
Expand All @@ -58,6 +61,7 @@ def add_edge(graph,edge, clN, cov):




def gfa_to_nx(g):
"""
Сonverts the graph from the gfa format to nx (networkx) format
Expand All @@ -74,6 +78,8 @@ def gfa_to_nx(g):
return G




def from_pandas_adjacency_notinplace(df, create_using=None):
"""
This function is exactly the same as networkx.from_pandas_adjacency,
Expand All @@ -96,6 +102,8 @@ def from_pandas_adjacency_notinplace(df, create_using=None):
return G




def clean_graph(g):
"""
Сlears gfa graph - deletes edges with zero length, and self links
Expand Down
124 changes: 84 additions & 40 deletions strainy/graph_operations/overlap_graph_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@
"""
This contains functions for operation with overlap graph:
1. build_paths_graph: creates overlap graph #TODO rename to overlap
2. find_full_paths: finds full paths in overlap graph
3.remove_nested(G, cons): removes nested clusters (
4.add_path_edges : calc cluster boundaries and creates unitigs using asm.add_child_edge
2. find_full_paths: finds full paths in overlap graph #TODO: we need to increase cutoff for longer unitigs
3. remove_transitive(G): removes transitive edges from the graph.
4. remove_nested(G, cons): removes nested clusters
5. remove_leaf_root_subnodes: removes leaf and root subnodes from the graph #TODO explain and simplify
6. remove_bubbles: removes bubbles from the graph
7.add_path_edges : calculates cluster boundaries and creates unitigs using asm.add_child_edge #TODO split and move
"""

def build_paths_graph(cons, full_paths_roots, full_paths_leafs, cluster_distances):
Expand All @@ -37,7 +40,38 @@ def build_paths_graph(cons, full_paths_roots, full_paths_leafs, cluster_distance



def find_full_paths(G, paths_roots, paths_leafs):
"""
This function identifies all simple paths between nodes in `paths_roots` (source clusters) and
`paths_leafs` (sink clusters) in the graph `G`. The paths between roots and leaves represent
valid full paths in the unitig.
Returns: list: A list of all simple paths found between the root and leaf nodes. Each path is represented
as a list of nodes (clusters) in the path.
"""
paths = []
for root in paths_roots:
try:
#TODO: we need to increase cutoff for longer unitigs with more clusters.
#But this will result in the exponential number of paths. Instead, we should be
#looking at all nodes that are reachable from both source and sink, which is linear
paths_nx = nx.algorithms.all_simple_paths(G, root, paths_leafs, cutoff = 10)
except:
pass
for path in list(paths_nx):
paths.append(path)
return paths




def remove_transitive(G):
"""
Removes transitive edges from the graph.
This function identifies and removes transitive edges in the graph `G`. A transitive edge is an edge
where there exists a simple path of length 2 between two nodes, making a direct edge between them
redundant. If such transitive paths are found, the direct edge is removed to simplify the graph structure.
Returns: nx.Graph: The updated graph with transitive edges removed.
"""
path_remove = []
for node in G.nodes():
neighbors = nx.all_neighbors(G, node)
Expand All @@ -55,7 +89,48 @@ def remove_transitive(G):



def remove_nested(G, cons):
"""
Disconnect "nested" clusters from the parent cluster.
This function iterates through the nodes (clusters) in the graph `G` and removes edges between clusters
where one cluster is "nested" inside another. A cluster is considered nested if its start and end
coordinates fall within the boundaries of another (parent) cluster. Edges between the nested and parent
clusters are removed to avoid redundancy.
Returns:nx.Graph: The updated graph with nested clusters disconnected from their parent clusters.
"""
nodes = list(G.nodes())
for node in nodes:
try:
neighbors = nx.all_neighbors(G, node)
for neighbor in list(neighbors):
if cons[node]["Start"] < cons[neighbor]["Start"] and cons[node]["End"] > cons[neighbor]["End"]:
try:
G.remove_edge(node, neighbor)
G.remove_edge(neighbor,node)
logger.debug("REMOVE NESTED" + str(neighbor))

except:
continue
except:
continue
return G




def remove_leaf_root_subnodes(G,full_paths_roots,full_paths_leafs):
"""
Removes leaf and root subnodes from the graph if they are directly connected to other leaf or root nodes.
This function identifies nodes in `full_paths_roots` and `full_paths_leafs` that are directly connected
to other root or leaf nodes in the graph. These connections are identified by finding simple paths of
length 2 between the nodes. Any node that is found to have such a connection is removed from the graph
as well as from the `full_paths_roots` and `full_paths_leafs` lists.
Returns:
Tuple[nx.Graph, list, list]: A tuple containing:
- The updated graph `G` with the leaf and root subnodes removed.
- The updated `full_paths_roots` list, with removed nodes excluded.
- The updated `full_paths_leafs` list, with removed nodes excluded.
"""
node_remove = []
for node in full_paths_leafs:
neighbors = list(full_paths_leafs)
Expand Down Expand Up @@ -83,7 +158,11 @@ def remove_leaf_root_subnodes(G,full_paths_roots,full_paths_leafs):




def remove_bubbles(graph, source_nodes):
"""
Removes leaf and root subnodes from the graph
"""
for node in source_nodes:
neighbors = list(source_nodes)
for neighbor in list(neighbors):
Expand All @@ -94,43 +173,6 @@ def remove_bubbles(graph, source_nodes):



def find_full_paths(G, paths_roots, paths_leafs):
paths = []
for root in paths_roots:
try:
#TODO: we need to increas cutoff for longer unitigs with more clusters.
#But this will result in the exponential number of paths. Instead, we should be
#looking at all nodes that are reachable from both source and sink, which is linear
paths_nx = nx.algorithms.all_simple_paths(G, root, paths_leafs, cutoff = 10)
except:
pass
for path in list(paths_nx):
paths.append(path)
return paths



def remove_nested(G, cons):
"""
Disconnect "nested" clusters from the parent cluster
"""
nodes = list(G.nodes())
for node in nodes:
try:
neighbors = nx.all_neighbors(G, node)
for neighbor in list(neighbors):
if cons[node]["Start"] < cons[neighbor]["Start"] and cons[node]["End"] > cons[neighbor]["End"]:
try:
G.remove_edge(node, neighbor)
G.remove_edge(neighbor,node)
logger.debug("REMOVE NESTED" + str(neighbor))

except:
continue
except:
continue
return G

def add_path_edges(edge, g, cl, ln, full_paths, G, paths_roots, paths_leafs, full_clusters, cons, flye_consensus):
"""
Add gfa nodes (unitigs) forming "full path", calculating cluster boundaries
Expand Down Expand Up @@ -265,6 +307,8 @@ def add_path_edges(edge, g, cl, ln, full_paths, G, paths_roots, paths_leafs, ful
return path_cl




def paths_graph_add_vis(edge, cons, cl, full_paths_roots,
full_paths_leafs, full_clusters, cluster_distances):
"""
Expand Down

0 comments on commit bc051d2

Please sign in to comment.