Skip to content

Commit

Permalink
ref
Browse files Browse the repository at this point in the history
  • Loading branch information
katerinakazantseva committed Jun 7, 2024
1 parent 5748aeb commit 0b9fc06
Showing 1 changed file with 51 additions and 44 deletions.
95 changes: 51 additions & 44 deletions strainy/clustering/cluster_postprocess.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import networkx as nx
import logging
import pandas as pd

from strainy.clustering.community_detection import find_communities
import strainy.clustering.build_adj_matrix as matrix
import strainy.clustering.build_data as build_data
import strainy.gfa_operations.gfa_ops as gfa_ops
from strainy.clustering import build_adj_matrix as matrix
from strainy.clustering import build_data
from strainy.gfa_operations import gfa_ops
from strainy.params import *

logger = logging.getLogger()
Expand All @@ -15,8 +14,10 @@ def split_cluster(cl,cluster, data,cons,clSNP, bam, edge, R, I,only_with_common_
#logging.debug("Split cluster: " + str(cluster)+ " "+ str(only_with_common_snip))
child_clusters = []
reads = sorted(set(cl.loc[cl["Cluster"] == cluster,"ReadName"].values))
if cluster == UNCLUSTERED_GROUP_N or cluster==UNCLUSTERED_GROUP_N2 or only_with_common_snip==False: #NA cluster
m = matrix.build_adj_matrix(cl[cl["Cluster"] == cluster], data, clSNP, I, bam, edge, R, only_with_common_snip=False)
if cluster == UNCLUSTERED_GROUP_N or cluster==UNCLUSTERED_GROUP_N2 \
or only_with_common_snip is False: #NA cluster
m = matrix.build_adj_matrix(cl[cl["Cluster"] == cluster],
data, clSNP, I, bam, edge, R, only_with_common_snip=False)
else:
m = matrix.build_adj_matrix(cl[cl["Cluster"] == cluster], data, clSNP, I, bam,edge,R)
m = matrix.remove_edges(m, 1)
Expand All @@ -43,28 +44,30 @@ def split_cluster(cl,cluster, data,cons,clSNP, bam, edge, R, I,only_with_common_
child_clusters.append(new_cl_id)
cl_exist.append(new_cl_id)
for i in group:
mask = cl["ReadName"] == str(reads[i])
cl.loc[mask, "Cluster"] = new_cl_id
mask = cl["ReadName"] == str(reads[i])
cl.loc[mask, "Cluster"] = new_cl_id
else:
uncl = uncl + 1
for i in group:
mask = cl["ReadName"] == str(reads[i])
if only_with_common_snip == True and cluster!=UNCLUSTERED_GROUP_N: #change it for parameter process NA or not
if only_with_common_snip is True and cluster!=UNCLUSTERED_GROUP_N:
#change it for parameter process NA or not
cl.loc[mask, "Cluster"] = new_cl_id_na
child_clusters.append(new_cl_id_na)
else:
cl.loc[mask, "Cluster"] = UNCLUSTERED_GROUP_N
return [new_cl_id_na, clN]


def build_adj_matrix_clusters(edge,cons,cl,flye_consensus, only_with_common_snip=True, set_slusters=None):
if set_slusters == None:
def build_adj_matrix_clusters(edge,cons,cl,flye_consensus,
only_with_common_snip=True, set_slusters=None):
if set_slusters is None:
clusters = sorted(set(cl.loc[cl["Cluster"] != "NA","Cluster"].values))
else:
clusters = set_slusters
try:
clusters.remove(0)
except:
except ValueError:
pass
Y=[]
X=[]
Expand All @@ -88,23 +91,25 @@ def build_adj_matrix_clusters(edge,cons,cl,flye_consensus, only_with_common_snip
second_cl = m.index[k]

if m[second_cl][first_cl] == -1:
m.loc[first_cl, second_cl] = matrix.distance_clusters(edge, first_cl, second_cl, cons, cl,flye_consensus, only_with_common_snip)
m.loc[first_cl, second_cl] = matrix.distance_clusters\
(edge, first_cl, second_cl, cons, cl,flye_consensus, only_with_common_snip)
#m[second_cl][first_cl] = matrix.distance_clusters(edge, first_cl, second_cl, cons, cl,flye_consensus, only_with_common_snip)
return m



def join_clusters(cons, cl, Rcl, edge, consensus, only_with_common_snip=True,set_clusters=None, only_nested=False, transitive=False):
def join_clusters(cons, cl, Rcl, edge, consensus,
only_with_common_snip=True,set_clusters=None, only_nested=False, transitive=False):
MAX_VIS_SIZE = 500
CUT_OFF=3

if only_with_common_snip == False:
if set_clusters == None:
if only_with_common_snip is False:
if set_clusters is None:
M = build_adj_matrix_clusters(edge,cons, cl,consensus, False)
else:
M = build_adj_matrix_clusters(edge, cons, cl, consensus, False,set_clusters)
else:
if set_clusters == None:
if set_clusters is None:
M = build_adj_matrix_clusters(edge,cons, cl,consensus, True)
else:
M = build_adj_matrix_clusters(edge, cons, cl, consensus, True, set_clusters)
Expand All @@ -122,7 +127,7 @@ def join_clusters(cons, cl, Rcl, edge, consensus, only_with_common_snip=True,set
if StRainyArgs().debug and max(G_vis.number_of_nodes(), G_vis.number_of_edges()) < MAX_VIS_SIZE:
G_vis_before = nx.nx_agraph.to_agraph(G_vis)
G_vis_before.layout(prog = "dot")
G_vis_before.draw("%s/graphs/linear_phase_%s.png" % (StRainyArgs().output_intermediate, edge))
G_vis_before.draw(f"{StRainyArgs().output_intermediate}/graphs/linear_phase_{edge}.png")
CUT_OFF=5

path_remove = []
Expand Down Expand Up @@ -156,16 +161,15 @@ def join_clusters(cons, cl, Rcl, edge, consensus, only_with_common_snip=True,set
if StRainyArgs().debug and max(G_vis.number_of_nodes(), G_vis.number_of_edges()) < MAX_VIS_SIZE:
G_vis = nx.nx_agraph.to_agraph(G_vis)
G_vis.layout(prog="dot")
G_vis.draw("%s/graphs/linear_phase_simplified_%s.png" % (StRainyArgs().output_intermediate, edge))
G_vis.draw(f"{StRainyArgs().output_intermediate}/graphs/linear_phase_simplified_{edge}.png")

G = gfa_ops.from_pandas_adjacency_notinplace(M)


if transitive==True:
if transitive is True:
graph_str = str(nx.nx_agraph.to_agraph(G))
not_visited=list(G.nodes).copy()
while not_visited:

node=not_visited[0]
neighbors = nx.all_neighbors(G, node)
n_list=list(neighbors)
Expand All @@ -188,13 +192,13 @@ def join_clusters(cons, cl, Rcl, edge, consensus, only_with_common_snip=True,set
try:
not_visited.remove(nei)
not_visited.remove(node)
except:
except ValueError:
pass
break
try:
not_visited.remove(node)
except:
pass
except ValueError:
pass


else:
Expand All @@ -215,23 +219,24 @@ def join_clusters(cons, cl, Rcl, edge, consensus, only_with_common_snip=True,set
try:
neighbors = nx.all_neighbors(G, node)
for neighbor in list(neighbors):
if cons[int(node)]["Start"] < cons[int(neighbor)]["Start"] and cons[int(node)]["End"] > cons[int(neighbor)]["End"]:
if cons[int(node)]["Start"] < cons[int(neighbor)]["Start"] \
and cons[int(node)]["End"] > cons[int(neighbor)]["End"]:
try:
G.remove_edge(node, neighbor)
logger.debug("REMOVE NESTED" + str(neighbor)+" :"+str(node))
if len(nx.all_neighbors(G, neighbor)) == 1:
try:
nested[neighbor] = nested[neighbor].append(node)
except:
except KeyError:
nodes = [node]
nested[neighbor] = nodes
except:
except KeyError:
continue
except:
continue

groups = list(nx.connected_components(G))
if only_nested == True:
if only_nested is True:
for k,v in nested.items():
if len(v) == 1:
cl.loc[cl["Cluster"] == int(k), "Cluster"] = int(v[0])
Expand Down Expand Up @@ -265,16 +270,17 @@ def split_all(cl, cluster, data, cons,bam, edge, R, I, SNP_pos,reference_seq,typ
build_data.cluster_consensuns(cl, new_cl_id_na, SNP_pos, data, cons, edge, reference_seq)

if clN != 0: #if clN==0 we dont need split NA cluster
split_cluster(cl, new_cl_id_na, data, cons,cons[new_cl_id_na][snp_set], bam, edge, R, I, False)
split_cluster(cl, new_cl_id_na, data, cons,
cons[new_cl_id_na][snp_set], bam, edge, R, I, False)
clusters = sorted(set(cl.loc[cl["Cluster"] != "NA", "Cluster"].values))

if clN == 1: #STOP LOOP IF EXIST
build_data.cluster_consensuns(cl, new_cl_id_na + clN, SNP_pos, data, cons, edge, reference_seq)

for cluster in clusters:
if cluster not in cons:
build_data.cluster_consensuns(cl, cluster, SNP_pos, data, cons, edge, reference_seq)
split_all(cl, cluster, data, cons,bam, edge, R, I, SNP_pos,reference_seq,"unclustered")
for clstr in clusters:
if clstr not in cons:
build_data.cluster_consensuns(cl, clstr, SNP_pos, data, cons, edge, reference_seq)
split_all(cl, clstr, data, cons,bam, edge, R, I, SNP_pos,reference_seq,"unclustered")



Expand All @@ -285,7 +291,7 @@ def postprocess(bam, cl, SNP_pos, data, edge, R,Rcl, I, flye_consensus,mean_edge
reference_seq = build_data.read_fasta_seq(StRainyArgs().fa, edge)
cons = build_data.build_data_cons(cl, SNP_pos, data, edge, reference_seq)
if StRainyArgs().debug:
cl.to_csv("%s/clusters/%s_1.csv" % (StRainyArgs().output_intermediate, edge))
cl.to_csv(f"{StRainyArgs().output_intermediate}/clusters/{edge}_1.csv")
clusters = sorted(set(cl.loc[cl["Cluster"] != "NA","Cluster"].values))


Expand Down Expand Up @@ -339,7 +345,6 @@ def postprocess(bam, cl, SNP_pos, data, edge, R,Rcl, I, flye_consensus,mean_edge
build_data.cluster_consensuns(cl, UNCLUSTERED_GROUP_N, SNP_pos, data, cons, edge, reference_seq)
clSNP = cons[UNCLUSTERED_GROUP_N]["clSNP2"]
splitna = split_cluster(cl, UNCLUSTERED_GROUP_N, data, cons, clSNP, bam, edge, R, I,False)

#Remove unclustered reads after splitting NA cluster
splitna[0]
cl = cl[cl["Cluster"] != splitna[0]]
Expand All @@ -355,25 +360,27 @@ def postprocess(bam, cl, SNP_pos, data, edge, R,Rcl, I, flye_consensus,mean_edge
cl = join_clusters(cons, cl, Rcl, edge, flye_consensus, transitive=True)
cl=update_cluster_set(cl, cluster, SNP_pos, data, cons, edge, reference_seq,mean_edge_cov)
cl = join_clusters(cons, cl, Rcl, edge, flye_consensus)
cl=update_cluster_set(cl, cluster, SNP_pos, data, cons, edge, reference_seq,mean_edge_cov,fraction=0.05)
cl=update_cluster_set(cl, cluster, SNP_pos, data, cons, edge,
reference_seq,mean_edge_cov,fraction=0.05)
cl = join_clusters(cons, cl, Rcl, edge, flye_consensus, only_with_common_snip=False,only_nested=True)
counts = cl["Cluster"].value_counts(dropna = False)
cl = cl[~cl["Cluster"].isin(counts[counts < 6].index)] #TODO change for cov*01.
cl=update_cluster_set(cl, cluster, SNP_pos, data, cons, edge, reference_seq,mean_edge_cov,fraction=0.05)
cl=update_cluster_set(cl, cluster, SNP_pos, data, cons, edge,
reference_seq,mean_edge_cov,fraction=0.05)
return cl



def update_cluster_set(cl, cluster, SNP_pos, data, cons, edge, reference_seq,mean_edge_cov,fraction=0.01):
#Update consensus and remove small clusters (less 5% of unitig coverage)
"""Update consensus and remove small clusters (less 1% of unitig coverage)"""
clusters = sorted(set(cl.loc[cl["Cluster"] != "NA", "Cluster"].values))
for cluster in clusters:
if cluster not in cons:
build_data.cluster_consensuns(cl, cluster, SNP_pos, data, cons, edge, reference_seq)
for clstr in clusters:
if clstr not in cons:
build_data.cluster_consensuns(cl, clstr, SNP_pos, data, cons, edge, reference_seq)

for cluster in clusters:
if cons[cluster]['Cov']<mean_edge_cov*fraction:
cl = cl[cl["Cluster"] !=cluster]
for clstr in clusters:
if cons[clstr]['Cov']<mean_edge_cov*fraction:
cl = cl[cl["Cluster"] !=clstr]


return cl

0 comments on commit 0b9fc06

Please sign in to comment.