ref

katerinakazantseva · Jun 7, 2024 · 0b9fc06 · 0b9fc06
1 parent 5748aeb
commit 0b9fc06
Showing 1 changed file with 51 additions and 44 deletions.
diff --git a/strainy/clustering/cluster_postprocess.py b/strainy/clustering/cluster_postprocess.py
@@ -1,11 +1,10 @@
 import networkx as nx
 import logging
 import pandas as pd
-
 from strainy.clustering.community_detection import find_communities
-import strainy.clustering.build_adj_matrix as matrix
-import strainy.clustering.build_data as build_data
-import strainy.gfa_operations.gfa_ops as gfa_ops
+from strainy.clustering import build_adj_matrix as matrix
+from strainy.clustering import build_data
+from strainy.gfa_operations import gfa_ops
 from strainy.params import *
 
 logger = logging.getLogger()
@@ -15,8 +14,10 @@ def split_cluster(cl,cluster, data,cons,clSNP, bam, edge, R, I,only_with_common_
     #logging.debug("Split cluster: " + str(cluster)+ " "+ str(only_with_common_snip))
     child_clusters = []
     reads = sorted(set(cl.loc[cl["Cluster"] == cluster,"ReadName"].values))
-    if cluster == UNCLUSTERED_GROUP_N or cluster==UNCLUSTERED_GROUP_N2  or only_with_common_snip==False: #NA cluster
-        m = matrix.build_adj_matrix(cl[cl["Cluster"] == cluster], data, clSNP, I, bam, edge, R, only_with_common_snip=False)
+    if cluster == UNCLUSTERED_GROUP_N or cluster==UNCLUSTERED_GROUP_N2  \
+            or only_with_common_snip is False: #NA cluster
+        m = matrix.build_adj_matrix(cl[cl["Cluster"] == cluster],
+                                    data, clSNP, I, bam, edge, R, only_with_common_snip=False)
     else:
         m = matrix.build_adj_matrix(cl[cl["Cluster"] == cluster], data, clSNP, I, bam,edge,R)
     m = matrix.remove_edges(m, 1)
@@ -43,28 +44,30 @@ def split_cluster(cl,cluster, data,cons,clSNP, bam, edge, R, I,only_with_common_
                     child_clusters.append(new_cl_id)
                 cl_exist.append(new_cl_id)
                 for i in group:
-                     mask = cl["ReadName"] == str(reads[i])
-                     cl.loc[mask, "Cluster"] = new_cl_id
+                    mask = cl["ReadName"] == str(reads[i])
+                    cl.loc[mask, "Cluster"] = new_cl_id
             else:
                 uncl = uncl + 1
                 for i in group:
                     mask = cl["ReadName"] == str(reads[i])
-                if only_with_common_snip == True and cluster!=UNCLUSTERED_GROUP_N: #change it for parameter process NA or not
+                if only_with_common_snip is True and cluster!=UNCLUSTERED_GROUP_N:
+                    #change it for parameter process NA or not
                     cl.loc[mask, "Cluster"] = new_cl_id_na
                     child_clusters.append(new_cl_id_na)
                 else:
                     cl.loc[mask, "Cluster"] = UNCLUSTERED_GROUP_N
     return [new_cl_id_na, clN]
 
 
-def build_adj_matrix_clusters(edge,cons,cl,flye_consensus, only_with_common_snip=True, set_slusters=None):
-    if set_slusters == None:
+def build_adj_matrix_clusters(edge,cons,cl,flye_consensus,
+                              only_with_common_snip=True, set_slusters=None):
+    if set_slusters is None:
         clusters = sorted(set(cl.loc[cl["Cluster"] != "NA","Cluster"].values))
     else:
         clusters = set_slusters
     try:
         clusters.remove(0)
-    except:
+    except ValueError:
         pass
     Y=[]
     X=[]
@@ -88,23 +91,25 @@ def build_adj_matrix_clusters(edge,cons,cl,flye_consensus, only_with_common_snip
             second_cl = m.index[k]
 
             if m[second_cl][first_cl] == -1:
-                m.loc[first_cl, second_cl] = matrix.distance_clusters(edge, first_cl, second_cl, cons, cl,flye_consensus, only_with_common_snip)
+                m.loc[first_cl, second_cl] = matrix.distance_clusters\
+                    (edge, first_cl, second_cl, cons, cl,flye_consensus, only_with_common_snip)
                 #m[second_cl][first_cl] = matrix.distance_clusters(edge, first_cl, second_cl, cons, cl,flye_consensus, only_with_common_snip)
     return m
 
 
 
-def join_clusters(cons, cl, Rcl, edge, consensus, only_with_common_snip=True,set_clusters=None, only_nested=False, transitive=False):
+def join_clusters(cons, cl, Rcl, edge, consensus,
+                  only_with_common_snip=True,set_clusters=None, only_nested=False, transitive=False):
     MAX_VIS_SIZE = 500
     CUT_OFF=3
 
-    if only_with_common_snip == False:
-        if set_clusters == None:
+    if only_with_common_snip is False:
+        if set_clusters is None:
             M = build_adj_matrix_clusters(edge,cons, cl,consensus, False)
         else:
             M = build_adj_matrix_clusters(edge, cons, cl, consensus, False,set_clusters)
     else:
-        if set_clusters == None:
+        if set_clusters is None:
             M = build_adj_matrix_clusters(edge,cons, cl,consensus, True)
         else:
             M = build_adj_matrix_clusters(edge, cons, cl, consensus, True, set_clusters)
@@ -122,7 +127,7 @@ def join_clusters(cons, cl, Rcl, edge, consensus, only_with_common_snip=True,set
     if StRainyArgs().debug and max(G_vis.number_of_nodes(), G_vis.number_of_edges()) < MAX_VIS_SIZE:
         G_vis_before = nx.nx_agraph.to_agraph(G_vis)
         G_vis_before.layout(prog = "dot")
-        G_vis_before.draw("%s/graphs/linear_phase_%s.png" % (StRainyArgs().output_intermediate, edge))
+        G_vis_before.draw(f"{StRainyArgs().output_intermediate}/graphs/linear_phase_{edge}.png")
         CUT_OFF=5
 
     path_remove = []
@@ -156,16 +161,15 @@ def join_clusters(cons, cl, Rcl, edge, consensus, only_with_common_snip=True,set
     if StRainyArgs().debug and max(G_vis.number_of_nodes(), G_vis.number_of_edges()) < MAX_VIS_SIZE:
         G_vis = nx.nx_agraph.to_agraph(G_vis)
         G_vis.layout(prog="dot")
-        G_vis.draw("%s/graphs/linear_phase_simplified_%s.png" % (StRainyArgs().output_intermediate, edge))
+        G_vis.draw(f"{StRainyArgs().output_intermediate}/graphs/linear_phase_simplified_{edge}.png")
 
     G = gfa_ops.from_pandas_adjacency_notinplace(M)
 
 
-    if transitive==True:
+    if transitive is True:
         graph_str = str(nx.nx_agraph.to_agraph(G))
         not_visited=list(G.nodes).copy()
         while not_visited:
-
             node=not_visited[0]
             neighbors = nx.all_neighbors(G, node)
             n_list=list(neighbors)
@@ -188,13 +192,13 @@ def join_clusters(cons, cl, Rcl, edge, consensus, only_with_common_snip=True,set
                     try:
                         not_visited.remove(nei)
                         not_visited.remove(node)
-                    except:
+                    except ValueError:
                         pass
                     break
             try:
                 not_visited.remove(node)
-            except:
-                 pass
+            except ValueError:
+                pass
 
 
     else:
@@ -215,23 +219,24 @@ def join_clusters(cons, cl, Rcl, edge, consensus, only_with_common_snip=True,set
             try:
                 neighbors = nx.all_neighbors(G, node)
                 for neighbor in list(neighbors):
-                    if cons[int(node)]["Start"] < cons[int(neighbor)]["Start"] and cons[int(node)]["End"] > cons[int(neighbor)]["End"]:
+                    if cons[int(node)]["Start"] < cons[int(neighbor)]["Start"] \
+                            and cons[int(node)]["End"] > cons[int(neighbor)]["End"]:
                         try:
                             G.remove_edge(node, neighbor)
                             logger.debug("REMOVE NESTED" + str(neighbor)+" :"+str(node))
                             if len(nx.all_neighbors(G, neighbor)) == 1:
                                 try:
                                     nested[neighbor] = nested[neighbor].append(node)
-                                except:
+                                except KeyError:
                                     nodes = [node]
                                     nested[neighbor] = nodes
-                        except:
+                        except KeyError:
                             continue
             except:
                 continue
 
         groups = list(nx.connected_components(G))
-        if only_nested == True:
+        if only_nested is True:
             for k,v in nested.items():
                 if len(v) == 1:
                     cl.loc[cl["Cluster"] == int(k), "Cluster"] = int(v[0])
@@ -265,16 +270,17 @@ def split_all(cl, cluster, data, cons,bam, edge, R, I, SNP_pos,reference_seq,typ
         build_data.cluster_consensuns(cl, new_cl_id_na, SNP_pos, data, cons, edge, reference_seq)
 
         if clN != 0: #if clN==0 we dont need split NA cluster
-            split_cluster(cl, new_cl_id_na, data, cons,cons[new_cl_id_na][snp_set], bam, edge, R, I, False)
+            split_cluster(cl, new_cl_id_na, data, cons,
+                          cons[new_cl_id_na][snp_set], bam, edge, R, I, False)
         clusters = sorted(set(cl.loc[cl["Cluster"] != "NA", "Cluster"].values))
 
         if clN == 1: #STOP LOOP IF EXIST
             build_data.cluster_consensuns(cl, new_cl_id_na + clN, SNP_pos, data, cons, edge, reference_seq)
 
-        for cluster in clusters:
-            if cluster not in cons:
-                build_data.cluster_consensuns(cl, cluster, SNP_pos, data, cons, edge, reference_seq)
-                split_all(cl, cluster, data, cons,bam, edge, R, I, SNP_pos,reference_seq,"unclustered")
+        for clstr in clusters:
+            if clstr not in cons:
+                build_data.cluster_consensuns(cl, clstr, SNP_pos, data, cons, edge, reference_seq)
+                split_all(cl, clstr, data, cons,bam, edge, R, I, SNP_pos,reference_seq,"unclustered")
 
 
 
@@ -285,7 +291,7 @@ def postprocess(bam, cl, SNP_pos, data, edge, R,Rcl, I, flye_consensus,mean_edge
     reference_seq = build_data.read_fasta_seq(StRainyArgs().fa, edge)
     cons = build_data.build_data_cons(cl, SNP_pos, data, edge, reference_seq)
     if StRainyArgs().debug:
-        cl.to_csv("%s/clusters/%s_1.csv" % (StRainyArgs().output_intermediate, edge))
+        cl.to_csv(f"{StRainyArgs().output_intermediate}/clusters/{edge}_1.csv")
     clusters = sorted(set(cl.loc[cl["Cluster"] != "NA","Cluster"].values))
 
 
@@ -339,7 +345,6 @@ def postprocess(bam, cl, SNP_pos, data, edge, R,Rcl, I, flye_consensus,mean_edge
     build_data.cluster_consensuns(cl, UNCLUSTERED_GROUP_N, SNP_pos, data, cons, edge, reference_seq)
     clSNP = cons[UNCLUSTERED_GROUP_N]["clSNP2"]
     splitna = split_cluster(cl, UNCLUSTERED_GROUP_N, data, cons, clSNP, bam, edge, R, I,False)
-
     #Remove unclustered reads after splitting NA cluster
     splitna[0]
     cl = cl[cl["Cluster"] != splitna[0]]
@@ -355,25 +360,27 @@ def postprocess(bam, cl, SNP_pos, data, edge, R,Rcl, I, flye_consensus,mean_edge
     cl = join_clusters(cons, cl, Rcl, edge, flye_consensus, transitive=True)
     cl=update_cluster_set(cl, cluster, SNP_pos, data, cons, edge, reference_seq,mean_edge_cov)
     cl = join_clusters(cons, cl, Rcl, edge, flye_consensus)
-    cl=update_cluster_set(cl, cluster, SNP_pos, data, cons, edge, reference_seq,mean_edge_cov,fraction=0.05)
+    cl=update_cluster_set(cl, cluster, SNP_pos, data, cons, edge,
+                          reference_seq,mean_edge_cov,fraction=0.05)
     cl = join_clusters(cons, cl, Rcl, edge, flye_consensus, only_with_common_snip=False,only_nested=True)
     counts = cl["Cluster"].value_counts(dropna = False)
     cl = cl[~cl["Cluster"].isin(counts[counts < 6].index)]  #TODO change for cov*01.
-    cl=update_cluster_set(cl, cluster, SNP_pos, data, cons, edge, reference_seq,mean_edge_cov,fraction=0.05)
+    cl=update_cluster_set(cl, cluster, SNP_pos, data, cons, edge,
+                          reference_seq,mean_edge_cov,fraction=0.05)
     return cl
 
 
 
 def update_cluster_set(cl, cluster, SNP_pos, data, cons, edge, reference_seq,mean_edge_cov,fraction=0.01):
-    #Update consensus and remove small clusters (less 5% of unitig coverage)
+    """Update consensus and remove small clusters (less 1% of unitig coverage)"""
     clusters = sorted(set(cl.loc[cl["Cluster"] != "NA", "Cluster"].values))
-    for cluster in clusters:
-        if cluster not in cons:
-            build_data.cluster_consensuns(cl, cluster, SNP_pos, data, cons, edge, reference_seq)
+    for clstr in clusters:
+        if clstr not in cons:
+            build_data.cluster_consensuns(cl, clstr, SNP_pos, data, cons, edge, reference_seq)
 
-    for cluster in clusters:
-        if cons[cluster]['Cov']<mean_edge_cov*fraction:
-            cl = cl[cl["Cluster"] !=cluster]
+    for clstr in clusters:
+        if cons[clstr]['Cov']<mean_edge_cov*fraction:
+            cl = cl[cl["Cluster"] !=clstr]
 
 
     return cl