This repository has been archived by the owner on May 16, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
gff_to_reference.py
236 lines (202 loc) · 8.49 KB
/
gff_to_reference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
from gatherer.bioinfo import get_gff_attributes, get_gff_attributes_str
from gatherer.bioinfo import header_to_id
from gatherer.util import *
from gatherer.rna_type import *
import sys
import pandas as pd
import os
def matching_coords(a, b, max_outside):
    """Return True when interval ``a`` lies within interval ``b``, allowing
    at most ``max_outside`` total length of ``a`` to stick out of ``b``.

    a, b: (start, end) tuples with start <= end.
    max_outside: allowed slack (same units as the coordinates).
    """
    overhang_right = a[1] - b[1]
    overhang_left = b[0] - a[0]
    outside = (overhang_right if overhang_right > 0 else 0) + \
              (overhang_left if overhang_left > 0 else 0)
    return outside <= max_outside
def sorted_match(my_coords, ref_coords):
    """Sweep-line matching of query intervals against reference intervals.

    my_coords, ref_coords: lists of (start, end) tuples, start <= end;
    assumed sorted by start (as produced by get_coords).
    Returns the set of indexes into my_coords whose interval overlaps some
    reference interval with at most 5% of its own length falling outside it.
    """
    # Per-query slack: 5% of the query interval's length may lie outside
    # the reference interval and still count as a match.
    max_out = [(my_coord[1] - my_coord[0])*0.05 for my_coord in my_coords]
    # Build one event list of edges: (coord, not_ref, index) where
    # not_ref is True for query (my_coords) edges, False for reference edges.
    starts = [(my_coords[i][0], True, i) for i in range(len(my_coords))]
    starts += [(ref_coords[i][0], False, i) for i in range(len(ref_coords))]
    ends = [(my_coords[i][1], True, i) for i in range(len(my_coords))]
    ends += [(ref_coords[i][1], False, i) for i in range(len(ref_coords))]
    edges = [(True, (coord, not_ref, index)) for coord, not_ref, index in starts]
    edges += [(False, (coord, not_ref, index)) for coord, not_ref, index in ends]
    # Sweep left-to-right by coordinate.
    edges.sort(key=lambda x: x[1][0])
    possible_matchs = {}      # query index -> set of candidate reference indexes
    keeping_track_of = set()  # query intervals currently "open" under the sweep
    maybe_ending = set()      # queries past their end but still within slack
    for is_start, values in edges:
        coord, not_ref, index = values
        if not_ref:
            # Edge of a query interval.
            if is_start:
                keeping_track_of.add(index)
                possible_matchs[index] = set()
            elif index in keeping_track_of:
                # Do not close immediately: a reference edge appearing within
                # max_out past our end can still yield a match.
                maybe_ending.add(index)
                #keeping_track_of.remove(index)
        else:
            # Edge of a reference interval: first retire queries whose end is
            # further than their slack behind the current sweep position.
            remove_by_distance = []
            for fading_index in maybe_ending:
                if (coord - my_coords[fading_index][1]) > max_out[fading_index]:
                    remove_by_distance.append(fading_index)
            for to_remove in remove_by_distance:
                maybe_ending.remove(to_remove)
                keeping_track_of.remove(to_remove)
            # Every still-open query may match this reference interval.
            for my_coord in keeping_track_of:
                possible_matchs[my_coord].add(index)
    # Exact verification of the candidates collected during the sweep:
    # a query matches when <= max_out of it lies outside a candidate.
    matched = set()
    for my_index, ref_indexes in possible_matchs.items():
        start, end = my_coords[my_index]
        max_outside = max_out[my_index]
        for ref_index in ref_indexes:
            other_start, other_end = ref_coords[ref_index]
            result = matching_coords((start, end),
                                     (other_start, other_end),
                                     max_outside)
            if result:
                matched.add(my_index)
                break
    return matched
def update_attrs(attr_str):
    """Normalize a GFF attribute string: resolve an Rfam id, derive the RNA
    type from it, map unknown types to "other", and expand to the full type.

    attr_str: raw GFF 9th-column attribute string.
    Returns the re-serialized attribute string.
    """
    attrs = get_gff_attributes(attr_str)
    # "family" is an alias for the Rfam accession.
    if "family" in attrs:
        attrs["rfam"] = attrs["family"]
    # Fall back to an RNAcentral lookup by ID when no Rfam id is present.
    if "rfam" not in attrs:
        rfam_id = get_rfam_from_rnacentral(attrs["ID"])
        if rfam_id is not None:
            attrs["rfam"] = rfam_id
    if "rfam" in attrs:
        attrs["type"] = get_rna_type(attrs["rfam"])
    elif "type" not in attrs:
        attrs["type"] = "other"
    # Replace 'misc_rna' (and other unknown markers) with 'other'.
    if attrs["type"] in unkown_rna:
        attrs["type"] = "other"
    attrs["type"] = get_full_type(attrs["type"])
    return get_gff_attributes_str(attrs)
def read_gff(gff_path, feature = "transcript", get_types = False):
    """Load a GFF file into a DataFrame, keeping only rows of one feature.

    gff_path: path to a tab-separated GFF file with no header line.
    feature: value of the 3rd GFF column to keep (default "transcript").
    get_types: when True, parse the "type" key out of the attribute column
        into a new "rna_type" column (requires gatherer.bioinfo helpers).
    Returns the filtered DataFrame.
    """
    print("Loading " + gff_path)
    df = pd.read_csv(gff_path, sep="\t", header=None,
                     names=["seqname", "source", "feature", "start", "end",
                            "score", "strand", "frame", "attribute"])
    # Copy after filtering so adding "rna_type" below does not trigger
    # pandas' SettingWithCopyWarning on a view.
    df = df[df["feature"] == feature].copy()
    if get_types:
        # The original also re-assigned "attribute" to itself via a
        # row-wise apply — a no-op, removed here.
        df["rna_type"] = df["attribute"].apply(
            lambda attr: get_gff_attributes(attr)["type"])
    return df
def get_coords(df):
    """Collect (start, end) interval lists per chromosome.

    df: DataFrame with "seqname", "start", "end" columns (GFF-like).
    Returns {seqname(str): list of (min, max) int tuples sorted by start}.
    """
    coords_by_chr = {}
    # Group by the scalar column name, NOT a one-element list: with a list,
    # newer pandas yields length-1 tuple keys when iterating, which would
    # turn the dict keys into "('chr1',)" strings downstream.
    for chr_name, group in df.groupby("seqname"):
        # zip over the columns instead of iterrows (much faster, same result);
        # normalize each pair so start <= end regardless of strand order.
        coords = [(min(int(x), int(y)), max(int(x), int(y)))
                  for x, y in zip(group["start"], group["end"])]
        coords.sort(key=lambda c: c[0])
        coords_by_chr[str(chr_name)] = coords
    return coords_by_chr
# --- Script configuration ---------------------------------------------------
# Directory of this script; all data paths below are resolved against it.
gatherer_dir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
print("RNA Gatherer dir = ", gatherer_dir)
# Paths are hard-coded; the commented sys.argv lines suggest they were once
# command-line arguments. TODO(review): restore CLI arguments or document why
# they were fixed.
#annotation_path = sys.argv[1]
annotation_path = gatherer_dir + "/result_data/annotation_benchmarking_mgi/annotation.gff"
#reference_path = sys.argv[2]
reference_path = gatherer_dir + "/test_data/reference/mus_musculus.GRCm38.gff3"
#output_path = sys.argv[3]
output_path = gatherer_dir + "/result_data/annotation_benchmarking_mgi/reference_matchs.tsv"
# Our annotation (with rna_type parsed from attributes) and the reference's
# transcript and noncoding_exon features.
annotation_df = read_gff(annotation_path, get_types=True)
reference_transcripts = read_gff(reference_path)
reference_exons = read_gff(reference_path, feature="noncoding_exon")
print("Extracting coordinates")
# Per-chromosome sorted (start, end) interval lists for the reference.
transcripts_by_chr = get_coords(reference_transcripts)
exons_by_chr = get_coords(reference_exons)
print("Matching coordinates for each type")
def match_groups(my_coords, ref_coords):
    """Match query intervals against reference intervals chromosome by
    chromosome.

    my_coords, ref_coords: {chr_name: sorted list of (start, end)} as
        produced by get_coords.
    Returns a set of "chr_index" strings identifying the query intervals
    that matched a reference interval (chromosomes absent from ref_coords
    contribute nothing).
    """
    # Removed an unused local counter ("count = 0") from the original.
    matched = set()
    for chr_name, my_chr in my_coords.items():
        if chr_name in ref_coords:
            local_matchs = sorted_match(my_chr, ref_coords[chr_name])
            for local_match in local_matchs:
                matched.add(str(chr_name) + "_" + str(local_match))
    return matched
def all_names(my_coords):
    """Return the "chr_index" id of every interval in my_coords.

    my_coords: {chr_name: list of (start, end)} as produced by get_coords.
    Returns a set of "chr_index" strings, matching the id scheme used by
    match_groups.

    Bug fixes vs. the original: iterate ``.items()`` (iterating the dict
    directly yields only keys, so the two-name unpack raised ValueError),
    and actually return the accumulated set (the original fell off the end
    and returned None).
    """
    matched = set()
    for chr_name, my_chr in my_coords.items():
        for i in range(len(my_chr)):
            matched.add(str(chr_name) + "_" + str(i))
    return matched
# --- Match each RNA type against the reference and write two reports --------
raw_matchs_by_type = {}
n_types = len(annotation_df["rna_type"].unique().tolist())
# NOTE(review): tqdm is never imported by name here — presumably it comes
# in via the star import from gatherer.util; verify.
progress_bar = tqdm(total=n_types)
# NOTE(review): groupby with a one-element list yields length-1 tuple keys
# on newer pandas, which would break the string operations on rna_type
# below — confirm the pinned pandas version or group by the scalar name.
for rna_type, rna_group in annotation_df.groupby(["rna_type"]):
    #print("Matching for " + rna_type)
    my_coords = get_coords(rna_group)
    # A query interval counts as matched if it matches either a reference
    # transcript or a reference noncoding exon.
    matched_transcripts = match_groups(my_coords, transcripts_by_chr)
    matched_exons = match_groups(my_coords, exons_by_chr)
    all_matched = set.union(matched_exons, matched_transcripts)
    raw_matchs_by_type[rna_type] = {
        "matchs": len(all_matched),
        "total": len(rna_group)}
    progress_bar.update(1)
progress_bar.close()
# NOTE(review): header says "Exon Match Rate" but the second printed column
# is the total count — confirm intent.
print("RNA Type\tExon Match Rate\tTranscript Match Rate")
for rna_type, counts_dict in raw_matchs_by_type.items():
    #exons = counts_dict["exons"]
    transcripts = counts_dict["matchs"]
    total = counts_dict["total"]
    #print(rna_type+"\t"+str(exons/total)+"\t"+str(transcripts/total))
    print(rna_type+"\t"+str(total)+"\t"+str(transcripts/total))
# Aggregate counts up the type hierarchy: a type's totals include every
# other type whose name contains it as a substring (types look like
# "class;subtype" strings, so parents absorb their children).
expanded_matchs_by_type = {}
for rna_type in raw_matchs_by_type.keys():
    expanded_matchs = {"total": 0, "matchs": 0}
    for other_type, stats in raw_matchs_by_type.items():
        if rna_type in other_type:
            expanded_matchs["total"] += stats["total"]
            expanded_matchs["matchs"] += stats["matchs"]
    expanded_matchs_by_type[rna_type] = expanded_matchs
# Build the first report: type, total, matched, matched percentage.
raw_rows = []
for rna_type, stats in expanded_matchs_by_type.items():
    total = stats["total"]
    transcripts = stats["matchs"]
    perc = (transcripts/total)* 100.0
    raw_rows.append([rna_type, total, transcripts, perc])
# Sort by type name, descending.
raw_rows.sort(key=lambda x: x[0], reverse=True)
raw_lines = [[str(x) for x in row]
             for row in raw_rows]
rows = "\n".join(["\t".join(line) for line in raw_lines])+"\n"
with open(output_path, 'w') as stream:
    stream.write("Tipos de ncRNA\tTotal\tCompatíveis com a Referência\tCompatíveis(%)\n")
    stream.write(rows)
print(rows)
# Second report ("artigo" = article): two-level layout where top-level
# classes subtract their two-part subtypes to leave a "No subtype"残 count.
class_stats = {}
adapted_rows = []
for rna_type, total, transcripts, perc in raw_lines:
    tp_parts = rna_type.split(";")
    # class_stats[class] = [total, matched]
    if not tp_parts[0] in class_stats:
        class_stats[tp_parts[0]] = [0,0]
    tp = ""
    # NOTE(review): the next line is a no-op expression statement.
    class_stats[tp_parts[0]]
    if len(tp_parts) == 1:
        # Top-level class row: accumulate its totals.
        tp = tp_parts[0]+'\t'
        class_stats[tp_parts[0]][1] += int(transcripts)
        class_stats[tp_parts[0]][0] += int(total)
    else:
        if len(tp_parts) == 2:
            # Subtype row: subtract from its class so the class row ends up
            # holding only the "no subtype" remainder.
            # NOTE(review): this assumes the one-part class row was (or will
            # be) added for this class — verify ordering guarantees.
            class_stats[tp_parts[0]][1] -= int(transcripts)
            class_stats[tp_parts[0]][0] -= int(total)
            tp = "\t"+tp_parts[1]
        else:
            # Deeper than two levels: skipped entirely.
            continue
    adapted_rows.append("\t".join([tp, transcripts + " (" + str(round(float(perc),3)) + "%)"]))
# Append the per-class "No subtype" remainder rows.
for tp, vals in class_stats.items():
    total, transcripts = vals
    adapted_rows.append("\t".join(["\t"+tp+" No subtype", str(transcripts) + " (" + str(round((transcripts/total)*100.0,3)) + "%)"]))
adapted_lines = "\n".join(adapted_rows)+"\n"
with open(output_path+".artigo.tsv", 'w') as stream:
    stream.write("ncRNA Types\t\tMatching a Reference (%)\n")
    stream.write(adapted_lines)
print(adapted_lines)