-
Notifications
You must be signed in to change notification settings - Fork 1
/
work3.py
272 lines (239 loc) · 9.27 KB
/
work3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Comparison of the number of common barcodes between real variants and false one.
"""
import argparse, pysam, xlsxwriter
from Variant import Variant
GAP = 500
L_SV = [2000,10000] # lengths for variants
def trueSV(file):
'''
Returns a list of variants within a file.
Registers first chromosome, start and end position.
file -- file
'''
truth = []
with open(file,"r") as filin:
line = filin.readline()
while line != '':
sv = line.split()
if sv[4] == "TRA":
line = filin.readline()
sv2 = line.split()
truth.append((sv[0],int(sv[1]),int(sv2[1])))
truth.append((sv[2],int(sv[3]),int(sv2[3])))
else:
truth.append((sv[0],int(sv[1]),int(sv[3])))
line = filin.readline()
return truth
def isValid(variant,L,m):
'''
Returns True if a Variant is valid, else False.
variant -- Variant object
L -- list of real variants
m -- int
'''
for v in L:
if variant.chrom == v[0] and abs(v[1] - variant.pos) <= m and abs(v[2] - variant.get_end()) <= m:
return True
return False
def isValid_bnd(variant,L,m):
'''
Returs True if a BND variant is valid, else False.
variant -- a list as [chrom,start,end]
L -- list of real variants
m -- int
'''
for v in L:
if variant[0] == v[0] and abs(variant[1] - v[1]) <= m and abs(variant[2] - v[2]) <= m:
return True
return False
def get_all_Bx(file,chrom,start,end):
'''
Returns all the barcodes from a region (set).
file -- a samfile
chrom -- chromosome name
start -- region's start position
end -- region's end position
'''
all_bx = set()
if start < 0:
start = 0
if start > end:
start1 = end
end1 = start
else:
start1 = start
end1 = end
for read in file.fetch(chrom,start1,end1):
if read.has_tag('BX'):
bx = read.get_tag('BX')
bx = bx[:-2]
all_bx.add(bx)
return all_bx
def intersection(S1,S2):
'''
Returns the number of barcodes in common between two sets.
'''
res = 0
for bx in S1:
if bx in S2:
res +=1
return res
def get_chrom_bnd(v):
'''
Returns the chromosome name of a BND variant.
v -- Variant object
'''
c = v.alt.split(':')
if '[' in c[0]:
c = c[0].split('[')
return c[1]
else:
c = c[0].split(']')
return c[1]
def get_pos_bnd(v):
'''
Returns the position in ALT attribute of a BND variant.
v -- Variant object
'''
c = v.alt.split(':')
try:
c = c[1].split(']')
return int(c[0])
except ValueError:
c = c[0].split('[')
return int(c[0])
def sortSV(vcf,bam,truth,margin):
'''
Creates results.xlsx containing the number of barcodes for real
variants and false one.
vcf -- vcf file with variants
bam -- bam file with reads mapping in the genome reference
truth -- file with real variants
margin -- boolean
'''
cpt = 1
L = []
row = 15 * [0]
m = 100 if margin else 0
realSV = trueSV(truth)
samfile = pysam.AlignmentFile(bam,"rb")
workbook = xlsxwriter.Workbook('results.xlsx')
worksheet = workbook.add_worksheet()
# Used to store current chromosomes for BND, and to output BND when changing chromosome
curChr1 = ""
curChr2 = ""
with open(vcf,"r") as filin:
# skips file's head :
line = filin.readline()
while line.startswith('#'):
line = filin.readline()
# for each variant :
while line != '':
v = Variant(line)
print("variant",cpt)
cpt += 1
# We keep filling L if both chromosomes correspond to current one
# If not, this means we're not processing the same variant anymore, so we treat the BND we've read so far
if v.get_svtype() == "BND" and ((curChr1 == "" and curChr2 == "") or (curChr1 == v.chrom and curChr2 == get_chrom_bnd(v))):
if L ==[]:
L.append([v.chrom,v.pos,-1])
L.append([get_chrom_bnd(v),get_pos_bnd(v),-1])
curChr1 = v.chrom
curChr2 = get_chrom_bnd(v)
else:
if v.pos > L[0][2]:
L[0][2] = v.pos
if get_pos_bnd(v) > L[1][2]:
L[1][2] = get_pos_bnd(v)
else:
# we treat the BND variants :
if L != [] and L[0][2] != -1 and L[1][2] != -1:
all_Bx1 = get_all_Bx(samfile,L[0][0],L[0][1]-GAP,L[0][1]+GAP)
all_Bx2 = get_all_Bx(samfile,L[0][0],L[0][2]-GAP,L[0][2]+GAP)
all_Bx_pair1 = get_all_Bx(samfile,L[1][0],L[1][1]-GAP,L[1][1]+GAP)
all_Bx_pair2 = get_all_Bx(samfile,L[1][0],L[1][2]-GAP,L[1][2]+GAP)
nb_common1 = intersection(all_Bx1,all_Bx2)
nb_common2 = intersection(all_Bx_pair1,all_Bx_pair2)
# first BND variant is valid :
if L[0][2] - L[0][1] < L_SV[0]:
cln = 0
if L_SV[0] <= L[0][2] - L[0][1] < L_SV[1]:
cln = 6
if L[0][2] - L[0][1] >= L_SV[1]:
cln = 12
if isValid_bnd(L[0],realSV,m):
worksheet.write(row[cln],cln,L[0][0]+":"+str(L[0][1])+"-"+str(L[0][2]))
worksheet.write(row[cln],cln+1,nb_common1)
row[cln] += 1
# first BND variant is not valid :
else:
worksheet.write(row[cln+2],cln+2,L[0][0]+":"+str(L[0][1])+"-"+str(L[0][2]))
worksheet.write(row[cln+2],cln+3,nb_common1)
row[cln+2] += 1
# second BND variant is valid :
if L[1][2] - L[1][1] < L_SV[0]:
cln = 0
if L_SV[0] <= L[1][2] - L[1][1] < L_SV[1]:
cln = 6
if L[1][2] - L[1][1] >= L_SV[1]:
cln = 12
if isValid_bnd(L[1],realSV,m):
worksheet.write(row[cln],cln,L[1][0]+":"+str(L[1][1])+"-"+str(L[1][2]))
worksheet.write(row[cln],cln+1,nb_common2)
row[cln] += 1
# second BND variant is not valid :
else:
worksheet.write(row[cln+2],cln+2,L[1][0]+":"+str(L[1][1])+"-"+str(L[1][2]))
worksheet.write(row[cln+2],cln+3,nb_common2)
row[cln+2] += 1
L = []
# Update current chromosomes and L if we read a BND, otherwise set them / leave them empty
if v.get_svtype() == "BND":
L.append([v.chrom,v.pos,-1])
L.append([get_chrom_bnd(v),get_pos_bnd(v),-1])
curChr1 = v.chrom
curChr2 = get_chrom_bnd(v)
else:
curChr1 = ""
curChr2 = ""
# treatment of a non BND variant :
# Only do if we didn't read a BND
if v.get_svtype() != "BND":
end = v.get_end()
all_Bx1 = get_all_Bx(samfile,v.chrom,v.pos-GAP,v.pos+GAP)
all_Bx2 = get_all_Bx(samfile,v.chrom,end-GAP,end+GAP)
nb_common = intersection(all_Bx1,all_Bx2)
if end - v.pos < L_SV[0]:
cln = 0
if L_SV[0] <= end - v.pos < L_SV[1]:
cln = 6
if end - v.pos >= L_SV[1]:
cln = 12
# variant is valid :
if isValid(v,realSV,m):
worksheet.write(row[cln],cln,v.chrom+":"+str(v.pos)+"-"+str(end))
worksheet.write(row[cln],cln+1,nb_common)
row[cln] += 1
# variant is not valid :
else:
worksheet.write(row[cln+2],cln+2,v.chrom+":"+str(v.pos)+"-"+str(end))
worksheet.write(row[cln+2],cln+3,nb_common)
row[cln+2] += 1
line = filin.readline()
workbook.close()
samfile.close()
####################################################
parser = argparse.ArgumentParser(description='Sort SV')
parser.add_argument('-vcf', type=str, required=True, help='vcf file')
parser.add_argument('-bam', type=str, required=True, help='bam file')
parser.add_argument('-t', type=str, required=True, help='Truth file')
parser.add_argument('-m', action='store_true', help="Allows a margin to increase variants's length")
args = parser.parse_args()
if __name__ == '__main__':
if args.m:
sortSV(args.vcf,args.bam,args.t,True)
else:
sortSV(args.vcf,args.bam,args.t,False)