-
Notifications
You must be signed in to change notification settings - Fork 0
/
frequency_scanner.py
244 lines (203 loc) · 8.05 KB
/
frequency_scanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# Frequency_Scanner takes fastq files and calculates the frequency of every type of single nucletodie change
# Copyright (C) 2019 Ryan C Shean
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import glob
import subprocess
import argparse
import sys
def up_count(s1, s2, c1, c2):
global a_count
global c_count
global t_count
global g_count
global a_to_c
global a_to_t
global a_to_g
global c_to_a
global c_to_t
global c_to_g
global t_to_a
global t_to_c
global t_to_g
global g_to_a
global g_to_c
global g_to_t
if s1 == 'A':
a_count += c1
if s2 == 'C':
a_to_c += c2
elif s2 == 'T':
a_to_t += c2
elif s2 == 'G':
a_to_g += c2
elif s1 == 'C':
c_count += c1
if s2 == 'A':
c_to_a += c2
elif s2 == 'T':
c_to_t += c2
elif s2 == 'G':
c_to_g += c2
elif s1 == 'T':
t_count += c1
if s2 == 'A':
t_to_a += c2
elif s2 == 'C':
t_to_c += c2
elif s2 == 'G':
t_to_g += c2
elif s1 == 'G':
g_count += c1
if s2 == 'A':
g_to_a += c2
elif s2 == 'C':
g_to_c += c2
elif s2 == 'T':
g_to_t += c2
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Frequency Scanner is used to calculate relative frequencies nucleotide transitions and transversions of low frequency random mutations. Takes a reference fasta and aligns all fasta files in the working directory to the reference and calculates mutational frequeinces based off this. If you are using paired end reads you need to have them fully merged at input.')
parser.add_argument('reference_fasta', help='You need a reference fasta to align all your reads to. For best results this should be the majority consensus. Although you will get decent output so long as your input reads can align to this file')
parser.add_argument('-qual', help='Specify a qualtiy filter, any read that has any base with a quality score lower than this will be completely removed. Default: 34')
parser.add_argument('-af', help='Mutations with a frequency greater than this will not be counted. Default 0.05')
parser.add_argument('-gen_len', help='Per genome adjustment, default 10,000')
args = parser.parse_args()
if args.qual:
qual = int(args.qual)
else:
qual = 34
if args.af:
af = float(args.af)
else:
af = 0.05
if args.gen_len:
gen_len = float(args.gen_len)
else:
gen_len = 10000.0
ref_fasta = args.reference_fasta
subprocess.call('bwa index ' + ref_fasta, shell=True)
for name in glob.glob('*.fastq'):
base = name.split('.')[0]
out = '-'.join(base.split('-')[0:3]) + '-' + str(qual)
subprocess.call('fastq_quality_filter -q ' + str(qual) + ' -p 100 -i ' + name + ' -o ' + out + '.fastq', shell=True)
subprocess.call('bwa mem ' + ref_fasta + ' ' + out + '.fastq > ' + base + '.bam',shell=True)
subprocess.call('samtools sort ' + base + '.bam -o ' + base + '_sorted.bam', shell=True)
subprocess.call('./bin/bam-readcount -f ' + ref_fasta + ' ' + base + '_sorted.bam -w 0 > ' + base + '.vcf', shell=True)
for filepath in glob.glob('*.vcf'):
a_count = 0
c_count = 0
t_count = 0
g_count = 0
a_to_c = 0
a_to_t = 0
a_to_g = 0
c_to_a = 0
c_to_t = 0
c_to_g = 0
t_to_a = 0
t_to_c = 0
t_to_g = 0
g_to_a = 0
g_to_c = 0
g_to_t = 0
for line in open(filepath):
# not a comment at the header
line_split = line.split('\t')
temp_dp = float(line_split[3])
temp_ref = line_split[2]
temp_a_count = int(line_split[5].split(':')[1])
temp_c_count = int(line_split[6].split(':')[1])
temp_g_count = int(line_split[7].split(':')[1])
temp_t_count = int(line_split[8].split(':')[1])
if temp_ref == 'A':
a_count += temp_a_count
if temp_t_count / temp_dp <= af:
a_to_t += temp_t_count
if temp_c_count / temp_dp <= af:
a_to_c += temp_c_count
if temp_g_count / temp_dp <= af:
a_to_g += temp_g_count
elif temp_ref == 'C':
c_count += temp_c_count
if temp_a_count / temp_dp <= af:
c_to_a += temp_a_count
if temp_t_count / temp_dp <= af:
c_to_t += temp_t_count
if temp_g_count / temp_dp <= af:
c_to_g += temp_g_count
elif temp_ref == 'G':
g_count += temp_g_count
if temp_a_count / temp_dp <= af:
g_to_a += temp_a_count
if temp_c_count / temp_dp <= af:
g_to_c += temp_c_count
if temp_t_count / temp_dp <= af:
g_to_t += temp_t_count
elif temp_ref == 'T':
t_count += temp_t_count
if temp_a_count / temp_dp <= af:
t_to_a += temp_a_count
if temp_c_count / temp_dp <= af:
t_to_c += temp_c_count
if temp_g_count / temp_dp <= af:
t_to_g += temp_g_count
total = a_count + c_count + t_count + g_count# + a_to_c + a_to_t + a_to_g + c_to_a + c_to_t + c_to_g + t_to_a + t_to_c + t_to_g + g_to_a + g_to_c + g_to_t
print('Total Bases counted: ' + str(total))
if total != 0:
total = total / gen_len
else:
total = 1
print()
print('A count: ' + str(a_count/total))
print('\tA -> C: ' + str(a_to_c/total))
print('\tA -> T: ' + str(a_to_t/total))
print('\tA -> G: ' + str(a_to_g/total))
print('')
print('C count: ' + str(c_count/total))
print('\tC -> A: ' + str(c_to_a/total))
print('\tC -> T: ' + str(c_to_t/total))
print('\tC -> G: ' + str(c_to_g/total))
print('')
print('T count: ' + str(t_count/total))
print('\tT -> A: ' + str(t_to_a/total))
print('\tT -> C: ' + str(t_to_c/total))
print('\tT -> G: ' + str(t_to_g/total))
print('')
print('G count: ' + str(g_count/total))
print('\tG -> A: ' + str(g_to_a/total))
print('\tG -> C: ' + str(g_to_c/total))
print('\tG -> T: ' + str(g_to_t/total))
info = filepath.split('.')[0]
print(info)
samp = info
con =''# info.split('_')[0]
prep =''# info.split('_')[1]
passage =''# info.split('_#')[1]
g = open(base + '.csv', 'w')
g.write(samp + ',' + prep + ',' + con + ',total,' + str(total) +','+ str(af)+ ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',A,' + str(a_count) +','+ str(af)+ ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',AC,' + str(a_to_c/total) +','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',AT,' + str(a_to_t/total) + ','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',AG,' + str(a_to_g/total) + ','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',C,' + str(c_count) +','+ str(af)+ ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',CA,' + str(c_to_a/total) + ','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',CT,' + str(c_to_t/total) + ','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',CG,' + str(c_to_g/total) + ','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',T,' + str(t_count) +','+ str(af)+ ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',TA,' + str(t_to_a/total) + ','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',TC,' + str(t_to_c/total) +','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',TG,' + str(t_to_g/total) + ','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',G,' + str(g_count) +','+ str(af)+ ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',GA,' + str(g_to_a/total) + ','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',GC,' + str(g_to_c/total) + ','+ str(af) + ',' + passage +'\n')
g.write(samp + ',' + prep + ',' + con + ',GT,' + str(g_to_t/total) + ','+ str(af) + ',' + passage +'\n')
g.close()
subprocess.call('cat *.vcf > final_output.csv', shell = True )