-
Notifications
You must be signed in to change notification settings - Fork 0
/
Alignment Scoring Script.py
245 lines (190 loc) · 10.7 KB
/
Alignment Scoring Script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
"""
# Importing relevant packages (Biopython, numpy, and os)
from Bio import AlignIO
import numpy as np
import os
from natsort import natsorted
import csv
# Provide the paths to the reference and realignment directories
refer_path = ("/home/isaac/Desktop/Test_Simulations/Results/A100_Samples/Reference_Alignments/")
reali_path = ("/home/isaac/Desktop/Test_Simulations/Results/A100_Samples/MAFFT_Alignments/")
# Create a list of the 100 simulations performed
simulation_list = list(range(0, 100))
# Returns the path for the alignment file that will be analyzed
def Get_Alignment_File_Path(simulation_number, alignment_directory_path):
alignment_files_list = os.listdir(alignment_directory_path)
alignment_files_list = natsorted(alignment_files_list)
alignment_filename = alignment_files_list[simulation_number]
alignment_file_path = str(alignment_directory_path) + str(alignment_filename)
return alignment_file_path
# Imports MSA file and sorts sequence IDs by the same order
def Open_MSA(file_path):
alignment = AlignIO.read(file_path, "fasta")
alignment.sort()
return alignment
# Converts any lowercase nucleotides to uppercase
def Make_UpperCase(alignment):
for i in range(100):
sequence = alignment[i].seq
sequence = sequence.upper()
alignment[i].seq = sequence
return alignment
# Calculates the number of columns in an alignment
def len_alignment_fn(alignment):
len_alignment = len(alignment[0].seq)
return len_alignment
# Represent MSA as an array of nucleotide positions
"""
For example, the alignment:
[ , 0] [ , 1] [ , 2] [ , 3] [ , 4] [ , 5] ...... [ , 1574] [ , 1575]
[0, ] A T G A - T ...... C -
[1, ] A - 0 A C T ...... - T
[2, ] A T G A C A ...... - -
[3, ] - - - G C A ...... T T
[4, ] C T G 0 - 4 ...... T T
[5, ] A - - G T A ...... C -
..... ... ... ... ... ... ... ...... ... ...
..... ... ... ... ... ... ... ...... ... ...
..... ... ... ... ... ... ... ...... ... ...
[99, ] C T G A - A ...... C T
Would be represented as:
[ , 0] [ , 1] [ , 2] [ , 3] [ , 4] [ , 5] ...... [ , 1574] [ , 1575]
[0, ] 1 2 3 4 0 5 ...... 1500 0
[1, ] 1 0 0 2 3 4 ...... 0 1500
[2, ] 1 2 3 4 5 6 ...... 0 0
[3, ] 0 0 0 1 2 3 ...... 1500 1501
[4, ] 1 2 3 0 0 4 ...... 1499 1500
[5, ] 1 0 0 2 3 5 ...... 1500 0
..... ... ... ... ... ... ... ...... ... ...
..... ... ... ... ... ... ... ...... ... ...
..... ... ... ... ... ... ... ...... ... ...
[99, ] 1 2 3 4 0 5 ...... 1499 1500
In this array, rows [0, ] to [99, ] represent sequences 1-100, while columns
[ , 0] to [ , 1574] represent the nucleotide positions. In the array, a value of
0 represents a gap ("-") at that position. For example, the 0 at [0, 4] means
that there is a gap in the 5th position/column of the first sequence.
Values of 1 and greater means that the cell is occupied by a nucleotide, and
not a gap. This number is also the gap free position. For example, [3, 3] has a
value of 1, meaning that the 1st nucleotide in sequence 4 is in column 4
(this is because the first 3 columns in this sequence are gaps).
The maximum value in each row is normally 1500, but it varies depending on how
many indels has experienced.
"""
def Get_Nucleotide_Positions(alignment, len_alignment):
# Create a zero array with the same dimensions as the MSA
reference_nucleotide_positions = np.zeros((100, len_alignment), dtype=int)
# For loop through the MSA, assigning a 0 to cells with gaps, and 1-~1500
# for cells with nucleotides
gap_count = int()
for i in range(100):
sequence = alignment[i].seq
for j in range(len_alignment):
residue = sequence[j]
if residue == "-":
reference_nucleotide_positions[i, j] = 0
else:
gap_count = (alignment[i].seq[0:j]).count("-")
cell_column = j
reference_nucleotide_positions[i, j] = cell_column + 1 - gap_count
return reference_nucleotide_positions
# Calculates reference sequence nucleotide positions
def Get_Refer_Sequence_Nucleotide_Positions(alignment_positions, len_alignment):
# Create a list, refer_seq_nuc_pos, to store the nucleotide positions of the
# reference sequence. Sequence ID 1 (alignment_positions[0, ]) will be used
# as the reference
refer_sequence = list(alignment_positions[0, ])
refer_sequence_nuc_count = len_alignment - refer_sequence.count(0)
refer_sequence_nuc_pos = [0] * refer_sequence_nuc_count
j = 0
for i in range (len_alignment):
value = refer_sequence[i]
if value > 0:
refer_sequence_nuc_pos[j] = i
j = j + 1
else:
value = value
return refer_sequence_nuc_pos
# Calculate reference alignment score
def Get_Refer_Alignment_Column_Score(reference_alignment_nucleotide_positions, ref_seq_nuc_pos):
refer_column_alignment_score = [0] * len(ref_seq_nuc_pos)
for i in range(len(ref_seq_nuc_pos)):
column = ref_seq_nuc_pos[i]
column_gap_count = (list(reference_alignment_nucleotide_positions[:, column])).count(0)
column_score = 100 - column_gap_count
refer_column_alignment_score[i] = column_score
return refer_column_alignment_score
# Score realignment sequence columns against reference sequence columns
def Get_Realignment_Column_Scores(reference_columns, realignment_columns, ref_seq_nuc_pos, reali_seq_nuc_pos):
reali_col_score = [0] * len(ref_seq_nuc_pos)
for i in range(len(ref_seq_nuc_pos)):
ref_col_seq = reference_columns[:, i]
reali_col_seq = realignment_columns[:, i]
for j in range(100):
ref_value = ref_col_seq[j]
reali_value = reali_col_seq[j]
if ref_value == 0:
ref_value = ref_value
elif ref_value > 0:
if ref_value == reali_value:
score = reali_col_score[i] + 1
reali_col_score[i] = score
elif ref_value != reali_value:
if reali_value == 0:
score = reali_col_score[i] - 5
reali_col_score[i] = score
elif reali_value != 0:
score = reali_col_score[i] - 1
reali_col_score[i] = score
return reali_col_score
# Create a list to store the alignment score difference for all 100 alignments
alignment_score_difference = [0] * 100
# Create a csv to save the alignment scores into
with open('Alignment Score Results.csv', 'a', newline = '') as csvfile:
fieldnames = ['Reference_Score', 'Realignment_Score', 'Alignment_Score_Difference']
alignmentwriter = csv.DictWriter(csvfile, fieldnames = fieldnames)
alignmentwriter.writeheader()
# Calculates the alignment score of reference alignment
def Alignment_Score_Differenence_Calculator(simulation_number):
# Upload the reference and realignment files that will be analyzed
refer_file_path = Get_Alignment_File_Path(simulation_number, refer_path)
refer_alignment = Open_MSA(refer_file_path)
refer_alignment_upper = Make_UpperCase(refer_alignment)
reali_file_path = Get_Alignment_File_Path(simulation_number, reali_path)
reali_alignment = Open_MSA(reali_file_path)
reali_alignment_upper = Make_UpperCase(reali_alignment)
# Get the lengths of the reference and realignment alignments
len_refer = len_alignment_fn(refer_alignment_upper)
len_reali = len_alignment_fn(reali_alignment_upper)
# Get the column numbers of the reference sequence's nucleotide positions
# (refer_sequence_positions), the nucleotide composition of that entire
# column (refer_columns), the score for each of those columns
# (refer_alignment_column_scores), and the reference alignment score (refer_alignment_score)
refer_alignment_positions = Get_Nucleotide_Positions(refer_alignment_upper, len_alignment = len_refer)
refer_sequence_positions = Get_Refer_Sequence_Nucleotide_Positions(refer_alignment_positions, len_refer)
refer_columns = refer_alignment_positions[:, list(refer_sequence_positions)]
refer_alignment_column_scores = Get_Refer_Alignment_Column_Score(refer_alignment_positions, refer_sequence_positions)
# Get the column numbers of the reference sequence's nucleotide positions
# within the realignment alignment (reali_sequence_positions), and the nucleotide
# composition of those column (reali_columns), the score for each of those columns
# (refer_alignment_column_scores)
reali_alignment_positions = Get_Nucleotide_Positions(reali_alignment_upper, len_alignment = len_reali)
reali_sequence_positions = Get_Refer_Sequence_Nucleotide_Positions(reali_alignment_positions, len_reali)
reali_columns = reali_alignment_positions[:, list(reali_sequence_positions)]
# Get the reference alignment score, the individual realignment column scores
# (reali_alignment_column_scores), the realignment score, and the alignment
# score difference (alignment_score_difference)
reali_alignment_column_scores = Get_Realignment_Column_Scores(refer_columns, reali_columns, refer_sequence_positions, reali_sequence_positions)
reference_alignment_score = sum(refer_alignment_column_scores)
realignment_alignment_score = sum(reali_alignment_column_scores)
alignment_score_difference[simulation_number] = reference_alignment_score - realignment_alignment_score
print(alignment_score_difference[simulation_number])
# Save the alignment score results to a csv file
with open('Alignment Score Results.csv', 'a', newline = '') as csvfile:
alignmentwriter = csv.writer(csvfile, delimiter = ',',
quotechar = '|', quoting=csv.QUOTE_MINIMAL)
alignmentwriter.writerow([reference_alignment_score, realignment_alignment_score, alignment_score_difference[simulation_number]])
for simulation_number in simulation_list:
Alignment_Score_Differenence_Calculator(simulation_number)