Skip to content

Commit

Permalink
Fix memory access/corruption error when scoring alignments with seque…
Browse files Browse the repository at this point in the history
…nces that end in stop codons.

When alignments are scored, any stop codons that are at the end of sequences (ignoring gaps) are moved to the end of the alignment.
  • Loading branch information
reedacartwright committed Jan 23, 2024
1 parent 5888fbb commit 64025c7
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 24 deletions.
3 changes: 2 additions & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ project('COATI', 'cpp',
version : '1.0.0',
license : 'MIT',
meson_version : '>=0.58.0',
default_options: [ 'buildtype=debugoptimized', 'cpp_std=c++17' ])
default_options: [ 'buildtype=debugoptimized',
'cpp_std=c++17', 'b_ndebug=if-release' ])

subdir('contrib')
subdir('src')
Expand Down
2 changes: 1 addition & 1 deletion src/include/coati/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ void order_ref(coati::alignment_t& aln);
// Validate input sequences
void process_marginal(coati::alignment_t& aln);
// Validate input pairwise alignment for scoring
void process_alignment(coati::alignment_t& aln);
std::string process_alignment(coati::alignment_t& aln);
// Trim end stop codons
void trim_end_stops(coati::data_t& data);
// Restore end stop codons
Expand Down
44 changes: 27 additions & 17 deletions src/lib/align_marginal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -371,12 +371,12 @@ TEST_CASE("marg_alignment") {
* @retval float alignment score.
*/
float alignment_score(coati::alignment_t& aln, const coati::Matrixf& p_marg) {
std::vector<std::string> seqs = aln.data.seqs;

coati::utils::process_alignment(aln);
// Remove gaps from alignment and return an expanded cigar string of
// alignment
std::string cigar = coati::utils::process_alignment(aln);

coati::utils::sequence_pair_t seq_pair =
coati::utils::marginal_seq_encoding(aln.data.seqs[0], seqs[1]);
coati::utils::marginal_seq_encoding(aln.data.seqs[0], aln.data.seqs[1]);

coati::semiring::tropical trig; // tropical rig
// calculate log(1-g) log(1-e) log(g) log(e) log(pi)
Expand All @@ -391,30 +391,35 @@ float alignment_score(coati::alignment_t& aln, const coati::Matrixf& p_marg) {
enum struct State { MATCH, GAP };
auto state = State::MATCH;
float score{0.f};
size_t ngap{0}, nins{0}, ndel{0}, len_stops{0};
if(aln.data.stops[0] != "" || aln.data.stops[1] != "") {
len_stops = 3;
}
for(size_t i = 0; i < seqs[0].length() - len_stops; i++) {
size_t nins{0}, ndel{0};
size_t apos = 0, bpos = 0;

for(size_t i = 0; i < cigar.length(); i++) {
switch(state) {
case State::MATCH:
if(seqs[0][i] == '-') { // insertion;
if(cigar[i] == 'I') { // insertion;
nins++;
bpos++;
state = State::GAP;
} else if(seqs[1][i] == '-') { // deletion;
} else if(cigar[i] == 'D') { // deletion;
ndel++;
apos++;
state = State::GAP;
} else { // match/mismatch;
score =
trig.times(score, no_gap, no_gap,
p_marg(seq_pair[0][i - ngap], seq_pair[1][i]));
p_marg(seq_pair[0][apos], seq_pair[1][bpos]));
apos++;
bpos++;
}
break;
case State::GAP:
if(seqs[0][i] == '-') { // insertion_extension
if(cigar[i] == 'I') { // insertion_extension
nins++;
} else if(seqs[1][i] == '-') { // deletion_ext
bpos++;
} else if(cigar[i] == 'D') { // deletion_ext
ndel++;
apos++;
} else { // match/mismatch
assert(nins > 0 || ndel > 0);
if(nins == 0) { // score deletions
Expand All @@ -430,15 +435,18 @@ float alignment_score(coati::alignment_t& aln, const coati::Matrixf& p_marg) {
trig.power(gap_extend, nins + ndel - 2),
gap_stop, gap_stop);
}
ngap += nins;
score = trig.times(
score, p_marg(seq_pair[0][i - ngap], seq_pair[1][i]));
score, p_marg(seq_pair[0][apos], seq_pair[1][bpos]));
nins = ndel = 0;
state = State::MATCH;
apos++;
bpos++;
}
break;
}
}
assert(apos == seq_pair[0].length());
assert(bpos == seq_pair[1].length());
// terminal state score
if(state == State::MATCH) {
score = trig.times(score, no_gap, no_gap);
Expand Down Expand Up @@ -483,7 +491,6 @@ TEST_CASE("alignment_score") {
test("ACTCT-A", "ACTCTG-", -10.52864);
test("ATGCTTTAC", "ATGCT-TAC", 2.13593f);
test("ATGCTT---", "ATGCTTTGA", 0.70607f);
test("ACTCTA-", "ACTCTAG", -1.57693f);
test("A-CTAAC", "ACCTAAG", -8.2786f);
test("ACT---", "ACTCTG", -5.04197);
test("ACTCTA", "ACT---", -5.04197);
Expand All @@ -494,6 +501,9 @@ TEST_CASE("alignment_score") {
test("AAAAAA---", "AAAAAAAAA", -2.03242);
test("AAAAAAAAA", "---AAAAAA", -2.03242);
test("AAAAAAAAA", "AAAAAA---", -2.03242);
test("ACTCTA", "ACTC--", -3.18537f);
// This alignment will be scored as ACTCTA--- / ACTC--TAG
test("ACTCTA-", "ACTCTAG", -10.45777f);

// NOLINTNEXTLINE(misc-unused-parameters)
auto test_fail = [](const std::string& anc, const std::string& des) {
Expand Down
44 changes: 39 additions & 5 deletions src/lib/utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -841,8 +841,9 @@ void process_marginal(coati::alignment_t& aln) {
*
* @param[in,out] aln coati::alignment_t input sequences and alignment info.
*
* @retval std::string an expanded cigar string of alignment.
*/
void process_alignment(coati::alignment_t& aln) {
std::string process_alignment(coati::alignment_t& aln) {
if(aln.data.size() != 2) {
throw std::invalid_argument("Exactly two sequences required.");
}
Expand All @@ -852,15 +853,49 @@ void process_alignment(coati::alignment_t& aln) {
order_ref(aln);
}

size_t len_a = aln.seq(0).length();
size_t len_b = aln.seq(1).length();
size_t len_a = aln.data.seqs[0].length();
size_t len_b = aln.data.seqs[1].length();

// check that both sequences have equal length
if(len_a != len_b) {
throw std::invalid_argument(
"For alignment scoring both sequences must have equal length.");
}

// trim final codons considering alignment we will replace with gaps
// and then remove the gaps if necessary
for(size_t i = 0; i < aln.data.size(); ++i) {
std::string_view seq{aln.data.seqs[i]};
size_t pos = seq.find_last_not_of('-');
if(pos == std::string::npos || pos < 2) {
aln.data.stops.emplace_back("");
continue;
}
auto last_cod = seq.substr(pos - 2, 3);
int cod = cod_int(last_cod);
if(cod == 48 || cod == 50 || cod == 56) {
aln.data.stops.emplace_back(last_cod);
aln.data.seqs[i].replace(pos - 2, 3, 3, '-');
} else {
aln.data.stops.emplace_back("");
}
}

// create an expanded cigar string of the alignment
std::string cigar;
cigar.reserve(len_a);
for(size_t i = 0; i < len_a; ++i) {
auto a = aln.data.seqs[0][i];
auto b = aln.data.seqs[1][i];
if(a != '-' && b != '-') {
cigar.push_back('M');
} else if(a != '-' && b == '-') {
cigar.push_back('D');
} else if(a == '-' && b != '-') {
cigar.push_back('I');
}
}

// remove gaps
boost::algorithm::erase_all(aln.data.seqs[0], "-");
boost::algorithm::erase_all(aln.data.seqs[1], "-");
Expand All @@ -883,8 +918,7 @@ void process_alignment(coati::alignment_t& aln) {
"length.");
}

// handle ending stop codons
trim_end_stops(aln.data);
return cigar;
}

/**
Expand Down

0 comments on commit 64025c7

Please sign in to comment.