Skip to content

Commit

Permalink
Add option to use no end penalties. This option favours partial matches
Browse files Browse the repository at this point in the history
  • Loading branch information
ollenordesjo committed Nov 11, 2022
1 parent 31f5f3e commit 5a4e32a
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 7 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.2.15]
### Fixed
- Bug where template/complement pairs with small negative time between them would not be chosen (rounding error).
### Added
- Option to set --no_end_penalties in filter_pairs. This option favours partial matches and avoids unbounded negative pairing scores

## [v0.2.14]
### Fixed
- Bug where template/complement pairs with no time between them would not be chosen
Expand Down
2 changes: 1 addition & 1 deletion duplex_tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"split_on_adapter", "assess_split_on_adapter",
"pairs_from_summary", "filter_pairs"]

__version__ = '0.2.14'
__version__ = '0.2.15'


def main():
Expand Down
23 changes: 18 additions & 5 deletions duplex_tools/filter_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def filter_candidate_pairs_by_aligning(
min_length: int = 1,
max_length: float = float('inf'),
threads: int = None,
no_end_penalties: bool = False,
loglevel: str = "INFO",
) -> None:
"""Filter candidate read pairs by quality of alignment.
Expand All @@ -66,6 +67,8 @@ def filter_candidate_pairs_by_aligning(
:param max_length: The maximum length of reads to keep, inclusive.
Both template and complement need to be shorter than this length.
:param threads: number of worker threads.
:param no_end_penalties: Do not penalise ends of complement alignment.
Favours truncated alignments
:param loglevel: Level to log at, for example 'INFO' or 'DEBUG'.
Expand Down Expand Up @@ -107,7 +110,7 @@ def filter_candidate_pairs_by_aligning(
alignment_scores_df = align_all_pairs(
align_threshold, fastq_index, bases_to_align, pairs,
penalty_extend, penalty_open, score_match, score_mismatch,
min_length, max_length)
min_length, max_length, no_end_penalties)

# Finally, write full summary and filtered pairs
alignment_scores_df.to_csv(
Expand Down Expand Up @@ -173,14 +176,17 @@ def align_all_pairs(
score_match,
score_mismatch,
min_length,
max_length) -> pd.DataFrame:
max_length,
no_end_penalties) -> pd.DataFrame:
"""Align read pairs to each other using parasail."""
counter = defaultdict(int)
alignment_scores = list()
npairs = len(pairs)
logger = duplex_tools.get_named_logger("AlignPairs")
logger.info(f"Aligning {npairs} pairs")
score_matrix = parasail.matrix_create("ACGT", score_match, score_mismatch)
if no_end_penalties:
logger.info("Using --no_end_penalties")
for idx, read_pair in enumerate(pairs.itertuples(), 1):
if idx % 50000 == 0:
done = 100 * idx / npairs
Expand Down Expand Up @@ -221,7 +227,11 @@ def align_all_pairs(
continue

# Run a semi-global alignment (sg) with zero end-penalty for seq2 (dx)
result = parasail.sg_dx_trace_scan_16(
if no_end_penalties:
aligner = parasail.sg_trace_scan_16
else:
aligner = parasail.sg_dx_trace_scan_16
result = aligner(
seq1, seq2, penalty_open, penalty_extend, score_matrix)

# scale score to read length
Expand Down Expand Up @@ -287,7 +297,10 @@ def argparser():
"--threads", default=None, type=int,
help=("Number of worker threads. "
"Equal to number of logical CPUs by default."))

grp.add_argument(
"--no_end_penalties", action="store_true",
help="Do no use end penalties for alignment. Allows truncated "
"complement")
return parser


Expand All @@ -299,4 +312,4 @@ def main(args):
args.penalty_open, args.penalty_extend,
args.score_match, args.score_mismatch,
args.min_length, args.max_length,
args.threads)
args.threads, args.no_end_penalties)
2 changes: 1 addition & 1 deletion duplex_tools/pairs_from_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def seqsummary_to_tempcompsummary(
logger = duplex_tools.get_named_logger("FindPairs")
# Default filtering
seqsummary["candidate_followon"] = (
(0 <= seqsummary["duration_until_next_start"])
(-0.01 <= seqsummary["duration_until_next_start"])
& (seqsummary["duration_until_next_start"] < max_time_between_reads))
logger.info(f'{seqsummary["candidate_followon"].sum()} pairs after '
f'filtering on duration between reads. (max '
Expand Down

0 comments on commit 5a4e32a

Please sign in to comment.