diff --git a/CHANGELOG.md b/CHANGELOG.md index 81609f7..95d6546 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v0.2.15] +### Fixed +- Bug where template/complement pairs with small negative time between them would not be chosen (rounding error). +### Added +- Option to set --no_end_penalties in filter_pairs. This option favours partial matches and avoids unbounded negative pairing scores + ## [v0.2.14] ### Fixed - Bug where template/complement pairs with no time between them would not be chosen diff --git a/duplex_tools/__init__.py b/duplex_tools/__init__.py index 49822b9..347aeea 100644 --- a/duplex_tools/__init__.py +++ b/duplex_tools/__init__.py @@ -10,7 +10,7 @@ "split_on_adapter", "assess_split_on_adapter", "pairs_from_summary", "filter_pairs"] -__version__ = '0.2.14' +__version__ = '0.2.15' def main(): diff --git a/duplex_tools/filter_pairs.py b/duplex_tools/filter_pairs.py index 51ffc5b..1268dc4 100755 --- a/duplex_tools/filter_pairs.py +++ b/duplex_tools/filter_pairs.py @@ -45,6 +45,7 @@ def filter_candidate_pairs_by_aligning( min_length: int = 1, max_length: float = float('inf'), threads: int = None, + no_end_penalties: bool = False, loglevel: str = "INFO", ) -> None: """Filter candidate read pairs by quality of alignment. @@ -66,6 +67,8 @@ def filter_candidate_pairs_by_aligning( :param max_length: The maximum length of reads to keep, inclusive. Both template and complement need to be shorter than this length. :param threads: number of worker threads. + :param no_end_penalties: Do not penalise ends of complement alignment. + Favours truncated alignments :param loglevel: Level to log at, for example 'INFO' or 'DEBUG'. @@ -107,7 +110,7 @@ def filter_candidate_pairs_by_aligning( alignment_scores_df = align_all_pairs( align_threshold, fastq_index, bases_to_align, pairs, penalty_extend, penalty_open, score_match, score_mismatch, - min_length, max_length) + min_length, max_length, no_end_penalties) # Finally, write full summary and filtered pairs alignment_scores_df.to_csv( @@ -173,7 +176,8 @@ def align_all_pairs( score_match, score_mismatch, min_length, - max_length) -> pd.DataFrame: + max_length, + no_end_penalties) -> pd.DataFrame: """Align read pairs to each other using parasail.""" counter = defaultdict(int) alignment_scores = list() @@ -181,6 +185,8 @@ def align_all_pairs( logger = duplex_tools.get_named_logger("AlignPairs") logger.info(f"Aligning {npairs} pairs") score_matrix = parasail.matrix_create("ACGT", score_match, score_mismatch) + if no_end_penalties: + logger.info("Using --no_end_penalties") for idx, read_pair in enumerate(pairs.itertuples(), 1): if idx % 50000 == 0: done = 100 * idx / npairs @@ -221,7 +227,11 @@ def align_all_pairs( continue # Run a semi-global alignment (sg) with zero end-penalty for seq2 (dx) - result = parasail.sg_dx_trace_scan_16( + if no_end_penalties: + aligner = parasail.sg_trace_scan_16 + else: + aligner = parasail.sg_dx_trace_scan_16 + result = aligner( seq1, seq2, penalty_open, penalty_extend, score_matrix) # scale score to read length @@ -287,7 +297,10 @@ def argparser(): "--threads", default=None, type=int, help=("Number of worker threads. " "Equal to number of logical CPUs by default.")) - + grp.add_argument( + "--no_end_penalties", action="store_true", + help="Do no use end penalties for alignment. Allows truncated " + "complement") return parser @@ -299,4 +312,4 @@ def main(args): args.penalty_open, args.penalty_extend, args.score_match, args.score_mismatch, args.min_length, args.max_length, - args.threads) + args.threads, args.no_end_penalties) diff --git a/duplex_tools/pairs_from_summary.py b/duplex_tools/pairs_from_summary.py index eff5111..cf922ad 100755 --- a/duplex_tools/pairs_from_summary.py +++ b/duplex_tools/pairs_from_summary.py @@ -139,7 +139,7 @@ def seqsummary_to_tempcompsummary( logger = duplex_tools.get_named_logger("FindPairs") # Default filtering seqsummary["candidate_followon"] = ( - (0 <= seqsummary["duration_until_next_start"]) + (-0.01 <= seqsummary["duration_until_next_start"]) & (seqsummary["duration_until_next_start"] < max_time_between_reads)) logger.info(f'{seqsummary["candidate_followon"].sum()} pairs after ' f'filtering on duration between reads. (max '