Commit

Update README.md/compatibility with pyfastx >= 0.9
ollenordesjo committed Feb 1, 2023
1 parent 1006137 commit b509909
Showing 7 changed files with 13 additions and 11 deletions.
CHANGELOG.md (7 additions, 0 deletions)

@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.3.0]
+### Changed
+- update fastx iteration in `split_pairs` to be compatible with pyfastx>=0.9.0.
+### Fixed
+- Bug where `split_pairs` would raise a StopIteration if dataset has < 5k reads.
+
+
 ## [v0.2.20]
 ### Added
 - `split_pairs`, a tool to recover non-split reads into their template/complement parts.
README.md (1 addition, 1 deletion)

@@ -63,7 +63,7 @@ This will create an (unmapped) .sam file which has a mapping between the signal
 ### 2a) Find duplex pairs for Dorado stereo/basespace basecalling
 This will detect the majority of pairs and put them in the `pairs_from_bam` directory.
 
-    duplex_tools pair unmapped_reads_with_moves.bam pairs_from_bam/
+    duplex_tools pair --output_dir pairs_from_bam unmapped_reads_with_moves.bam
 
 
 ### 2b) Find additional duplex pairs in non-split reads (optional)
duplex_tools/__init__.py (1 addition, 1 deletion)

@@ -10,7 +10,7 @@
     "split_on_adapter", "assess_split_on_adapter",
     "pairs_from_summary", "filter_pairs", "pair", "split_pairs"]
 
-__version__ = '0.2.20'
+__version__ = '0.3.0'
 
 
 def main():
duplex_tools/filter_pairs.py (1 addition, 1 deletion)

@@ -149,7 +149,7 @@ def scrape_sequences(file, first, second, n_bases):
 
 
 def read_all_sequences(reads_directory, pairs, n_bases, threads=None):
-    """Find an read all necessary data from fastq or bam files."""
+    """Find and read all necessary data from fastq or bam files."""
     logger = duplex_tools.get_named_logger("ReadFastq")
     first = set(pairs["first"])
     second = set(pairs["second"])
duplex_tools/split_on_adapter.py (1 addition, 1 deletion)

@@ -138,7 +138,7 @@ def process_file(
     with gzip.open(newfastx, mode='wt', compresslevel=1) as outfh:
 
         for read_id, seq, qual, comments in \
-                tqdm(Fastx(str(fastx)), leave=False):
+                tqdm(Fastx(str(fastx), comment=True), leave=False):
             result = find_mid_adaptor(
                 seq, targets,
                 print_alignment=print_alignment,
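The `comment=True` change matters because pyfastx >= 0.9 no longer includes the read comment in the tuples it yields by default, so the four-way unpacking in `process_file` fails until the flag is passed. A minimal sketch of the arity mismatch, using a hypothetical `fastx_iter` stand-in rather than the real `pyfastx.Fastx` class:

```python
def fastx_iter(records, comment=False):
    """Hypothetical stand-in mimicking pyfastx.Fastx iteration:
    3-tuples by default, 4-tuples when comment=True (assumed >=0.9 behaviour)."""
    for name, seq, qual, com in records:
        yield (name, seq, qual, com) if comment else (name, seq, qual)

records = [("read1", "ACGT", "!!!!", "ch=12 start_time=0")]

# Without comment=True, the old 4-way unpacking breaks on a 3-tuple:
try:
    for read_id, seq, qual, comments in fastx_iter(records):
        pass
except ValueError:
    print("3-tuple cannot unpack into 4 names")

# With comment=True, the unpacking works as before:
for read_id, seq, qual, comments in fastx_iter(records, comment=True):
    print(read_id, comments)  # → read1 ch=12 start_time=0
```

The fix keeps the comment field (used downstream when rewriting fastq headers) available without changing the loop body.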
duplex_tools/split_pairs_steps.py (1 addition, 6 deletions)

@@ -6,7 +6,6 @@
 from collections import defaultdict
 from concurrent.futures import as_completed, ProcessPoolExecutor
 from functools import partial
-from itertools import chain
 from pathlib import Path
 import random
 import uuid
@@ -58,11 +57,7 @@ def get_split_points(
     with ProcessPoolExecutor(threads) as pool:
         with pysam.AlignmentFile(input_dorado_xam, check_sq=False) as f:
             it = f.fetch(until_eof=True)
-            chunk1 = [next(it) for _ in range(400)]
-            chunk2 = [next(it) for _ in range(600)]
-            chunk3 = [next(it) for _ in range(4000)]
-            iterator = chain([chunk1, chunk2, chunk3],
-                             chunked(it, chunk_size))
+            iterator = chunked(it, chunk_size)
 
             for idx, chunk in enumerate(iterator):
                 if finished:
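The removed eager pre-chunking is the source of the StopIteration bug noted in the changelog: it calls `next(it)` 400 + 600 + 4000 times unconditionally, so any dataset with fewer than 5k reads exhausts the iterator mid-comprehension. A sketch of the failure mode, using a small stdlib reimplementation of the `chunked` helper (assumed to behave like `more_itertools.chunked`):

```python
from itertools import islice

def chunked(iterable, size):
    """Yield successive lists of up to `size` items; the last may be short."""
    it = iter(iterable)
    while chunk := list(islice(it, size)):
        yield chunk

reads = iter(range(100))  # stands in for a BAM with fewer than 5k records

# Old approach: unconditional next() calls blow up on short inputs.
try:
    chunk1 = [next(reads) for _ in range(400)]
except StopIteration:
    print("eager pre-chunking failed on a small dataset")

# New approach: lazy chunking simply yields a final short chunk.
print([len(c) for c in chunked(range(100), 40)])  # → [40, 40, 20]
```

Dropping the hand-built warm-up chunks also removes the need for `itertools.chain`, hence the deleted import.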
requirements.txt (1 addition, 1 deletion)

@@ -7,6 +7,6 @@ numpy
 pandas
 parasail
 pod5
-pyfastx
+pyfastx>=0.9.0
 pysam
 tqdm