From 0193f0cf43aef4c8c1d0f6efc2d288379d6dd98a Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Sun, 22 Dec 2024 14:46:55 +0800 Subject: [PATCH] allow truncating fastq reads --- precellar/src/align/aligners.rs | 6 ++++ precellar/src/align/fastq.rs | 18 ++++------ seqspec/src/lib.rs | 2 +- seqspec/src/read.rs | 60 +++++++++++++++++++++++++++++++-- 4 files changed, 71 insertions(+), 15 deletions(-) diff --git a/precellar/src/align/aligners.rs b/precellar/src/align/aligners.rs index b91a31c..be3742a 100644 --- a/precellar/src/align/aligners.rs +++ b/precellar/src/align/aligners.rs @@ -57,6 +57,12 @@ impl MultiMap { self.primary } + /// Whether the read is confidently mapped. A read is confidently mapped if it + /// is mapped to a single location. + pub fn is_confidently_mapped(&self) -> bool { + self.others.is_none() && !self.primary.flags().unwrap().is_unmapped() + } + /// Returns an iterator over all alignments (primary and secondary). pub fn iter(&self) -> impl Iterator { std::iter::once(&self.primary).chain(self.others.iter().flatten()) diff --git a/precellar/src/align/fastq.rs b/precellar/src/align/fastq.rs index c83a085..f358b6c 100644 --- a/precellar/src/align/fastq.rs +++ b/precellar/src/align/fastq.rs @@ -9,12 +9,9 @@ use indexmap::IndexMap; use kdam::{tqdm, BarExt}; use log::info; use noodles::{bam, fastq}; -use seqspec::{Assay, Modality, Read, RegionId, SegmentInfo, SegmentInfoElem}; +use seqspec::{Assay, FastqReader, Modality, Read, RegionId, SegmentInfo, SegmentInfoElem}; use smallvec::SmallVec; -use std::{ - collections::{HashMap, HashSet}, - io::BufRead, -}; +use std::collections::{HashMap, HashSet}; pub struct FastqProcessor { assay: Assay, @@ -98,7 +95,8 @@ impl FastqProcessor { let header = aligner.header(); let mut qc = AlignQC::default(); self.mito_dna.iter().for_each(|mito| { - header.reference_sequences() + header + .reference_sequences() .get_index_of(&BString::from(mito.as_str())) .map(|x| qc.mito_dna.insert(x)); }); @@ -236,7 +234,7 @@ pub struct AnnotatedFastqReader { buffer: fastq::Record, total_reads: Option, trim_poly_a: bool, - inner: Vec<(FastqAnnotator, fastq::Reader>)>, + inner: Vec<(FastqAnnotator, FastqReader)>, } impl AnnotatedFastqReader { @@ -289,10 +287,8 @@ impl AnnotatedFastqReader { } } -impl FromIterator<(FastqAnnotator, fastq::Reader>)> for AnnotatedFastqReader { - fn from_iter>)>>( - iter: T, - ) -> Self { +impl FromIterator<(FastqAnnotator, FastqReader)> for AnnotatedFastqReader { + fn from_iter>(iter: T) -> Self { Self { buffer: fastq::Record::default(), total_reads: None, diff --git a/seqspec/src/lib.rs b/seqspec/src/lib.rs index 30af89e..86fa047 100644 --- a/seqspec/src/lib.rs +++ b/seqspec/src/lib.rs @@ -5,7 +5,7 @@ pub mod utils; use log::warn; use noodles::fastq; use read::ReadValidator; -pub use read::{SegmentInfo, SegmentInfoElem, File, Read, Strand, UrlType}; +pub use read::{FastqReader, SegmentInfo, SegmentInfoElem, File, Read, Strand, UrlType}; use read::{ReadSpan, ValidateResult}; use region::LibSpec; pub use region::{Onlist, Region, RegionId, RegionType, SequenceType}; diff --git a/seqspec/src/read.rs b/seqspec/src/read.rs index 4b3778e..83c6014 100644 --- a/seqspec/src/read.rs +++ b/seqspec/src/read.rs @@ -83,11 +83,61 @@ impl Default for Read { } } +pub struct FastqReader { + reader: fastq::Reader>, + min_len: u32, + max_len: u32, +} + +impl FastqReader { + pub fn new(reader: Box, min_len: u32, max_len: u32) -> Self { + Self { + reader: fastq::Reader::new(reader), + min_len, + max_len, + } + } + + pub fn read_record(&mut self, record: &mut fastq::Record) -> Result { + let n = self.reader.read_record(record)?; + if self.max_len > 0 { + record.quality_scores_mut().truncate(self.max_len as usize); + record.sequence_mut().truncate(self.max_len as usize); + } + Ok(n) + } + + pub fn records(&mut self) -> FastqRecords { + FastqRecords { + inner: self, + } + } +} + +pub struct FastqRecords<'a> { + inner: &'a mut FastqReader, +} + +impl<'a> Iterator for FastqRecords<'a> +{ + type Item = Result; + + fn next(&mut self) -> Option { + let mut buf = fastq::Record::default(); + + match self.inner.read_record(&mut buf) { + Ok(0) => None, + Ok(_) => Some(Ok(buf)), + Err(e) => Some(Err(e)), + } + } +} + impl Read { /// Open the fastq files for reading, and return a fastq reader. /// If the read has multiple fastq files, they will be concatenated. /// If the read has no fastq files, return None. - pub fn open(&self) -> Option>> { + pub fn open(&self) -> Option { let files = self .files .clone() @@ -100,12 +150,16 @@ impl Read { } let reader = multi_reader::MultiReader::new(files.into_iter().map(move |file| file.open().unwrap())); - Some(fastq::Reader::new(Box::new(BufReader::new(reader)))) + Some(FastqReader::new( + Box::new(BufReader::new(reader)), + self.min_len, + self.max_len, + )) } /// Get the actual length of the read by reading the first record from the fastq file. pub fn actual_len(&self) -> Result { - let mut reader = self.open().unwrap(); + let mut reader = self.open().unwrap().reader; let mut record = fastq::Record::default(); reader.read_record(&mut record)?; Ok(record.sequence().len())