diff --git a/python/src/pyseqspec.rs b/python/src/pyseqspec.rs index f0092bf..711b775 100644 --- a/python/src/pyseqspec.rs +++ b/python/src/pyseqspec.rs @@ -125,9 +125,13 @@ impl Assay { self.0.delete_read(read_id); } - fn validate(&self, modality: &str, out_dir: PathBuf) -> Result<()> { + #[pyo3( + signature = (modality, out_dir, *, tolerance=0.2), + text_signature = "($self, modality, out_dir, *, tolerance=None)", + )] + fn validate(&self, modality: &str, out_dir: PathBuf, tolerance: f64) -> Result<()> { let modality = Modality::from_str(modality)?; - self.0.validate(modality, out_dir) + self.0.validate(modality, out_dir, tolerance) } /* diff --git a/seqspec/Cargo.toml b/seqspec/Cargo.toml index 50d2e1d..90cbec9 100644 --- a/seqspec/Cargo.toml +++ b/seqspec/Cargo.toml @@ -12,7 +12,6 @@ log = "0.4" hamming = "0.1" noodles = { version = "0.80", features = ["core", "fastq", "async"] } reqwest = { version = "0.12", features = ["blocking"] } -rayon = "1.10" serde = { version = "1.0", features = ["derive", "rc"] } serde_yaml = "0.9" zstd = { version = "0.13", features = ["zstdmt"] } diff --git a/seqspec/src/lib.rs b/seqspec/src/lib.rs index ce4610f..ad97846 100644 --- a/seqspec/src/lib.rs +++ b/seqspec/src/lib.rs @@ -3,7 +3,6 @@ mod region; pub mod utils; use log::warn; -use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, IntoParallelRefMutIterator, ParallelIterator}; use read::{ReadSpan, RegionIndex, ValidateResult}; pub use read::{Read, File, UrlType, Strand}; use read::ReadValidator; @@ -288,7 +287,7 @@ impl Assay { /// Validate reads in the sequence spec based on the fixed sequences and /// save the valid and invalid reads to different files. - pub fn validate>(&self, modality: Modality, dir: P) -> Result<()> { + pub fn validate>(&self, modality: Modality, dir: P, tolerance: f64) -> Result<()> { fs::create_dir_all(&dir)?; let (mut readers, reads): (Vec<_>, Vec<_>) = self.iter_reads(modality).flat_map(|read| { let reader = read.open()?; @@ -307,6 +306,7 @@ impl Assay { ReadValidator::new(region) .with_range(range.start as usize ..range.end as usize) .with_strand(read.strand) + .with_tolerance(tolerance) }).collect::>() ).collect(); @@ -335,7 +335,7 @@ impl Assay { break; } - let valid = records.par_iter().zip(validators.par_iter_mut()).all(|(record, validators)| + let valid = records.iter().zip(validators.iter_mut()).all(|(record, validators)| validators.iter_mut().all(|validator| { match validator.validate(record.sequence()) { ValidateResult::OnlistFail | ValidateResult::Valid => true, diff --git a/seqspec/src/read.rs b/seqspec/src/read.rs index a8ea143..51e2f00 100644 --- a/seqspec/src/read.rs +++ b/seqspec/src/read.rs @@ -288,7 +288,7 @@ pub(crate) struct ReadValidator<'a> { range: Option>, n_total: usize, n_matched: usize, - onlist: Option>, + onlist: Option>>, strand: Strand, tolerance: f64, } @@ -329,6 +329,14 @@ impl<'a> ReadValidator<'a> { self } + pub fn with_tolerance(mut self, tolerance: f64) -> Self { + if tolerance < 0.0 || tolerance > 1.0 { + panic!("Tolerance must be between 0 and 1"); + } + self.tolerance = tolerance; + self + } + pub fn frac_matched(&self) -> f64 { self.n_matched as f64 / self.n_total as f64 } @@ -344,13 +352,18 @@ impl<'a> ReadValidator<'a> { } else { seq }; + let seq = if self.strand == Strand::Neg { + crate::utils::rev_compl(seq) + } else { + seq.to_vec() + }; let len = seq.len(); if len < self.region.min_len as usize { ValidateResult::TooShort((seq.len(), self.region.min_len as usize)) } else if seq.len() > self.region.max_len as usize { ValidateResult::TooLong((seq.len(), self.region.max_len as usize)) } else if self.sequence_type() == SequenceType::Fixed { - let d = hamming::distance(seq, self.region.sequence.as_bytes()) as f64; + let d = hamming::distance(&seq, self.region.sequence.as_bytes()) as f64; if d <= self.tolerance * len as f64 { self.n_matched += 1; ValidateResult::Valid @@ -358,24 +371,11 @@ impl<'a> ReadValidator<'a> { ValidateResult::MisMatch(d as usize) } } else if let Some(onlist) = &self.onlist { - match self.strand { - Strand::Neg => { - let seq = crate::utils::rev_compl(seq); - if onlist.contains(std::str::from_utf8(&seq).unwrap()) { - self.n_matched += 1; - ValidateResult::Valid - } else { - ValidateResult::OnlistFail - } - }, - Strand::Pos => { - if onlist.contains(std::str::from_utf8(seq).unwrap()) { - self.n_matched += 1; - ValidateResult::Valid - } else { - ValidateResult::OnlistFail - } - }, + if onlist.contains(&seq) { + self.n_matched += 1; + ValidateResult::Valid + } else { + ValidateResult::OnlistFail } } else { self.n_matched += 1; diff --git a/seqspec/src/region.rs b/seqspec/src/region.rs index c803c4b..efb7f60 100644 --- a/seqspec/src/region.rs +++ b/seqspec/src/region.rs @@ -256,11 +256,11 @@ pub struct Onlist { } impl Onlist { - pub fn read(&self) -> Result> { + pub fn read(&self) -> Result>> { let cache = Cache::new()?; let file = cache.cached_path(&self.url)?; let reader = std::io::BufReader::new(crate::utils::open_file_for_read(file)); - Ok(reader.lines().map(|x| x.unwrap()).collect()) + Ok(reader.lines().map(|x| x.unwrap().into_bytes()).collect()) } pub(crate) fn normalize_path>(&mut self, work_dir: P) -> Result<()> {