Skip to content

Commit

Permalink
minor bug fix
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Oct 19, 2024
1 parent d59119c commit 403e53f
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 28 deletions.
8 changes: 6 additions & 2 deletions python/src/pyseqspec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,13 @@ impl Assay {
self.0.delete_read(read_id);
}

fn validate(&self, modality: &str, out_dir: PathBuf) -> Result<()> {
#[pyo3(
signature = (modality, out_dir, *, tolerance=0.2),
text_signature = "($self, modality, out_dir, *, tolerance=None)",
)]
fn validate(&self, modality: &str, out_dir: PathBuf, tolerance: f64) -> Result<()> {
let modality = Modality::from_str(modality)?;
self.0.validate(modality, out_dir)
self.0.validate(modality, out_dir, tolerance)
}

/*
Expand Down
1 change: 0 additions & 1 deletion seqspec/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ log = "0.4"
hamming = "0.1"
noodles = { version = "0.80", features = ["core", "fastq", "async"] }
reqwest = { version = "0.12", features = ["blocking"] }
rayon = "1.10"
serde = { version = "1.0", features = ["derive", "rc"] }
serde_yaml = "0.9"
zstd = { version = "0.13", features = ["zstdmt"] }
Expand Down
6 changes: 3 additions & 3 deletions seqspec/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ mod region;
pub mod utils;

use log::warn;
use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, IntoParallelRefMutIterator, ParallelIterator};
use read::{ReadSpan, RegionIndex, ValidateResult};
pub use read::{Read, File, UrlType, Strand};
use read::ReadValidator;
Expand Down Expand Up @@ -288,7 +287,7 @@ impl Assay {

/// Validate reads in the sequence spec based on the fixed sequences and
/// save the valid and invalid reads to different files.
pub fn validate<P: AsRef<Path>>(&self, modality: Modality, dir: P) -> Result<()> {
pub fn validate<P: AsRef<Path>>(&self, modality: Modality, dir: P, tolerance: f64) -> Result<()> {
fs::create_dir_all(&dir)?;
let (mut readers, reads): (Vec<_>, Vec<_>) = self.iter_reads(modality).flat_map(|read| {
let reader = read.open()?;
Expand All @@ -307,6 +306,7 @@ impl Assay {
ReadValidator::new(region)
.with_range(range.start as usize ..range.end as usize)
.with_strand(read.strand)
.with_tolerance(tolerance)
}).collect::<Vec<_>>()
).collect();

Expand Down Expand Up @@ -335,7 +335,7 @@ impl Assay {
break;
}

let valid = records.par_iter().zip(validators.par_iter_mut()).all(|(record, validators)|
let valid = records.iter().zip(validators.iter_mut()).all(|(record, validators)|
validators.iter_mut().all(|validator| {
match validator.validate(record.sequence()) {
ValidateResult::OnlistFail | ValidateResult::Valid => true,
Expand Down
40 changes: 20 additions & 20 deletions seqspec/src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ pub(crate) struct ReadValidator<'a> {
range: Option<Range<usize>>,
n_total: usize,
n_matched: usize,
onlist: Option<HashSet<String>>,
onlist: Option<HashSet<Vec<u8>>>,
strand: Strand,
tolerance: f64,
}
Expand Down Expand Up @@ -329,6 +329,14 @@ impl<'a> ReadValidator<'a> {
self
}

pub fn with_tolerance(mut self, tolerance: f64) -> Self {
if tolerance < 0.0 || tolerance > 1.0 {
panic!("Tolerance must be between 0 and 1");
}
self.tolerance = tolerance;
self
}

pub fn frac_matched(&self) -> f64 {
self.n_matched as f64 / self.n_total as f64
}
Expand All @@ -344,38 +352,30 @@ impl<'a> ReadValidator<'a> {
} else {
seq
};
let seq = if self.strand == Strand::Neg {
crate::utils::rev_compl(seq)
} else {
seq.to_vec()
};
let len = seq.len();
if len < self.region.min_len as usize {
ValidateResult::TooShort((seq.len(), self.region.min_len as usize))
} else if seq.len() > self.region.max_len as usize {
ValidateResult::TooLong((seq.len(), self.region.max_len as usize))
} else if self.sequence_type() == SequenceType::Fixed {
let d = hamming::distance(seq, self.region.sequence.as_bytes()) as f64;
let d = hamming::distance(&seq, self.region.sequence.as_bytes()) as f64;
if d <= self.tolerance * len as f64 {
self.n_matched += 1;
ValidateResult::Valid
} else {
ValidateResult::MisMatch(d as usize)
}
} else if let Some(onlist) = &self.onlist {
match self.strand {
Strand::Neg => {
let seq = crate::utils::rev_compl(seq);
if onlist.contains(std::str::from_utf8(&seq).unwrap()) {
self.n_matched += 1;
ValidateResult::Valid
} else {
ValidateResult::OnlistFail
}
},
Strand::Pos => {
if onlist.contains(std::str::from_utf8(seq).unwrap()) {
self.n_matched += 1;
ValidateResult::Valid
} else {
ValidateResult::OnlistFail
}
},
if onlist.contains(&seq) {
self.n_matched += 1;
ValidateResult::Valid
} else {
ValidateResult::OnlistFail
}
} else {
self.n_matched += 1;
Expand Down
4 changes: 2 additions & 2 deletions seqspec/src/region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,11 +256,11 @@ pub struct Onlist {
}

impl Onlist {
pub fn read(&self) -> Result<HashSet<String>> {
pub fn read(&self) -> Result<HashSet<Vec<u8>>> {
let cache = Cache::new()?;
let file = cache.cached_path(&self.url)?;
let reader = std::io::BufReader::new(crate::utils::open_file_for_read(file));
Ok(reader.lines().map(|x| x.unwrap()).collect())
Ok(reader.lines().map(|x| x.unwrap().into_bytes()).collect())
}

pub(crate) fn normalize_path<P: AsRef<Path>>(&mut self, work_dir: P) -> Result<()> {
Expand Down

0 comments on commit 403e53f

Please sign in to comment.