From 8117d283ad06b09e6327c21bff07dc9abaea0fd4 Mon Sep 17 00:00:00 2001 From: wenjiewei Date: Tue, 22 Oct 2024 14:44:24 +0800 Subject: [PATCH] format some codes for readability --- precellar/src/align.rs | 463 +++++++++++++++---------- precellar/src/barcode.rs | 60 ++-- precellar/src/fragment.rs | 160 ++++++--- precellar/src/fragment/deduplicate.rs | 132 +++++--- precellar/src/qc.rs | 85 +++-- seqspec/src/lib.rs | 465 +++++++++++++++++--------- seqspec/src/read.rs | 110 +++--- seqspec/src/region.rs | 125 ++++--- 8 files changed, 1055 insertions(+), 545 deletions(-) diff --git a/precellar/src/align.rs b/precellar/src/align.rs index 07d6891..019b023 100644 --- a/precellar/src/align.rs +++ b/precellar/src/align.rs @@ -1,34 +1,37 @@ use crate::barcode::{BarcodeCorrector, Whitelist}; -use seqspec::{Assay, Modality, Read, RegionType, SequenceType}; use crate::qc::{AlignQC, Metrics}; +use seqspec::{Assay, Modality, Read, RegionType, SequenceType}; -use bstr::BString; -use kdam::{tqdm, BarExt}; -use std::ops::Range; -use std::sync::{Arc, Mutex}; -use std::collections::{HashMap, HashSet}; -use std::io::BufRead; -use smallvec::SmallVec; use anyhow::{bail, Result}; -use noodles::{bam, sam, fastq}; -use noodles::sam::alignment::{ - Record, record_buf::RecordBuf, record::data::field::tag::Tag, -}; -use noodles::sam::alignment::record_buf::data::field::value::Value; +use bstr::BString; use bwa_mem2::BurrowsWheelerAligner; +use either::Either; +use kdam::{tqdm, BarExt}; use log::info; +use noodles::sam::alignment::record_buf::data::field::value::Value; +use noodles::sam::alignment::{record::data::field::tag::Tag, record_buf::RecordBuf, Record}; +use noodles::{bam, fastq, sam}; use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; -use either::Either; +use smallvec::SmallVec; +use std::collections::{HashMap, HashSet}; +use std::io::BufRead; +use std::ops::Range; +use std::sync::{Arc, Mutex}; pub trait Alinger { fn chunk_size(&self) -> usize; fn header(&self) -> sam::Header; - fn align_reads(&mut self, records: &mut [fastq::Record]) -> impl ExactSizeIterator; + fn align_reads( + &mut self, + records: &mut [fastq::Record], + ) -> impl ExactSizeIterator; - fn align_read_pairs(&mut self, records: &mut [(fastq::Record, fastq::Record)]) -> - impl ExactSizeIterator; + fn align_read_pairs( + &mut self, + records: &mut [(fastq::Record, fastq::Record)], + ) -> impl ExactSizeIterator; } impl Alinger for BurrowsWheelerAligner { @@ -40,12 +43,17 @@ impl Alinger for BurrowsWheelerAligner { self.get_sam_header() } - fn align_reads(&mut self, records: &mut [fastq::Record]) -> impl ExactSizeIterator { + fn align_reads( + &mut self, + records: &mut [fastq::Record], + ) -> impl ExactSizeIterator { self.align_reads(records) } - fn align_read_pairs(&mut self, records: &mut [(fastq::Record, fastq::Record)]) -> - impl ExactSizeIterator { + fn align_read_pairs( + &mut self, + records: &mut [(fastq::Record, fastq::Record)], + ) -> impl ExactSizeIterator { self.align_read_pairs(records) } } @@ -57,17 +65,21 @@ pub struct FastqProcessor { mito_dna: HashSet, metrics: HashMap, align_qc: HashMap>>, - barcode_correct_prob: f64, // if the posterior probability of a correction - // exceeds this threshold, the barcode will be corrected. - // cellrange uses 0.975 for ATAC and 0.9 for multiome. + barcode_correct_prob: f64, // if the posterior probability of a correction + // exceeds this threshold, the barcode will be corrected. + // cellrange uses 0.975 for ATAC and 0.9 for multiome. mismatch_in_barcode: usize, // The number of mismatches allowed in barcode } impl FastqProcessor { pub fn new(assay: Assay, aligner: A) -> Self { Self { - assay, aligner, current_modality: None, metrics: HashMap::new(), - align_qc: HashMap::new(), mito_dna: HashSet::new(), + assay, + aligner, + current_modality: None, + metrics: HashMap::new(), + align_qc: HashMap::new(), + mito_dna: HashSet::new(), barcode_correct_prob: 0.975, mismatch_in_barcode: 1, } @@ -79,139 +91,191 @@ impl FastqProcessor { } pub fn modality(&self) -> Modality { - self.current_modality.expect("modality not set, please call set_modality first") + self.current_modality + .expect("modality not set, please call set_modality first") } pub fn add_mito_dna(&mut self, mito_dna: &str) { - self.aligner.header().reference_sequences().get_index_of(&BString::from(mito_dna)) + self.aligner + .header() + .reference_sequences() + .get_index_of(&BString::from(mito_dna)) .map(|x| self.mito_dna.insert(x)); } pub fn with_modality(mut self, modality: Modality) -> Self { - self.current_modality = Some(modality.into()); + self.current_modality = Some(modality); self } pub fn get_report(&self) -> Metrics { - let mut metrics = self.metrics.get(&self.modality()).map_or(Metrics::default(), |x| x.clone()); + let mut metrics = self + .metrics + .get(&self.modality()) + .map_or(Metrics::default(), |x| x.clone()); if let Some(align_qc) = self.align_qc.get(&self.modality()) { align_qc.lock().unwrap().report(&mut metrics); } metrics } - pub fn gen_barcoded_alignments(&mut self) -> - impl Iterator, Vec<(RecordBuf, RecordBuf)>>> + '_ - { + pub fn gen_barcoded_alignments( + &mut self, + ) -> impl Iterator, Vec<(RecordBuf, RecordBuf)>>> + '_ { info!("Counting barcodes..."); let whitelist = self.count_barcodes().unwrap(); - info!("Found {} barcodes. {:.2}% of them have an exact match in whitelist", whitelist.total_count, whitelist.frac_exact_match() * 100.0); + info!( + "Found {} barcodes. {:.2}% of them have an exact match in whitelist", + whitelist.total_count, + whitelist.frac_exact_match() * 100.0 + ); info!("Aligning reads..."); let fq_records = self.gen_raw_fastq_records(); let is_paired = fq_records.is_paired_end(); - let corrector = BarcodeCorrector::default().with_bc_confidence_threshold(self.barcode_correct_prob); + let corrector = + BarcodeCorrector::default().with_bc_confidence_threshold(self.barcode_correct_prob); let header = self.aligner.header(); self.align_qc.insert( self.modality(), Arc::new(Mutex::new(AlignQC { mito_dna: self.mito_dna.clone(), ..AlignQC::default() - })) + })), ); let align_qc = self.align_qc.get(&self.modality()).unwrap().clone(); let mut progress_bar = tqdm!(total = whitelist.total_count); - fq_records.chunk(self.aligner.chunk_size()).map(move |data| if is_paired { - let (barcodes, mut reads): (Vec<_>, Vec<_>) = data.into_iter() - .map(|(barcode, (read1, read2))| (barcode, (read1.unwrap(), read2.unwrap()))).unzip(); - let alignments: Vec<_> = self.aligner.align_read_pairs(&mut reads).collect(); - let results = barcodes.into_par_iter().zip(alignments).map(|(barcode, (ali1, ali2))| { - let corrected_barcode = corrector.correct( - whitelist.get_barcode_counts(), - barcode.sequence(), - barcode.quality_scores(), - self.mismatch_in_barcode, - ).ok(); - let ali1_ = add_cell_barcode( - &header, - &ali1, - barcode.sequence(), - barcode.quality_scores(), - corrected_barcode, - ).unwrap(); - let ali2_ = add_cell_barcode( - &header, - &ali2, - barcode.sequence(), - barcode.quality_scores(), - corrected_barcode, - ).unwrap(); - { - let mut align_qc_lock = align_qc.lock().unwrap(); - align_qc_lock.update(&ali1_, &header); - align_qc_lock.update(&ali2_, &header); + fq_records + .chunk(self.aligner.chunk_size()) + .map(move |data| { + if is_paired { + let (barcodes, mut reads): (Vec<_>, Vec<_>) = data + .into_iter() + .map(|(barcode, (read1, read2))| { + (barcode, (read1.unwrap(), read2.unwrap())) + }) + .unzip(); + let alignments: Vec<_> = self.aligner.align_read_pairs(&mut reads).collect(); + let results = barcodes + .into_par_iter() + .zip(alignments) + .map(|(barcode, (ali1, ali2))| { + let corrected_barcode = corrector + .correct( + whitelist.get_barcode_counts(), + barcode.sequence(), + barcode.quality_scores(), + self.mismatch_in_barcode, + ) + .ok(); + let ali1_ = add_cell_barcode( + &header, + &ali1, + barcode.sequence(), + barcode.quality_scores(), + corrected_barcode, + ) + .unwrap(); + let ali2_ = add_cell_barcode( + &header, + &ali2, + barcode.sequence(), + barcode.quality_scores(), + corrected_barcode, + ) + .unwrap(); + { + let mut align_qc_lock = align_qc.lock().unwrap(); + align_qc_lock.update(&ali1_, &header); + align_qc_lock.update(&ali2_, &header); + } + (ali1_, ali2_) + }) + .collect::>(); + progress_bar.update(results.len()).unwrap(); + Either::Right(results) + } else { + let (barcodes, mut reads): (Vec<_>, Vec<_>) = data + .into_iter() + .map(|(barcode, (read1, _))| (barcode, read1.unwrap())) + .unzip(); + let alignments: Vec<_> = self.aligner.align_reads(&mut reads).collect(); + let results = barcodes + .into_par_iter() + .zip(alignments) + .map(|(barcode, alignment)| { + let corrected_barcode = corrector + .correct( + whitelist.get_barcode_counts(), + barcode.sequence(), + barcode.quality_scores(), + self.mismatch_in_barcode, + ) + .ok(); + let ali = add_cell_barcode( + &header, + &alignment, + barcode.sequence(), + barcode.quality_scores(), + corrected_barcode, + ) + .unwrap(); + { + align_qc.lock().unwrap().update(&ali, &header); + } + ali + }) + .collect::>(); + progress_bar.update(results.len()).unwrap(); + Either::Left(results) } - (ali1_, ali2_) - }).collect::>(); - progress_bar.update(results.len()).unwrap(); - Either::Right(results) - } else { - let (barcodes, mut reads): (Vec<_>, Vec<_>) = data.into_iter() - .map(|(barcode, (read1, _))| (barcode, read1.unwrap())).unzip(); - let alignments: Vec<_> = self.aligner.align_reads(&mut reads).collect(); - let results = barcodes.into_par_iter().zip(alignments).map(|(barcode, alignment)| { - let corrected_barcode = corrector.correct( - whitelist.get_barcode_counts(), - barcode.sequence(), - barcode.quality_scores(), - self.mismatch_in_barcode, - ).ok(); - let ali = add_cell_barcode( - &header, - &alignment, - barcode.sequence(), - - barcode.quality_scores(), - corrected_barcode, - ).unwrap(); - { align_qc.lock().unwrap().update(&ali, &header); } - ali - }).collect::>(); - progress_bar.update(results.len()).unwrap(); - Either::Left(results) - }) + }) } - pub fn gen_raw_alignments(&mut self) -> - impl Iterator, Vec<(sam::Record, sam::Record)>>> + '_ - { + pub fn gen_raw_alignments( + &mut self, + ) -> impl Iterator, Vec<(sam::Record, sam::Record)>>> + '_ { let fq_records = self.gen_raw_fastq_records(); let is_paired = fq_records.is_paired_end(); - fq_records.chunk(self.aligner.chunk_size()).map(move |data| if is_paired { - let mut reads: Vec<_> = data.into_iter().map(|(_, (read1, read2))| - (read1.unwrap(), read2.unwrap())).collect(); - let alignments = self.aligner.align_read_pairs(&mut reads).collect(); - Either::Right(alignments) - } else { - let mut reads: Vec<_> = data.into_iter().map(|(_, (read1, _))| - read1.unwrap()).collect(); - let alignments = self.aligner.align_reads(reads.as_mut()).collect(); - Either::Left(alignments) - }) + fq_records + .chunk(self.aligner.chunk_size()) + .map(move |data| { + if is_paired { + let mut reads: Vec<_> = data + .into_iter() + .map(|(_, (read1, read2))| (read1.unwrap(), read2.unwrap())) + .collect(); + let alignments = self.aligner.align_read_pairs(&mut reads).collect(); + Either::Right(alignments) + } else { + let mut reads: Vec<_> = data + .into_iter() + .map(|(_, (read1, _))| read1.unwrap()) + .collect(); + let alignments = self.aligner.align_reads(reads.as_mut()).collect(); + Either::Left(alignments) + } + }) } pub fn gen_raw_fastq_records(&self) -> FastqRecords { let modality = self.modality(); - let data = self.assay.get_index_by_modality(modality).filter_map(|(read, index)| { - let regions: Vec<_> = index.index.into_iter() - .filter(|x| x.1.is_barcode() || x.1.is_target()).collect(); - if regions.is_empty() { - None - } else { - Some((read, regions, read.open().unwrap())) - } - }); + let data = self + .assay + .get_index_by_modality(modality) + .filter_map(|(read, index)| { + let regions: Vec<_> = index + .index + .into_iter() + .filter(|x| x.1.is_barcode() || x.1.is_target()) + .collect(); + if regions.is_empty() { + None + } else { + Some((read, regions, read.open().unwrap())) + } + }); FastqRecords::new(data) } @@ -219,10 +283,17 @@ impl FastqProcessor { let modality = self.modality(); let mut whitelist = self.get_whitelist()?; - let (read, index) = self.assay.get_index_by_modality(modality).into_iter() + let (read, index) = self + .assay + .get_index_by_modality(modality) .find(|(_, index)| index.index.iter().any(|x| x.1.is_barcode())) .expect("No barcode region found"); - let range = index.index.into_iter().find(|x| x.1.is_barcode()).unwrap().2; + let range = index + .index + .into_iter() + .find(|x| x.1.is_barcode()) + .unwrap() + .2; read.open().unwrap().records().for_each(|record| { let mut record = record.unwrap(); @@ -233,21 +304,38 @@ impl FastqProcessor { whitelist.count_barcode(record.sequence(), record.quality_scores()); }); - self.metrics.entry(modality).or_insert_with(Metrics::default) - .insert("frac_q30_bases_barcode".to_string(), whitelist.frac_q30_bases()); + self.metrics.entry(modality).or_default().insert( + "frac_q30_bases_barcode".to_string(), + whitelist.frac_q30_bases(), + ); Ok(whitelist) } fn get_whitelist(&self) -> Result { - let regions = self.assay.library_spec.get_modality(&self.modality()).unwrap().read().unwrap(); - let regions: Vec<_> = regions.subregions.iter().filter(|r| r.read().unwrap().region_type.is_barcode()).collect(); + let regions = self + .assay + .library_spec + .get_modality(&self.modality()) + .unwrap() + .read() + .unwrap(); + let regions: Vec<_> = regions + .subregions + .iter() + .filter(|r| r.read().unwrap().region_type.is_barcode()) + .collect(); if regions.len() != 1 { - bail!("Expecting exactly one barcode region, found {}", regions.len()); + bail!( + "Expecting exactly one barcode region, found {}", + regions.len() + ); } let region = regions[0]; if region.read().unwrap().sequence_type == SequenceType::Onlist { - Ok(Whitelist::new(region.read().unwrap().onlist.as_ref().unwrap().read()?)) + Ok(Whitelist::new( + region.read().unwrap().onlist.as_ref().unwrap().read()?, + )) } else { Ok(Whitelist::empty()) } @@ -271,34 +359,45 @@ pub type UMI = fastq::Record; impl FastqRecords { fn new<'a, I>(iter: I) -> Self where - I: Iterator)>, fastq::Reader)>, + I: Iterator< + Item = ( + &'a Read, + Vec<(String, RegionType, Range)>, + fastq::Reader, + ), + >, { - let records = iter.map(|(read, regions, reader)| - FastqRecord { + let records = iter + .map(|(read, regions, reader)| FastqRecord { id: read.read_id.to_string(), is_reverse: read.is_reverse(), subregion: regions.into_iter().map(|x| (x.1, x.2)).collect(), reader, min_len: read.min_len as usize, max_len: read.max_len as usize, - } - ).collect(); + }) + .collect(); Self(records) } fn chunk(self, chunk_size: usize) -> FastqRecordChunk { - FastqRecordChunk { fq: self, chunk_size } + FastqRecordChunk { + fq: self, + chunk_size, + } } fn is_paired_end(&self) -> bool { let mut read1 = false; let mut read2 = false; self.0.iter().for_each(|x| { - x.subregion.iter().for_each(|(region_type, _)| if region_type.is_target() { - if x.is_reverse { - read1 = true; - } else { - read2 = true; + x.subregion.iter().for_each(|(region_type, _)| { + if region_type.is_target() { + if x.is_reverse { + read1 = true; + } else { + read2 = true; + } } }); }); @@ -311,23 +410,37 @@ impl Iterator for FastqRecords { fn next(&mut self) -> Option { let mut id_without_record = Vec::new(); - let records: SmallVec<[_; 4]> = self.0.iter_mut().map(|x| { - let mut record = fastq::Record::default(); - if x.reader.read_record(&mut record).expect("error reading fastq record") == 0 { - id_without_record.push(x.id.as_str()); - None - } else { - let n = record.sequence().len(); - if n < x.min_len || n > x.max_len { - panic!("Read length ({}) out of range: {}-{}", n, x.min_len, x.max_len); + let records: SmallVec<[_; 4]> = self + .0 + .iter_mut() + .map(|x| { + let mut record = fastq::Record::default(); + if x.reader + .read_record(&mut record) + .expect("error reading fastq record") + == 0 + { + id_without_record.push(x.id.as_str()); + None + } else { + let n = record.sequence().len(); + if n < x.min_len || n > x.max_len { + panic!( + "Read length ({}) out of range: {}-{}", + n, x.min_len, x.max_len + ); + } + Some(record) } - Some(record) - } - }).collect(); + }) + .collect(); if id_without_record.len() == records.len() { return None; - } else if id_without_record.len() > 0 { - panic!("Missing records in these files: {}", id_without_record.join(",")); + } else if !id_without_record.is_empty() { + panic!( + "Missing records in these files: {}", + id_without_record.join(",") + ); } let mut barcode = None; @@ -371,8 +484,8 @@ impl Iterator for FastqRecordChunk { let mut accumulated_length = 0; for record in self.fq.by_ref() { - accumulated_length += record.1.0.as_ref().map_or(0, |x| x.sequence().len()) + - record.1.1.as_ref().map_or(0, |x| x.sequence().len()); + accumulated_length += record.1 .0.as_ref().map_or(0, |x| x.sequence().len()) + + record.1 .1.as_ref().map_or(0, |x| x.sequence().len()); chunk.push(record); if accumulated_length >= self.chunk_size { break; @@ -387,7 +500,6 @@ impl Iterator for FastqRecordChunk { } } - fn add_cell_barcode( header: &sam::Header, record: &R, @@ -397,8 +509,14 @@ fn add_cell_barcode( ) -> std::io::Result { let mut record_buf = RecordBuf::try_from_alignment_record(header, record)?; let data = record_buf.data_mut(); - data.insert(Tag::CELL_BARCODE_SEQUENCE, Value::String(ori_barcode.into())); - data.insert(Tag::CELL_BARCODE_QUALITY_SCORES, Value::String(ori_qual.into())); + data.insert( + Tag::CELL_BARCODE_SEQUENCE, + Value::String(ori_barcode.into()), + ); + data.insert( + Tag::CELL_BARCODE_QUALITY_SCORES, + Value::String(ori_qual.into()), + ); if let Some(barcode) = correct_barcode { data.insert(Tag::CELL_BARCODE_ID, Value::String(barcode.into())); } @@ -415,13 +533,18 @@ fn slice_fastq_record(record: &fastq::Record, start: usize, end: usize) -> fastq fn rev_compl_fastq_record(mut record: fastq::Record) -> fastq::Record { *record.quality_scores_mut() = record.quality_scores().iter().rev().copied().collect(); - *record.sequence_mut() = record.sequence().iter().rev().map(|&x| match x { - b'A' => b'T', - b'T' => b'A', - b'C' => b'G', - b'G' => b'C', - _ => x, - }).collect(); + *record.sequence_mut() = record + .sequence() + .iter() + .rev() + .map(|&x| match x { + b'A' => b'T', + b'T' => b'A', + b'C' => b'G', + b'G' => b'C', + _ => x, + }) + .collect(); record } @@ -441,7 +564,10 @@ impl<'a, R: std::io::Read> NameCollatedRecords<'a, R> { } fn check(&mut self, name: &BString) { - assert!(!self.checker.contains(name), "bam file must be name collated or name sorted"); + assert!( + !self.checker.contains(name), + "bam file must be name collated or name sorted" + ); self.checker.insert(name.to_owned()); } } @@ -456,7 +582,10 @@ impl<'a, R: std::io::Read> Iterator for NameCollatedRecords<'a, R> { if name == prev_name { Some((prev_record, record)) } else { - panic!("Expecting paired end reads with the same name, found {} and {}", prev_name, name); + panic!( + "Expecting paired end reads with the same name, found {} and {}", + prev_name, name + ); } } else { self.check(&name); @@ -466,7 +595,6 @@ impl<'a, R: std::io::Read> Iterator for NameCollatedRecords<'a, R> { } } - #[cfg(test)] mod tests { use bwa_mem2::{AlignerOpts, FMIndex, PairedEndStats}; @@ -479,10 +607,9 @@ mod tests { let aligner = BurrowsWheelerAligner::new( FMIndex::read("tests/data/hg38").unwrap(), AlignerOpts::default(), - PairedEndStats::default() + PairedEndStats::default(), ); - let mut processor = FastqProcessor::new(spec, aligner) - .with_modality(Modality::ATAC); + let mut processor = FastqProcessor::new(spec, aligner).with_modality(Modality::ATAC); processor.gen_barcoded_alignments().take(6).for_each(|x| { println!("{:?}", x); @@ -490,4 +617,4 @@ mod tests { println!("{}", processor.get_report()); } -} \ No newline at end of file +} diff --git a/precellar/src/barcode.rs b/precellar/src/barcode.rs index 0073e08..d0528ec 100644 --- a/precellar/src/barcode.rs +++ b/precellar/src/barcode.rs @@ -1,5 +1,8 @@ use core::f64; -use std::{collections::HashMap, ops::{Deref, DerefMut}}; +use std::{ + collections::HashMap, + ops::{Deref, DerefMut}, +}; const BC_MAX_QV: u8 = 66; // This is the illumina quality value pub(crate) const BASE_OPTS: [u8; 4] = [b'A', b'C', b'G', b'T']; @@ -28,6 +31,13 @@ impl FromIterator<(Vec, usize)> for OligoFrequncy { } } +// Implement default for OligoFrequncy +impl Default for OligoFrequncy { + fn default() -> Self { + Self::new() + } +} + impl OligoFrequncy { pub fn new() -> Self { Self(HashMap::new()) @@ -36,7 +46,12 @@ impl OligoFrequncy { /// The likelihood of a query oligo being generated by the library. /// If the query is present in the library, the likelihood is 1.0. /// Otherwise, the likelihood is calculated as - pub fn likelihood<'a>(&'a self, query: &'a [u8], qual: &[u8], n_mismatch: usize) -> (&'a [u8], f64) { + pub fn likelihood<'a>( + &'a self, + query: &'a [u8], + qual: &[u8], + n_mismatch: usize, + ) -> (&'a [u8], f64) { if n_mismatch == 0 { if self.0.contains_key(query) { (query, 1.0) @@ -57,20 +72,20 @@ impl OligoFrequncy { if self.0.contains_key(query) { return (query, 1.0); } - + let mut best_option = None; let mut total_likelihood = 0.0; let mut query_bytes = query.to_vec(); - + // Single mismatch loop for (pos1, &qv1) in qual.iter().enumerate() { let qv1 = qv1.min(BC_MAX_QV); let original1 = query_bytes[pos1]; - + for base1 in BASE_OPTS { if base1 != original1 { query_bytes[pos1] = base1; - + // Check for 1-mismatch barcode match if let Some((key, raw_count)) = self.0.get_key_value(&query_bytes) { let bc_count = 1 + raw_count; @@ -78,20 +93,22 @@ impl OligoFrequncy { update_best_option(&mut best_option, likelihood, key); total_likelihood += likelihood; } - + // Loop for the second mismatch for (pos2, &qv2) in qual.iter().enumerate().skip(pos1 + 1) { let qv2 = qv2.min(BC_MAX_QV); let original2 = query_bytes[pos2]; - + for val2 in BASE_OPTS { if val2 != original2 { query_bytes[pos2] = val2; - + // Check for 2-mismatch barcode match if let Some((key, raw_count)) = self.0.get_key_value(&query_bytes) { let bc_count = 1 + raw_count; - let likelihood = bc_count as f64 * error_probability(qv1) * error_probability(qv2); + let likelihood = bc_count as f64 + * error_probability(qv1) + * error_probability(qv2); update_best_option(&mut best_option, likelihood, key); total_likelihood += likelihood; } @@ -105,19 +122,18 @@ impl OligoFrequncy { // Restore original value for first position query_bytes[pos1] = original1; } - + if let Some((best_like, best_bc)) = best_option { (best_bc, best_like / total_likelihood) } else { (query, 0.0) } } - /// The likehood up to 1 mismatch. fn likelihood1<'a>(&'a self, query: &'a [u8], qual: &[u8]) -> (&'a [u8], f64) { if self.0.contains_key(query) { - return (query, 1.0) + return (query, 1.0); } let mut best_option = None; @@ -160,7 +176,7 @@ fn update_best_option<'a>( if old_best.0 < likelihood { *best_option = Some((likelihood, key)); } - }, + } } } @@ -258,7 +274,7 @@ pub struct BarcodeCorrector { max_expected_errors: f64, /// if the posterior probability of a correction /// exceeds this threshold, the barcode will be corrected. - bc_confidence_threshold: f64, + bc_confidence_threshold: f64, } impl Default for BarcodeCorrector { @@ -289,9 +305,15 @@ impl BarcodeCorrector { /// 1) It is in the whitelist and the number of expected errors is less than the max_expected_errors. /// 2) It is not in the whitelist, but the number of expected errors is less than the max_expected_errors and the corrected barcode is in the whitelist. /// 3) If the whitelist does not exist, the barcode is always valid. - /// + /// /// Return the corrected barcode - pub fn correct<'a>(&'a self, barcode_counts: &'a OligoFrequncy, barcode: &'a [u8], qual: &[u8], n_mismatch: usize) -> Result<&[u8], BarcodeError> { + pub fn correct<'a>( + &'a self, + barcode_counts: &'a OligoFrequncy, + barcode: &'a [u8], + qual: &[u8], + n_mismatch: usize, + ) -> Result<&[u8], BarcodeError> { let expected_errors: f64 = qual.iter().map(|&q| error_probability(q)).sum(); if expected_errors >= self.max_expected_errors { return Err(BarcodeError::ExceedExpectedError(expected_errors)); @@ -308,13 +330,13 @@ impl BarcodeCorrector { } } -/// Barcode correction problem: Given a whitelist of barcodes, and a sequenced barcode with quality scores, +/// Barcode correction problem: Given a whitelist of barcodes, and a sequenced barcode with quality scores, /// decide which barcode in the whitelist generated the sequenced barcode. /// Convert Illumina quality scores to base-calling error probabilities, i.e., /// the probability of an incorrect base call. #[inline(always)] fn error_probability(qual: u8) -> f64 { - let offset = 33.0; // Illumina quality score offset + let offset = 33.0; // Illumina quality score offset 10f64.powf(-((qual as f64 - offset) / 10.0)) } diff --git a/precellar/src/fragment.rs b/precellar/src/fragment.rs index 1aa2c17..0bf0dc7 100644 --- a/precellar/src/fragment.rs +++ b/precellar/src/fragment.rs @@ -1,20 +1,26 @@ mod deduplicate; +use anyhow::Result; +use bed_utils::{ + bed::{BEDLike, ParseError, Strand}, + extsort::ExternalSorterBuilder, +}; use deduplicate::{remove_duplicates, AlignmentInfo}; use either::Either; use itertools::Itertools; -use noodles::sam::{alignment::{record::Flags, Record}, Header}; +use noodles::sam::{ + alignment::{record::Flags, Record}, + Header, +}; use rayon::prelude::ParallelSliceMut; -use serde::{Serialize, Deserialize}; -use std::{ops::Deref, path::PathBuf}; -use bed_utils::{bed::{BEDLike, ParseError, Strand}, extsort::ExternalSorterBuilder}; -use anyhow::Result; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; pub type CellBarcode = String; /// Fragments from single-cell ATAC-seq experiment. Each fragment is represented /// by a genomic coordinate, cell barcode and a integer value. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug)] pub struct Fragment { pub chrom: String, pub start: u64, @@ -25,17 +31,23 @@ pub struct Fragment { } impl BEDLike for Fragment { - fn chrom(&self) -> &str { &self.chrom } + fn chrom(&self) -> &str { + &self.chrom + } fn set_chrom(&mut self, chrom: &str) -> &mut Self { self.chrom = chrom.to_string(); self } - fn start(&self) -> u64 { self.start } + fn start(&self) -> u64 { + self.start + } fn set_start(&mut self, start: u64) -> &mut Self { self.start = start; self } - fn end(&self) -> u64 { self.end } + fn end(&self) -> u64 { + self.end + } fn set_end(&mut self, end: u64) -> &mut Self { self.end = end; self @@ -74,10 +86,17 @@ impl std::str::FromStr for Fragment { fn from_str(s: &str) -> Result { let mut fields = s.split('\t'); - let chrom = fields.next().ok_or(ParseError::MissingReferenceSequenceName)?.to_string(); - let start = fields.next().ok_or(ParseError::MissingStartPosition) + let chrom = fields + .next() + .ok_or(ParseError::MissingReferenceSequenceName)? + .to_string(); + let start = fields + .next() + .ok_or(ParseError::MissingStartPosition) .and_then(|s| lexical::parse(s).map_err(ParseError::InvalidStartPosition))?; - let end = fields.next().ok_or(ParseError::MissingEndPosition) + let end = fields + .next() + .ok_or(ParseError::MissingEndPosition) .and_then(|s| lexical::parse(s).map_err(ParseError::InvalidEndPosition))?; let barcode = fields .next() @@ -86,24 +105,35 @@ impl std::str::FromStr for Fragment { "." => None, _ => Some(s.into()), })?; - let count = fields.next().map_or(Ok(1), |s| if s == "." { - Ok(1) - } else { - lexical::parse(s).map_err(ParseError::InvalidStartPosition) + let count = fields.next().map_or(Ok(1), |s| { + if s == "." { + Ok(1) + } else { + lexical::parse(s).map_err(ParseError::InvalidStartPosition) + } })?; - let strand = fields.next().map_or(Ok(None), |s| if s == "." { - Ok(None) - } else { - s.parse().map(Some).map_err(ParseError::InvalidStrand) + let strand = fields.next().map_or(Ok(None), |s| { + if s == "." { + Ok(None) + } else { + s.parse().map(Some).map_err(ParseError::InvalidStrand) + } })?; - Ok(Fragment { chrom, start, end, barcode, count, strand }) + Ok(Fragment { + chrom, + start, + end, + barcode, + count, + strand, + }) } } #[derive(Debug)] pub struct FragmentGenerator { shift_left: i64, // `shift_left` - Insertion site correction for the left end. - shift_right: i64, // `shift_right` - Insertion site correction for the right end. + shift_right: i64, // `shift_right` - Insertion site correction for the right end. mapq: u8, chunk_size: usize, temp_dir: Option, @@ -134,24 +164,33 @@ impl FragmentGenerator { self.shift_right = shift_right; } - pub fn gen_unique_fragments<'a, I, R>(&'a self, header: &'a Header, records: I) - -> UniqueFragments + 'a, impl FnMut(&AlignmentInfo) -> String + 'a> + pub fn gen_unique_fragments<'a, I, R>( + &'a self, + header: &'a Header, + records: I, + ) -> UniqueFragments< + impl Iterator + 'a, + impl FnMut(&AlignmentInfo) -> String + 'a, + > where I: Iterator, Vec<(R, R)>>> + 'a, R: Record + 'a, { let data = records.flat_map(|x| match x { - Either::Left(chunk) => Box::new(chunk.into_iter().flat_map(|r| if filter_read(&r, self.mapq) { - AlignmentInfo::from_read(&r, header).unwrap() - } else { - None + Either::Left(chunk) => Box::new(chunk.into_iter().flat_map(|r| { + if filter_read(&r, self.mapq) { + AlignmentInfo::from_read(&r, header).unwrap() + } else { + None + } })) as Box>, - Either::Right(chunk) => Box::new(chunk.into_iter().flat_map(|(r1, r2)| + Either::Right(chunk) => Box::new(chunk.into_iter().flat_map(|(r1, r2)| { if filter_read_pair((&r1, &r2), self.mapq) { AlignmentInfo::from_read_pair((&r1, &r2), header).unwrap() } else { None - })) as Box>, + } + })) as Box>, }); let sorted = sort_by_barcode(data, self.temp_dir.clone(), self.chunk_size); @@ -169,7 +208,9 @@ pub struct UniqueFragments { chunks: itertools::ChunkBy, } -impl<'a, I: Iterator, F: FnMut(&AlignmentInfo) -> String> IntoIterator for &'a UniqueFragments { +impl<'a, I: Iterator, F: FnMut(&AlignmentInfo) -> String> IntoIterator + for &'a UniqueFragments +{ type Item = Vec; type IntoIter = UniqueFragmentsIter<'a, I, F>; @@ -188,24 +229,35 @@ pub struct UniqueFragmentsIter<'a, I: Iterator, F> { iter: itertools::structs::Groups<'a, String, I, F>, } -impl<'a, I: Iterator, F: FnMut(&AlignmentInfo) -> String> Iterator for UniqueFragmentsIter<'a, I, F> { +impl<'a, I: Iterator, F: FnMut(&AlignmentInfo) -> String> Iterator + for UniqueFragmentsIter<'a, I, F> +{ type Item = Vec; fn next(&mut self) -> Option { let (_, chunk) = self.iter.next()?; - let mut fragments: Vec<_> = remove_duplicates(chunk).drain().flat_map(|(_, mut frag)| { - if frag.strand().is_none() { // perform fragment length correction for paired-end reads - frag.set_start(frag.start().saturating_add_signed(self.shift_left)); - frag.set_end(frag.end().saturating_add_signed(self.shift_right)); - } - if frag.len() > 0 { Some(frag) } else { None } - }).collect(); - fragments.par_sort_unstable_by(|a, b| - a.chrom().cmp(b.chrom()) + let mut fragments: Vec<_> = remove_duplicates(chunk) + .drain() + .flat_map(|(_, mut frag)| { + if frag.strand().is_none() { + // perform fragment length correction for paired-end reads + frag.set_start(frag.start().saturating_add_signed(self.shift_left)); + frag.set_end(frag.end().saturating_add_signed(self.shift_right)); + } + if frag.len() > 0 { + Some(frag) + } else { + None + } + }) + .collect(); + fragments.par_sort_unstable_by(|a, b| { + a.chrom() + .cmp(b.chrom()) .then_with(|| a.start().cmp(&b.start())) .then_with(|| a.end().cmp(&b.end())) - .then_with(|| a.score().as_ref().map(|x| x.deref()).cmp(&b.score().as_ref().map(|x| x.deref()))) - ); + .then_with(|| a.score().as_deref().cmp(&b.score().as_deref())) + }); Some(fragments) } } @@ -225,9 +277,11 @@ where if let Some(tmp) = temp_dir { sorter = sorter.with_tmp_dir(tmp); } - sorter.build().unwrap() - .sort_by(reads, |a, b| a.barcode.cmp(&b.barcode) - ).unwrap() + sorter + .build() + .unwrap() + .sort_by(reads, |a, b| a.barcode.cmp(&b.barcode)) + .unwrap() .map(|x| x.unwrap()) } @@ -239,11 +293,15 @@ fn filter_read(record: &R, min_q: u8) -> bool { // - read fails platform/vendor quality checks // - read is PCR or optical duplicate // - supplementary alignment - !record.flags().unwrap().intersects(Flags::from_bits_retain(3852)) && - record.mapping_quality().map_or(255, |x| x.unwrap().get()) >= min_q + !record + .flags() + .unwrap() + .intersects(Flags::from_bits_retain(3852)) + && record.mapping_quality().map_or(255, |x| x.unwrap().get()) >= min_q } fn filter_read_pair(pair: (&R, &R), min_q: u8) -> bool { - filter_read(pair.0, min_q) && filter_read(pair.1, min_q) && - pair.0.flags().unwrap().is_properly_segmented() -} \ No newline at end of file + filter_read(pair.0, min_q) + && filter_read(pair.1, min_q) + && pair.0.flags().unwrap().is_properly_segmented() +} diff --git a/precellar/src/fragment/deduplicate.rs b/precellar/src/fragment/deduplicate.rs index 9916827..f7e97f7 100644 --- a/precellar/src/fragment/deduplicate.rs +++ b/precellar/src/fragment/deduplicate.rs @@ -18,16 +18,19 @@ // if it's a PCR duplicate, it will be guaranteed to be the same at that end // but not at the 3' end. +use anyhow::{Context, Result}; +use bed_utils::bed::Strand; +use itertools::Itertools; use noodles::sam::alignment::record::cigar::op::Kind; use noodles::sam::alignment::record::Flags; use noodles::sam::alignment::Record; -use noodles::sam::{alignment::record::data::field::{Tag, Value}, Header}; -use bed_utils::bed::Strand; -use itertools::Itertools; +use noodles::sam::{ + alignment::record::data::field::{Tag, Value}, + Header, +}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::hash::Hash; -use anyhow::{Result, Context}; -use serde::{Serialize, Deserialize}; use crate::fragment::Fragment; @@ -56,7 +59,14 @@ use crate::fragment::Fragment; // RF_secondstrand outward 3' <==1==---------- 5' // 5' ----------==2==> 3' #[derive(Eq, PartialEq, Debug, Hash, Serialize, Deserialize)] -pub enum Orientation { F, R, FR, FF, RR, RF } +pub enum Orientation { + F, + R, + FR, + FF, + RR, + RF, +} /// Reads are considered duplicates if and only if they have the same fingerprint. #[derive(Eq, PartialEq, Debug, Hash)] @@ -89,7 +99,7 @@ pub(crate) struct AlignmentMini { impl AlignmentMini { fn new(rec: &R) -> Result { let cigar = rec.cigar(); - let start: usize = rec.alignment_start().unwrap().unwrap().try_into()?; + let start: usize = rec.alignment_start().unwrap().unwrap().into(); let alignment_start: u32 = start.try_into()?; let alignment_span: u32 = cigar.alignment_span()?.try_into()?; let alignment_end = alignment_start + alignment_span - 1; @@ -98,15 +108,19 @@ impl AlignmentMini { kind == Kind::HardClip || kind == Kind::SoftClip }); let mut clips = clip_groups.into_iter(); - let clipped_start: u32 = clips.next().map_or(0, |(is_clip, x)| if is_clip { - x.map(|x| x.len() as u32).sum() - } else { - 0 + let clipped_start: u32 = clips.next().map_or(0, |(is_clip, x)| { + if is_clip { + x.map(|x| x.len() as u32).sum() + } else { + 0 + } }); - let clipped_end: u32 = clips.last().map_or(0, |(is_clip, x)| if is_clip { - x.map(|x| x.len() as u32).sum() - } else { - 0 + let clipped_end: u32 = clips.last().map_or(0, |(is_clip, x)| { + if is_clip { + x.map(|x| x.len() as u32).sum() + } else { + 0 + } }); Ok(Self { alignment_start, @@ -141,7 +155,7 @@ impl AlignmentMini { } } } - + #[derive(Serialize, Deserialize, Debug)] pub struct AlignmentInfo { name: String, @@ -156,11 +170,21 @@ pub struct AlignmentInfo { impl AlignmentInfo { pub fn from_read(rec: &R, header: &Header) -> Result> { let barcode = get_barcode(rec)?; - if barcode.is_none() { return Ok(None); } + if barcode.is_none() { + return Ok(None); + } let name = rec.name().unwrap().to_string(); - let reference_sequence_id: u16 = rec.reference_sequence_id(header).context("no reference sequence id")??.try_into()?; - let reference_sequence = header.reference_sequences().get_index(reference_sequence_id as usize).unwrap().0.to_string(); + let reference_sequence_id: u16 = rec + .reference_sequence_id(header) + .context("no reference sequence id")?? + .try_into()?; + let reference_sequence = header + .reference_sequences() + .get_index(reference_sequence_id as usize) + .unwrap() + .0 + .to_string(); let umi = get_umi(rec)?; Ok(Some(Self { name, @@ -178,20 +202,34 @@ impl AlignmentInfo { let rec2 = records.1; let barcode1 = get_barcode(rec1)?; let barcode2 = get_barcode(rec2)?; - if barcode1 != barcode2 || barcode1.is_none() { return Ok(None); } + if barcode1 != barcode2 || barcode1.is_none() { + return Ok(None); + } let name1 = rec1.name().unwrap(); let name2 = rec2.name().unwrap(); assert!(name1 == name2, "Read names do not match"); - let reference_sequence_id1: u16 = rec1.reference_sequence_id(header).context("no reference sequence id")??.try_into()?; - let reference_sequence_id2: u16 = rec2.reference_sequence_id(header).context("no reference sequence id")??.try_into()?; - if reference_sequence_id1 != reference_sequence_id2 { return Ok(None); } + let reference_sequence_id1: u16 = rec1 + .reference_sequence_id(header) + .context("no reference sequence id")?? + .try_into()?; + let reference_sequence_id2: u16 = rec2 + .reference_sequence_id(header) + .context("no reference sequence id")?? + .try_into()?; + if reference_sequence_id1 != reference_sequence_id2 { + return Ok(None); + } Ok(Some(Self { name: name1.to_string(), reference_sequence_id: reference_sequence_id1, - reference_sequence: header.reference_sequences() - .get_index(reference_sequence_id1 as usize).unwrap().0.to_string(), + reference_sequence: header + .reference_sequences() + .get_index(reference_sequence_id1 as usize) + .unwrap() + .0 + .to_string(), read1: AlignmentMini::new(rec1)?, read2: Some(AlignmentMini::new(rec2)?), barcode: barcode1.unwrap(), @@ -263,12 +301,20 @@ impl From<&AlignmentInfo> for FingerPrint { let orientation = if this.is_reverse_complemented() == other.is_reverse_complemented() { if this.is_reverse_complemented() { - if this.is_first_segment() { Orientation::RR } else { Orientation::FF } + if this.is_first_segment() { + Orientation::RR + } else { + Orientation::FF + } + } else if this.is_first_segment() { + Orientation::FF } else { - if this.is_first_segment() { Orientation::FF } else { Orientation::RR } + Orientation::RR } + } else if this.is_reverse_complemented() { + Orientation::RF } else { - if this.is_reverse_complemented() { Orientation::RF } else { Orientation::FR } + Orientation::FR }; FingerPrint::Paired { reference_id: ali.reference_sequence_id, @@ -281,7 +327,6 @@ impl From<&AlignmentInfo> for FingerPrint { } } - pub(crate) fn remove_duplicates(data: I) -> HashMap where I: IntoIterator, @@ -289,23 +334,32 @@ where let mut result: HashMap = HashMap::new(); data.into_iter().for_each(|ali| { let fingerprint = FingerPrint::from(&ali); - result.entry(fingerprint).and_modify(|x| x.count +=1) + result + .entry(fingerprint) + .and_modify(|x| x.count += 1) .or_insert(Fragment::from((ali, 1))); }); result } fn get_barcode(rec: &R) -> Result> { - Ok(rec.data().get(&Tag::CELL_BARCODE_ID).transpose()?.and_then(|x| match x { - Value::String(barcode) => Some(barcode.to_string()), - _ => None, - })) + Ok(rec + .data() + .get(&Tag::CELL_BARCODE_ID) + .transpose()? + .and_then(|x| match x { + Value::String(barcode) => Some(barcode.to_string()), + _ => None, + })) } fn get_umi(rec: &R) -> Result> { - Ok(rec.data().get(&Tag::UMI_ID).transpose()?.and_then(|x| match x { - Value::String(umi) => Some(umi.to_string()), - _ => None, - })) + Ok(rec + .data() + .get(&Tag::UMI_ID) + .transpose()? + .and_then(|x| match x { + Value::String(umi) => Some(umi.to_string()), + _ => None, + })) } - diff --git a/precellar/src/qc.rs b/precellar/src/qc.rs index bf3667a..e782bda 100644 --- a/precellar/src/qc.rs +++ b/precellar/src/qc.rs @@ -1,26 +1,24 @@ -use std::collections::{HashMap, HashSet}; -use std::ops::{Deref, DerefMut}; -use std::fmt::Display; use bed_utils::bed::BEDLike; use noodles::sam; -use noodles::sam::alignment::{ - Record, record::data::field::tag::Tag, -}; +use noodles::sam::alignment::{record::data::field::tag::Tag, Record}; +use std::collections::{HashMap, HashSet}; +use std::fmt::Display; +use std::ops::{Deref, DerefMut}; use crate::fragment::Fragment; #[derive(Debug, Default, Clone)] pub struct Metrics(HashMap); -impl From > for Metrics { +impl From> for Metrics { fn from(map: HashMap) -> Self { Metrics(map) } } -impl Into> for Metrics { - fn into(self) -> HashMap { - self.0 +impl From for HashMap { + fn from(val: Metrics) -> Self { + val.0 } } @@ -133,8 +131,8 @@ impl FlagStat { self.singleton += 1; } else { self.mate_mapped += 1; - let rec_id = record.mate_reference_sequence_id(header).unwrap().unwrap(); - let mat_id = record.reference_sequence_id(header).unwrap().unwrap(); + let rec_id = record.mate_reference_sequence_id(header).unwrap().unwrap(); + let mat_id = record.reference_sequence_id(header).unwrap().unwrap(); if mat_id != rec_id { self.mate_reference_sequence_id_mismatch += 1; @@ -148,7 +146,7 @@ impl FlagStat { #[derive(Debug, Default)] pub struct AlignQC { - pub(crate) mito_dna: HashSet, // Mitochondrial DNA reference sequence IDs + pub(crate) mito_dna: HashSet, // Mitochondrial DNA reference sequence IDs pub(crate) all_reads_flagstat: FlagStat, pub(crate) barcoded_reads_flagstat: FlagStat, pub(crate) hq_flagstat: FlagStat, @@ -167,21 +165,39 @@ impl AlignQC { pub fn update(&mut self, record: &R, header: &sam::Header) { let mut flagstat = FlagStat::default(); flagstat.update(header, record); - if flagstat.paired == 1 && flagstat.read_2 == 1{ + if flagstat.paired == 1 && flagstat.read_2 == 1 { self.num_read2_bases += record.sequence().len() as u64; - self.num_read2_q30_bases += record.quality_scores().as_ref().iter().filter(|x| *x >= 30).count() as u64; + self.num_read2_q30_bases += record + .quality_scores() + .as_ref() + .iter() + .filter(|x| *x >= 30) + .count() as u64; } else { self.num_read1_bases += record.sequence().len() as u64; - self.num_read1_q30_bases += record.quality_scores().as_ref().iter().filter(|x| *x >= 30).count() as u64; + self.num_read1_q30_bases += record + .quality_scores() + .as_ref() + .iter() + .filter(|x| *x >= 30) + .count() as u64; } self.all_reads_flagstat.add(&flagstat); - let is_hq = record.mapping_quality().map_or(true, |x| x.unwrap().get() >= 30); + let is_hq = record + .mapping_quality() + .map_or(true, |x| x.unwrap().get() >= 30); if is_hq { self.hq_flagstat.add(&flagstat); } - if record.data().get(&Tag::CELL_BARCODE_ID).transpose().unwrap().is_some() { + if record + .data() + .get(&Tag::CELL_BARCODE_ID) + .transpose() + .unwrap() + .is_some() + { self.barcoded_reads_flagstat.add(&flagstat); if let Some(rid) = record.reference_sequence_id(header) { if is_hq && self.mito_dna.contains(&rid.unwrap()) { @@ -224,9 +240,18 @@ impl AlignQC { metric.insert("sequenced_reads".to_string(), num_reads as f64); metric.insert("sequenced_read_pairs".to_string(), num_pairs as f64); - metric.insert("frac_q30_bases_read1".to_string(), self.num_read1_q30_bases as f64 / self.num_read1_bases as f64); - metric.insert("frac_q30_bases_read2".to_string(), self.num_read2_q30_bases as f64 / self.num_read2_bases as f64); - metric.insert("frac_confidently_mapped".to_string(), fraction_confidently_mapped); + metric.insert( + "frac_q30_bases_read1".to_string(), + self.num_read1_q30_bases as f64 / self.num_read1_bases as f64, + ); + metric.insert( + "frac_q30_bases_read2".to_string(), + self.num_read2_q30_bases as f64 / self.num_read2_bases as f64, + ); + metric.insert( + "frac_confidently_mapped".to_string(), + fraction_confidently_mapped, + ); metric.insert("frac_unmapped".to_string(), fraction_unmapped); metric.insert("frac_valid_barcode".to_string(), valid_barcode); metric.insert("frac_nonnuclear".to_string(), fraction_nonnuclear); @@ -261,8 +286,18 @@ impl FragmentQC { } pub fn report(&self, metric: &mut Metrics) { - metric.insert("frac_duplicates".to_string(), self.num_pcr_duplicates as f64 / (self.num_unique_fragments + self.num_pcr_duplicates) as f64); - metric.insert("frac_fragment_in_nucleosome_free_region".to_string(), self.num_frag_nfr as f64 / self.num_unique_fragments as f64); - metric.insert("frac_fragment_flanking_single_nucleosome".to_string(), self.num_frag_single as f64 / self.num_unique_fragments as f64); + metric.insert( + "frac_duplicates".to_string(), + self.num_pcr_duplicates as f64 + / (self.num_unique_fragments + self.num_pcr_duplicates) as f64, + ); + metric.insert( + "frac_fragment_in_nucleosome_free_region".to_string(), + self.num_frag_nfr as f64 / self.num_unique_fragments as f64, + ); + metric.insert( + "frac_fragment_flanking_single_nucleosome".to_string(), + self.num_frag_single as f64 / self.num_unique_fragments as f64, + ); } -} \ No newline at end of file +} diff --git a/seqspec/src/lib.rs b/seqspec/src/lib.rs index c826fe5..bf2c0a1 100644 --- a/seqspec/src/lib.rs +++ b/seqspec/src/lib.rs @@ -3,19 +3,24 @@ mod region; pub mod utils; use log::warn; -use read::{ReadSpan, RegionIndex, ValidateResult}; -pub use read::{Read, File, UrlType, Strand}; +use noodles::fastq; use read::ReadValidator; +pub use read::{File, Read, Strand, UrlType}; +use read::{ReadSpan, RegionIndex, ValidateResult}; use region::LibSpec; -pub use region::{Region, RegionType, SequenceType, Onlist}; -use noodles::fastq; +pub use region::{Onlist, Region, RegionType, SequenceType}; +use anyhow::{anyhow, bail, Result}; use serde::{Deserialize, Deserializer, Serialize}; use serde_yaml::{self, Value}; -use utils::{open_file_for_write, Compression}; -use std::{fs, path::PathBuf, str::FromStr, sync::{Arc, RwLock}}; -use anyhow::{bail, anyhow, Result}; use std::path::Path; +use std::{ + fs, + path::PathBuf, + str::FromStr, + sync::{Arc, RwLock}, +}; +use utils::{open_file_for_write, Compression}; #[derive(Deserialize, Serialize, Debug, Clone, PartialEq)] pub struct Assay { @@ -95,13 +100,22 @@ impl Assay { } /// Add default Illumina reads to the sequence spec. - pub fn add_illumina_reads(&mut self, modality: Modality, read_len: usize, forward_strand_workflow: bool) -> Result<()> { - fn advance_until(iterator: &mut std::slice::Iter<'_, Arc>>, f: fn(&Region) -> bool) -> Option<(Arc>, Vec>>)> { + #[allow(clippy::type_complexity)] + pub fn add_illumina_reads( + &mut self, + modality: Modality, + read_len: usize, + forward_strand_workflow: bool, + ) -> Result<()> { + fn advance_until( + iterator: &mut std::slice::Iter<'_, Arc>>, + f: fn(&Region) -> bool, + ) -> Option<(Arc>, Vec>>)> { let mut regions = Vec::new(); - while let Some(next_region) = iterator.next() { + for next_region in iterator.by_ref() { let r = next_region.read().unwrap(); if f(&r) { - return Some((next_region.clone(), regions)) + return Some((next_region.clone(), regions)); } else { regions.push(next_region.clone()); } @@ -111,22 +125,33 @@ impl Assay { fn get_length(regions: &[Arc>], reverse: bool) -> usize { if reverse { - regions.iter() - .skip_while(|region| region.read().unwrap().sequence_type == SequenceType::Fixed) - .map(|region| region.read().unwrap().len().unwrap() as usize).sum() + regions + .iter() + .skip_while(|region| { + region.read().unwrap().sequence_type == SequenceType::Fixed + }) + .map(|region| region.read().unwrap().len().unwrap() as usize) + .sum() } else { - regions.iter().rev() - .skip_while(|region| region.read().unwrap().sequence_type == SequenceType::Fixed) - .map(|region| region.read().unwrap().len().unwrap() as usize).sum() + regions + .iter() + .rev() + .skip_while(|region| { + region.read().unwrap().sequence_type == SequenceType::Fixed + }) + .map(|region| region.read().unwrap().len().unwrap() as usize) + .sum() } } fn is_read1(region: &Region) -> bool { - region.region_type == RegionType::NexteraRead1 || region.region_type == RegionType::TruseqRead1 + region.region_type == RegionType::NexteraRead1 + || region.region_type == RegionType::TruseqRead1 } fn is_read2(region: &Region) -> bool { - region.region_type == RegionType::NexteraRead2 || region.region_type == RegionType::TruseqRead2 + region.region_type == RegionType::NexteraRead2 + || region.region_type == RegionType::TruseqRead2 } fn is_p5(region: &Region) -> bool { @@ -138,7 +163,11 @@ impl Assay { } self.delete_all_reads(modality); - let regions = self.library_spec.get_modality(&modality).ok_or_else(|| anyhow!("Modality not found: {:?}", modality))?.clone(); + let regions = self + .library_spec + .get_modality(&modality) + .ok_or_else(|| anyhow!("Modality not found: {:?}", modality))? + .clone(); let regions = regions.read().unwrap(); let mut regions = regions.subregions.iter(); while let Some(current_region) = regions.next() { @@ -147,32 +176,38 @@ impl Assay { if let Some((next_region, acc)) = advance_until(&mut regions, is_read1) { let next_region = next_region.read().unwrap(); self.update_read::( - &format!("{}-R1", modality.to_string()), + &format!("{}-R1", modality), Some(modality), Some(&next_region.region_id), Some(false), - None, Some(read_len), Some(read_len) + None, + Some(read_len), + Some(read_len), )?; if forward_strand_workflow { let acc_len = get_length(acc.as_slice(), false); if acc_len > 0 { self.update_read::( - &format!("{}-I2", modality.to_string()), + &format!("{}-I2", modality), Some(modality), Some(¤t_region.region_id), Some(false), - None, Some(acc_len), Some(acc_len) + None, + Some(acc_len), + Some(acc_len), )?; } } else { let acc_len = get_length(acc.as_slice(), true); if acc_len > 0 { self.update_read::( - &format!("{}-I2", modality.to_string()), + &format!("{}-I2", modality), Some(modality), Some(&next_region.region_id), Some(true), - None, Some(acc_len), Some(acc_len) + None, + Some(acc_len), + Some(acc_len), )?; } } @@ -181,19 +216,23 @@ impl Assay { if let Some((_, acc)) = advance_until(&mut regions, is_p7) { let acc_len = get_length(acc.as_slice(), false); self.update_read::( - &format!("{}-R2", modality.to_string()), + &format!("{}-R2", modality), Some(modality), Some(¤t_region.region_id), Some(true), - None, Some(read_len), Some(read_len) + None, + Some(read_len), + Some(read_len), )?; if acc_len > 0 { self.update_read::( - &format!("{}-I1", modality.to_string()), + &format!("{}-I1", modality), Some(modality), Some(¤t_region.region_id), Some(false), - None, Some(acc_len), Some(acc_len) + None, + Some(acc_len), + Some(acc_len), )?; } } @@ -203,6 +242,7 @@ impl Assay { } /// Update read information. If the read does not exist, it will be created. + #[allow(clippy::too_many_arguments)] pub fn update_read>( &mut self, read_id: &str, @@ -216,10 +256,22 @@ impl Assay { let mut read_buffer = if let Some(r) = self.sequence_spec.get(read_id) { r.clone() } else { - assert!(modality.is_some(), "modality must be provided for a new read"); - assert!(primer_id.is_some(), "primer_id must be provided for a new read"); - assert!(is_reverse.is_some(), "is_reverse must be provided for a new read"); - Read { read_id: read_id.to_string(), ..Default::default() } + assert!( + modality.is_some(), + "modality must be provided for a new read" + ); + assert!( + primer_id.is_some(), + "primer_id must be provided for a new read" + ); + assert!( + is_reverse.is_some(), + "is_reverse must be provided for a new read" + ); + Read { + read_id: read_id.to_string(), + ..Default::default() + } }; if let Some(rev) = is_reverse { @@ -232,7 +284,12 @@ impl Assay { read_buffer.primer_id = primer_id.to_string(); } if let Some(fastq) = fastqs { - read_buffer.files = Some(fastq.iter().map(|path| File::from_fastq(path)).collect::>>()?); + read_buffer.files = Some( + fastq + .iter() + .map(File::from_fastq) + .collect::>>()?, + ); } if (min_len.is_none() || max_len.is_none()) && read_buffer.files.is_some() { @@ -255,22 +312,29 @@ impl Assay { } pub fn delete_all_reads(&mut self, modality: Modality) { - let ids: Vec<_> = self.iter_reads(modality).map(|read| { - read.read_id.to_string() - }).collect(); + let ids: Vec<_> = self + .iter_reads(modality) + .map(|read| read.read_id.to_string()) + .collect(); ids.into_iter().for_each(|id| { self.delete_read(&id); }); } /// Get the index of atomic regions of each read in the sequence spec. - pub fn get_index_by_modality(&self, modality: Modality) -> impl Iterator { - self.sequence_spec.values().filter_map(move |read| if read.modality == modality { - let index = self.get_index(&read.read_id) - .expect(&format!("Cannot find index for Read: {}", read.read_id)); - Some((read, index)) - } else { - None + pub fn get_index_by_modality( + &self, + modality: Modality, + ) -> impl Iterator { + self.sequence_spec.values().filter_map(move |read| { + if read.modality == modality { + let index = self + .get_index(&read.read_id) + .unwrap_or_else(|| panic!("Cannot find index for Read: {}", read.read_id)); + Some((read, index)) + } else { + None + } }) } @@ -282,93 +346,136 @@ impl Assay { } pub fn iter_reads(&self, modality: Modality) -> impl Iterator { - self.sequence_spec.values().filter(move |read| read.modality == modality) + self.sequence_spec + .values() + .filter(move |read| read.modality == modality) } /// Validate reads in the sequence spec based on the fixed sequences and /// save the valid and invalid reads to different files. - pub fn validate>(&self, modality: Modality, dir: P, tolerance: f64) -> Result<()> { + pub fn validate>( + &self, + modality: Modality, + dir: P, + tolerance: f64, + ) -> Result<()> { fs::create_dir_all(&dir)?; - let (mut readers, reads): (Vec<_>, Vec<_>) = self.iter_reads(modality).flat_map(|read| { - let reader = read.open()?; - let region = self.library_spec.get_parent(&read.primer_id) - .ok_or_else(|| anyhow!("Primer not found: {}", read.primer_id)).unwrap(); - let index = read.get_index(®ion.read().unwrap())?; - let regions: Vec<_> = index.index.iter().map(|(region_id, _, range)| { - let region = self.library_spec.get(region_id).unwrap(); - (region.read().unwrap(), range.clone()) - }).collect(); - Some((reader, (read, regions))) - }).collect(); - - let mut validators = reads.iter().map(|(read, regions)| - regions.iter().map(|(region, range)| { - Ok(ReadValidator::new(region)? - .with_range(range.start as usize ..range.end as usize) - .with_strand(read.strand) - .with_tolerance(tolerance)) - }).collect::>>() - ).collect::>>()?; - - let mut outputs: Vec<_> = reads.iter().map(|(read, _)| { - let output_valid = dir.as_ref().join(format!("{}.fq.zst", read.read_id)); - let output_valid = open_file_for_write(output_valid, Some(Compression::Zstd), Some(9), 8)?; - let output_valid = fastq::io::Writer::new(output_valid); - let output_other = dir.as_ref().join(format!("Invalid_{}.fq.zst", read.read_id)); - let output_other = open_file_for_write(output_other, Some(Compression::Zstd), Some(9), 8)?; - let output_other = fastq::io::Writer::new(output_other); - - anyhow::Ok((output_valid, output_other)) - }).collect::>()?; + let (mut readers, reads): (Vec<_>, Vec<_>) = self + .iter_reads(modality) + .flat_map(|read| { + let reader = read.open()?; + let region = self + .library_spec + .get_parent(&read.primer_id) + .ok_or_else(|| anyhow!("Primer not found: {}", read.primer_id)) + .unwrap(); + let index = read.get_index(®ion.read().unwrap())?; + let regions: Vec<_> = index + .index + .iter() + .map(|(region_id, _, range)| { + let region = self.library_spec.get(region_id).unwrap(); + (region.read().unwrap(), range.clone()) + }) + .collect(); + Some((reader, (read, regions))) + }) + .collect(); + + let mut validators = reads + .iter() + .map(|(read, regions)| { + regions + .iter() + .map(|(region, range)| { + Ok(ReadValidator::new(region)? + .with_range(range.start as usize..range.end as usize) + .with_strand(read.strand) + .with_tolerance(tolerance)) + }) + .collect::>>() + }) + .collect::>>()?; + + let mut outputs: Vec<_> = reads + .iter() + .map(|(read, _)| { + let output_valid = dir.as_ref().join(format!("{}.fq.zst", read.read_id)); + let output_valid = + open_file_for_write(output_valid, Some(Compression::Zstd), Some(9), 8)?; + let output_valid = fastq::io::Writer::new(output_valid); + let output_other = dir + .as_ref() + .join(format!("Invalid_{}.fq.zst", read.read_id)); + let output_other = + open_file_for_write(output_other, Some(Compression::Zstd), Some(9), 8)?; + let output_other = fastq::io::Writer::new(output_other); + + anyhow::Ok((output_valid, output_other)) + }) + .collect::>()?; loop { - let records: Vec<_> = readers.iter_mut().flat_map(|reader| { - let mut record = fastq::Record::default(); - if reader.read_record(&mut record).unwrap() == 0 { - None - } else { - Some(record) - } - }).collect(); + let records: Vec<_> = readers + .iter_mut() + .flat_map(|reader| { + let mut record = fastq::Record::default(); + if reader.read_record(&mut record).unwrap() == 0 { + None + } else { + Some(record) + } + }) + .collect(); if records.len() != readers.len() { break; } - let valid = records.iter().zip(validators.iter_mut()).all(|(record, validators)| - validators.iter_mut().all(|validator| { - match validator.validate(record.sequence()) { - ValidateResult::OnlistFail | ValidateResult::Valid => true, - _ => false, + let valid = records + .iter() + .zip(validators.iter_mut()) + .all(|(record, validators)| { + validators.iter_mut().all(|validator| { + matches!( + validator.validate(record.sequence()), + ValidateResult::OnlistFail | ValidateResult::Valid + ) + }) + }); + + records.iter().zip(outputs.iter_mut()).try_for_each( + |(record, (output_valid, output_other))| { + if valid { + output_valid.write_record(record)?; + } else { + output_other.write_record(record)?; } - }) - ); - - records.iter().zip(outputs.iter_mut()).try_for_each(|(record, (output_valid, output_other))| { - if valid { - output_valid.write_record(record)?; - } else { - output_other.write_record(record)?; - } - anyhow::Ok(()) - })?; + anyhow::Ok(()) + }, + )?; } Ok(()) } fn verify(&self, read: &Read) -> Result<()> { - let region = self.library_spec.get_parent(&read.primer_id) + let region = self + .library_spec + .get_parent(&read.primer_id) .ok_or_else(|| anyhow!("Primer not found: {}", read.primer_id))?; // Check if the primer exists if let Some(index) = read.get_index(®ion.read().unwrap()) { match index.readlen_info { - ReadSpan::Covered | ReadSpan::Span(_) => {}, + ReadSpan::Covered | ReadSpan::Span(_) => {} ReadSpan::NotEnough => { warn!("'{}' does not cover the region", read.read_id); - }, + } ReadSpan::MayReadThrough(id) => { - warn!("'{}' may read through and contain sequences from: '{}'", read.read_id, id); - }, + warn!( + "'{}' may read through and contain sequences from: '{}'", + read.read_id, id + ); + } ReadSpan::ReadThrough(id) => { warn!("'{}' length exceeds maximum length of the variable-length region (insertion), \ truncating the reads to the maximum length of the region. \ @@ -378,15 +485,22 @@ impl Assay { } if let Some(mut reader) = read.open() { - let regions = index.index.iter().map(|(region_id, _, range)| { - let region = self.library_spec.get(region_id).unwrap(); - (region.read().unwrap(), range) - }).collect::>(); - let mut validators = regions.iter().map(|(region, range)| { - Ok(ReadValidator::new(region)? - .with_range(range.start as usize ..range.end as usize) - .with_strand(read.strand)) - }).collect::>>()?; + let regions = index + .index + .iter() + .map(|(region_id, _, range)| { + let region = self.library_spec.get(region_id).unwrap(); + (region.read().unwrap(), range) + }) + .collect::>(); + let mut validators = regions + .iter() + .map(|(region, range)| { + Ok(ReadValidator::new(region)? + .with_range(range.start as usize..range.end as usize) + .with_strand(read.strand)) + }) + .collect::>>()?; reader.records().take(500).try_for_each(|record| { let record = record?; @@ -396,7 +510,7 @@ impl Assay { ValidateResult::TooLong(_) | ValidateResult::TooShort(_) => { bail!("{}", result); } - _ => {}, + _ => {} } } anyhow::Ok(()) @@ -405,8 +519,13 @@ impl Assay { for validator in validators { let percent_matched = validator.frac_matched() * 100.0; if percent_matched < 50.0 { - warn!("Read '{}' has low percentage of matched bases for region '{}'. \ - Percentage of matched bases: {:.2}%", read.read_id, validator.id(), percent_matched); + warn!( + "Read '{}' has low percentage of matched bases for region '{}'. \ + Percentage of matched bases: {:.2}%", + read.read_id, + validator.id(), + percent_matched + ); } } } @@ -434,7 +553,10 @@ impl<'de> Deserialize<'de> for Modality { let value = Value::deserialize(deserializer)?; match value { Value::String(s) => Modality::from_str(&s).map_err(serde::de::Error::custom), - _ => Err(serde::de::Error::custom(format!("invalid value: {:?}", value))), + _ => Err(serde::de::Error::custom(format!( + "invalid value: {:?}", + value + ))), } } } @@ -455,16 +577,17 @@ impl FromStr for Modality { } } -impl ToString for Modality { - fn to_string(&self) -> String { - match self { - Modality::DNA => "dna".to_string(), - Modality::RNA => "rna".to_string(), - Modality::Tag => "tag".to_string(), - Modality::Protein => "protein".to_string(), - Modality::ATAC => "atac".to_string(), - Modality::Crispr => "crispr".to_string(), - } +impl std::fmt::Display for Modality { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Modality::DNA => "dna", + Modality::RNA => "rna", + Modality::Tag => "tag", + Modality::Protein => "protein", + Modality::ATAC => "atac", + Modality::Crispr => "crispr", + }; + write!(f, "{}", s) } } @@ -490,11 +613,16 @@ impl<'de> Deserialize<'de> for LibraryProtocol { match value { Value::String(s) => Ok(LibraryProtocol::Standard(s)), Value::Sequence(seq) => { - let items = seq.into_iter().map(|item| serde_yaml::from_value(item).unwrap()) + let items = seq + .into_iter() + .map(|item| serde_yaml::from_value(item).unwrap()) .collect::>(); Ok(LibraryProtocol::Custom(items)) } - _ => Err(serde::de::Error::custom(format!("invalid value: {:?}", value))), + _ => Err(serde::de::Error::custom(format!( + "invalid value: {:?}", + value + ))), } } } @@ -502,7 +630,7 @@ impl<'de> Deserialize<'de> for LibraryProtocol { impl Serialize for LibraryProtocol { fn serialize(&self, serializer: S) -> Result where - S: serde::ser::Serializer + S: serde::ser::Serializer, { match self { LibraryProtocol::Standard(s) => s.serialize(serializer), @@ -533,11 +661,16 @@ impl<'de> Deserialize<'de> for LibraryKit { match value { Value::String(s) => Ok(LibraryKit::Standard(s)), Value::Sequence(seq) => { - let items = seq.into_iter().map(|item| serde_yaml::from_value(item).unwrap()) + let items = seq + .into_iter() + .map(|item| serde_yaml::from_value(item).unwrap()) .collect::>(); Ok(LibraryKit::Custom(items)) } - _ => Err(serde::de::Error::custom(format!("invalid value: {:?}", value))), + _ => Err(serde::de::Error::custom(format!( + "invalid value: {:?}", + value + ))), } } } @@ -545,7 +678,7 @@ impl<'de> Deserialize<'de> for LibraryKit { impl Serialize for LibraryKit { fn serialize(&self, serializer: S) -> Result where - S: serde::ser::Serializer + S: serde::ser::Serializer, { match self { LibraryKit::Standard(s) => s.serialize(serializer), @@ -560,7 +693,7 @@ pub enum SequenceProtocol { Standard(String), } -impl <'de> Deserialize<'de> for SequenceProtocol { +impl<'de> Deserialize<'de> for SequenceProtocol { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, @@ -569,11 +702,16 @@ impl <'de> Deserialize<'de> for SequenceProtocol { match value { Value::String(s) => Ok(SequenceProtocol::Standard(s)), Value::Sequence(seq) => { - let items = seq.into_iter().map(|item| serde_yaml::from_value(item).unwrap()) + let items = seq + .into_iter() + .map(|item| serde_yaml::from_value(item).unwrap()) .collect::>(); Ok(SequenceProtocol::Custom(items)) } - _ => Err(serde::de::Error::custom(format!("invalid value: {:?}", value))), + _ => Err(serde::de::Error::custom(format!( + "invalid value: {:?}", + value + ))), } } } @@ -581,7 +719,7 @@ impl <'de> Deserialize<'de> for SequenceProtocol { impl Serialize for SequenceProtocol { fn serialize(&self, serializer: S) -> Result where - S: serde::ser::Serializer + S: serde::ser::Serializer, { match self { SequenceProtocol::Standard(s) => s.serialize(serializer), @@ -605,11 +743,16 @@ impl<'de> Deserialize<'de> for SequenceKit { match value { Value::String(s) => Ok(SequenceKit::Standard(s)), Value::Sequence(seq) => { - let items = seq.into_iter().map(|item| serde_yaml::from_value(item).unwrap()) + let items = seq + .into_iter() + .map(|item| serde_yaml::from_value(item).unwrap()) .collect::>(); Ok(SequenceKit::Custom(items)) } - _ => Err(serde::de::Error::custom(format!("invalid value: {:?}", value))), + _ => Err(serde::de::Error::custom(format!( + "invalid value: {:?}", + value + ))), } } } @@ -617,7 +760,7 @@ impl<'de> Deserialize<'de> for SequenceKit { impl Serialize for SequenceKit { fn serialize(&self, serializer: S) -> Result where - S: serde::ser::Serializer + S: serde::ser::Serializer, { match self { SequenceKit::Standard(s) => s.serialize(serializer), @@ -629,7 +772,7 @@ impl Serialize for SequenceKit { #[cfg(test)] mod tests { use super::*; - + const YAML_FILE: &str = "../seqspec_templates/10x_rna_atac.yaml"; #[test] @@ -643,7 +786,7 @@ mod tests { #[test] fn test_serialize() { fn se_de(yaml_str: &str) { - let assay: Assay = serde_yaml::from_str(&yaml_str).expect("Failed to parse YAML"); + let assay: Assay = serde_yaml::from_str(yaml_str).expect("Failed to parse YAML"); let yaml_str_ = serde_yaml::to_string(&assay).unwrap(); let assay_ = serde_yaml::from_str(&yaml_str_).expect("Failed to parse YAML"); assert_eq!(assay, assay_); @@ -658,13 +801,37 @@ mod tests { let assay: Assay = serde_yaml::from_str(&yaml_str).expect("Failed to parse YAML"); for (read, index) in assay.get_index_by_modality(Modality::RNA) { - println!("{}: {:?}", read.read_id, index.index.into_iter().map(|x| (x.1, x.2)).collect::>()); + println!( + "{}: {:?}", + read.read_id, + index + .index + .into_iter() + .map(|x| (x.1, x.2)) + .collect::>() + ); } for (read, index) in assay.get_index_by_modality(Modality::ATAC) { - println!("{}: {:?}", read.read_id, index.index.into_iter().map(|x| (x.1, x.2)).collect::>()); + println!( + "{}: {:?}", + read.read_id, + index + .index + .into_iter() + .map(|x| (x.1, x.2)) + .collect::>() + ); } for (read, index) in assay.get_index_by_modality(Modality::Protein) { - println!("{}: {:?}", read.read_id, index.index.into_iter().map(|x| (x.1, x.2)).collect::>()); + println!( + "{}: {:?}", + read.read_id, + index + .index + .into_iter() + .map(|x| (x.1, x.2)) + .collect::>() + ); } } -} \ No newline at end of file +} diff --git a/seqspec/src/read.rs b/seqspec/src/read.rs index fd1f1c9..1bc07ba 100644 --- a/seqspec/src/read.rs +++ b/seqspec/src/read.rs @@ -1,16 +1,19 @@ -use crate::{Modality, RegionType}; use crate::region::{Region, SequenceType}; +use crate::{Modality, RegionType}; +use anyhow::Result; use cached_path::Cache; -use noodles::fastq; use indexmap::IndexMap; +use noodles::fastq; use serde::{Deserialize, Serialize, Serializer}; use std::collections::HashSet; use std::ops::{Deref, DerefMut}; -use std::sync::{Arc, RwLock}; -use std::{io::{BufRead, BufReader}, ops::Range}; -use anyhow::Result; use std::path::Path; +use std::sync::{Arc, RwLock}; +use std::{ + io::{BufRead, BufReader}, + ops::Range, +}; #[derive(Debug, Clone, PartialEq)] pub struct SeqSpec(IndexMap); @@ -29,7 +32,7 @@ impl DerefMut for SeqSpec { } } -impl Serialize for SeqSpec{ +impl Serialize for SeqSpec { fn serialize(&self, serializer: S) -> Result { self.0.values().collect::>().serialize(serializer) } @@ -42,7 +45,10 @@ impl<'de> Deserialize<'de> for SeqSpec { for read in reads { let read_id = read.read_id.clone(); if sequences.insert(read_id.clone(), read).is_some() { - return Err(serde::de::Error::custom(format!("Duplicate read id: {}", &read_id))); + return Err(serde::de::Error::custom(format!( + "Duplicate read id: {}", + &read_id + ))); } } Ok(Self(sequences)) @@ -78,14 +84,18 @@ impl Default for Read { impl Read { pub fn open(&self) -> Option> { - let files = self.files.clone().unwrap_or(Vec::new()) - .into_iter().filter(|file| file.filetype == "fastq").collect::>(); + let files = self + .files + .clone() + .unwrap_or_default() + .into_iter() + .filter(|file| file.filetype == "fastq") + .collect::>(); if files.is_empty() { return None; } - let reader = multi_reader::MultiReader::new( - files.into_iter().map(move |file| file.open().unwrap()) - ); + let reader = + multi_reader::MultiReader::new(files.into_iter().map(move |file| file.open().unwrap())); Some(fastq::Reader::new(BufReader::new(reader))) } @@ -105,30 +115,39 @@ impl Read { let result = if self.is_reverse() { self.get_read_span( - region.subregions.iter().rev() + region + .subregions + .iter() + .rev() .skip_while(|region| { let region = region.read().unwrap(); - let found = region.region_type.is_sequencing_primer() && region.region_id == self.primer_id; + let found = region.region_type.is_sequencing_primer() + && region.region_id == self.primer_id; if found { found_primer = true; } !found - }).skip(1) + }) + .skip(1), ) } else { self.get_read_span( - region.subregions.iter() + region + .subregions + .iter() .skip_while(|region| { let region = region.read().unwrap(); - let found = region.region_type.is_sequencing_primer() && region.region_id == self.primer_id; + let found = region.region_type.is_sequencing_primer() + && region.region_id == self.primer_id; if found { found_primer = true; } !found - }).skip(1) + }) + .skip(1), ) }; - + if found_primer { Some(result) } else { @@ -149,7 +168,8 @@ impl Read { let region = region.read().unwrap(); let region_id = region.region_id.clone(); let region_type = region.region_type; - if region.is_fixed_length() { // Fixed-length region + if region.is_fixed_length() { + // Fixed-length region let end = cur_pos + region.min_len; if end >= read_len { index.push((region_id, region_type, cur_pos..read_len)); @@ -161,18 +181,21 @@ impl Read { index.push((region_id, region_type, cur_pos..end)); cur_pos = end; } - } else if cur_pos + region.min_len >= read_len { // Variable-length region and read is shorter + } else if cur_pos + region.min_len >= read_len { + // Variable-length region and read is shorter index.push((region_id, region_type, cur_pos..read_len)); readlen_info = ReadSpan::Span((read_len - cur_pos) as usize); break; - } else if cur_pos + region.max_len < read_len { // Variable-length region and read is longer than max length + } else if cur_pos + region.max_len < read_len { + // Variable-length region and read is longer than max length index.push((region_id, region_type, cur_pos..cur_pos + region.max_len)); if let Some(next_region) = regions.next() { let next_region = next_region.read().unwrap(); readlen_info = ReadSpan::ReadThrough(next_region.region_id.clone()); } break; - } else { // Variable-length region and read is within the length range + } else { + // Variable-length region and read is within the length range index.push((region_id, region_type, cur_pos..read_len)); if let Some(next_region) = regions.next() { let next_region = next_region.read().unwrap(); @@ -181,7 +204,10 @@ impl Read { break; } } - RegionIndex { index, readlen_info } + RegionIndex { + index, + readlen_info, + } } pub fn is_reverse(&self) -> bool { @@ -200,11 +226,11 @@ pub struct RegionIndex { #[derive(Debug, Clone)] pub enum ReadSpan { - Covered, // The read is fully contained within the target region - Span(usize), // The read spans the target region - NotEnough, // Read is too short to reach the target region - ReadThrough(String), // Read is longer than the target region - MayReadThrough(String), // Read may be longer than the target region + Covered, // The read is fully contained within the target region + Span(usize), // The read spans the target region + NotEnough, // Read is too short to reach the target region + ReadThrough(String), // Read is longer than the target region + MayReadThrough(String), // Read may be longer than the target region } #[derive(Deserialize, Serialize, Debug, Copy, Clone, PartialEq)] @@ -243,7 +269,9 @@ impl File { pub(crate) fn normalize_path>(&mut self, work_dir: P) -> Result<()> { if self.urltype == UrlType::Local { self.url = crate::utils::normalize_path(work_dir, &mut self.url)? - .to_str().unwrap().to_owned(); + .to_str() + .unwrap() + .to_owned(); } Ok(()) } @@ -251,7 +279,9 @@ impl File { pub(crate) fn unnormalize_path>(&mut self, work_dir: P) -> Result<()> { if self.urltype == UrlType::Local { self.url = crate::utils::unnormalize_path(work_dir, &mut self.url)? - .to_str().unwrap().to_owned(); + .to_str() + .unwrap() + .to_owned(); } Ok(()) } @@ -262,9 +292,7 @@ impl File { /// The base_dir is used to resolve relative paths. pub fn open(&self) -> Result> { match self.urltype { - UrlType::Local => { - Ok(Box::new(crate::utils::open_file_for_read(&self.url)?)) - } + UrlType::Local => Ok(Box::new(crate::utils::open_file_for_read(&self.url)?)), _ => { let mut cache = Cache::new().unwrap(); cache.dir = home::home_dir().unwrap().join(".cache/seqspec"); @@ -331,7 +359,7 @@ impl<'a> ReadValidator<'a> { } pub fn with_tolerance(mut self, tolerance: f64) -> Self { - if tolerance < 0.0 || tolerance > 1.0 { + if !(0.0..=1.0).contains(&tolerance) { panic!("Tolerance must be between 0 and 1"); } self.tolerance = tolerance; @@ -399,19 +427,19 @@ impl std::fmt::Display for ValidateResult { match self { ValidateResult::TooShort((n, min_len)) => { write!(f, "Sequence too short: {} < {}", n, min_len) - }, + } ValidateResult::TooLong((n, max_len)) => { write!(f, "Sequence too long: {} > {}", n, max_len) - }, + } ValidateResult::OnlistFail => { write!(f, "Not on the onlist") - }, + } ValidateResult::MisMatch(n) => { write!(f, "Mismatch: {}", n) - }, + } ValidateResult::Valid => { write!(f, "Valid") - }, + } } } -} \ No newline at end of file +} diff --git a/seqspec/src/region.rs b/seqspec/src/region.rs index aad7dad..f3bd7b1 100644 --- a/seqspec/src/region.rs +++ b/seqspec/src/region.rs @@ -1,11 +1,17 @@ -use crate::Modality; use crate::read::UrlType; +use crate::Modality; +use anyhow::Result; use cached_path::Cache; use indexmap::IndexMap; use serde::{Deserialize, Serialize}; -use std::{collections::{HashMap, HashSet}, io::BufRead, ops::Deref, path::Path, sync::{Arc, RwLock}}; -use anyhow::Result; +use std::{ + collections::{HashMap, HashSet}, + io::BufRead, + ops::Deref, + path::Path, + sync::{Arc, RwLock}, +}; #[derive(Debug, Clone)] pub struct LibSpec { @@ -17,15 +23,18 @@ pub struct LibSpec { impl PartialEq for LibSpec { fn eq(&self, other: &Self) -> bool { self.modalities.keys().all(|k| { - self.modalities.get(k).unwrap().read().unwrap().deref() == - other.modalities.get(k).unwrap().read().unwrap().deref() + self.modalities.get(k).unwrap().read().unwrap().deref() + == other.modalities.get(k).unwrap().read().unwrap().deref() }) } } impl Serialize for LibSpec { fn serialize(&self, serializer: S) -> Result { - self.modalities.values().collect::>().serialize(serializer) + self.modalities + .values() + .collect::>() + .serialize(serializer) } } @@ -50,7 +59,10 @@ impl LibSpec { if modalities.insert(modality, region.clone()).is_some() { return Err(anyhow::anyhow!("Duplicate modality: {:?}", modality)); } - if region_map.insert(region_id.clone(), region.clone()).is_some() { + if region_map + .insert(region_id.clone(), region.clone()) + .is_some() + { return Err(anyhow::anyhow!("Duplicate region id: {}", region_id)); } for subregion in region.read().unwrap().subregions.iter() { @@ -64,7 +76,11 @@ impl LibSpec { return Err(anyhow::anyhow!("Top-level regions must be modalities")); }; } - Ok(Self { modalities, region_map, parent_map }) + Ok(Self { + modalities, + region_map, + parent_map, + }) } /// Iterate over all regions with modality type in the library. @@ -106,28 +122,27 @@ pub struct Region { impl PartialEq for Region { fn eq(&self, other: &Self) -> bool { - self.region_id == other.region_id && - self.region_type == other.region_type && - self.name == other.name && - self.sequence_type == other.sequence_type && - self.sequence == other.sequence && - self.min_len == other.min_len && - self.max_len == other.max_len && - self.onlist == other.onlist && - self.subregions.iter().zip(other.subregions.iter()) + self.region_id == other.region_id + && self.region_type == other.region_type + && self.name == other.name + && self.sequence_type == other.sequence_type + && self.sequence == other.sequence + && self.min_len == other.min_len + && self.max_len == other.max_len + && self.onlist == other.onlist + && self + .subregions + .iter() + .zip(other.subregions.iter()) .all(|(a, b)| a.read().unwrap().deref() == b.read().unwrap().deref()) } } impl Region { pub fn is_fixed_length(&self) -> bool { - if self.min_len == self.max_len { - true - } else { - false - } + self.min_len == self.max_len } - + pub fn len(&self) -> Option { if self.min_len == self.max_len { Some(self.min_len) @@ -135,14 +150,23 @@ impl Region { None } } + + // https://rust-lang.github.io/rust-clippy/master/index.html#len_without_is_empty + // It is good custom to have both methods, because for some data structures, asking about the length will be a costly operation, whereas .is_empty() can usually answer in constant time. Also it used to lead to false positives on the len_zero lint – currently that lint will ignore such entities. + pub fn is_empty(&self) -> bool { + self.min_len != self.max_len + } } fn deserialize_regions<'de, D>(deserializer: D) -> Result>>, D::Error> where D: serde::Deserializer<'de>, { - if let Some(regions) = Option::>::deserialize(deserializer)? { - Ok(regions.into_iter().map(|x| Arc::new(RwLock::new(x))).collect()) + if let Some(regions) = Option::>::deserialize(deserializer)? { + Ok(regions + .into_iter() + .map(|x| Arc::new(RwLock::new(x))) + .collect()) } else { Ok(Vec::new()) } @@ -195,42 +219,33 @@ pub enum RegionType { impl RegionType { pub fn is_modality(&self) -> bool { - match self { - RegionType::Modality(_) => true, - _ => false, - } + matches!(self, RegionType::Modality(_)) } pub fn is_barcode(&self) -> bool { - match self { - RegionType::Barcode => true, - _ => false, - } + matches!(self, RegionType::Barcode) } pub fn is_umi(&self) -> bool { - match self { - RegionType::Umi => true, - _ => false, - } + matches!(self, RegionType::Umi) } /// Check if the region contains region of interest (insertion). pub fn is_target(&self) -> bool { - match self { - RegionType::Gdna | RegionType::Cdna => true, - _ => false, - } + matches!(self, RegionType::Gdna | RegionType::Cdna) } pub fn is_sequencing_primer(&self) -> bool { - match self { - RegionType::CustomPrimer | - RegionType::TruseqRead1 | RegionType::TruseqRead2 | - RegionType::NexteraRead1 | RegionType::NexteraRead2 | - RegionType::IlluminaP5 | RegionType::IlluminaP7 => true, - _ => false, - } + matches!( + self, + RegionType::CustomPrimer + | RegionType::TruseqRead1 + | RegionType::TruseqRead2 + | RegionType::NexteraRead1 + | RegionType::NexteraRead2 + | RegionType::IlluminaP5 + | RegionType::IlluminaP7 + ) } } @@ -238,9 +253,9 @@ impl RegionType { #[serde(rename_all = "lowercase")] pub enum SequenceType { Fixed, // sequence string is known and fixed in length and nucleotide composition - Random, // the sequence is not known a-priori - Onlist, // the sequence is derived from an onlist - Joined, // the sequence is created from nested regions and the regions property must contain Regions + Random, // the sequence is not known a-priori + Onlist, // the sequence is derived from an onlist + Joined, // the sequence is created from nested regions and the regions property must contain Regions } #[derive(Deserialize, Serialize, Debug, Clone, PartialEq)] @@ -267,7 +282,9 @@ impl Onlist { pub(crate) fn normalize_path>(&mut self, work_dir: P) -> Result<()> { if self.urltype == UrlType::Local { self.url = crate::utils::normalize_path(work_dir, &mut self.url)? - .to_str().unwrap().to_owned(); + .to_str() + .unwrap() + .to_owned(); } Ok(()) } @@ -275,7 +292,9 @@ impl Onlist { pub(crate) fn unnormalize_path>(&mut self, work_dir: P) -> Result<()> { if self.urltype == UrlType::Local { self.url = crate::utils::unnormalize_path(work_dir, &mut self.url)? - .to_str().unwrap().to_owned(); + .to_str() + .unwrap() + .to_owned(); } Ok(()) }