diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
index 9bfc605..16f3d63 100644
--- a/.github/workflows/test_python.yml
+++ b/.github/workflows/test_python.yml
@@ -50,6 +50,7 @@ jobs:
 
   build-wheel:
     needs: build-and-test
+    if: ${{ startsWith(github.ref, 'refs/tags/') || contains(github.event.head_commit.message, '[wheel]') }}
     uses: regulatory-genomics/precellar/.github/workflows/wheels.yml@main
 
   publish:
diff --git a/docs/tutorials/txg_atac.ipynb b/docs/tutorials/txg_atac.ipynb
index 5817eaa..659b1d0 100644
--- a/docs/tutorials/txg_atac.ipynb
+++ b/docs/tutorials/txg_atac.ipynb
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9331d929469533eacc126f4d3ee226d5e82fefbce2f03b95cbf1b2caffbf1160
-size 4038273
+oid sha256:e9b5eaf62a7591cf7f5539255ec37eb1215803fcd3c511f344dfe23592adc70a
+size 4039867
diff --git a/precellar/src/align.rs b/precellar/src/align.rs
index a316f3e..e7d8106 100644
--- a/precellar/src/align.rs
+++ b/precellar/src/align.rs
@@ -207,7 +207,8 @@ impl FastqProcessor {
     pub fn gen_raw_fastq_records(&self) -> FastqRecords {
         let modality = self.modality();
         let data = self.assay.get_index_by_modality(modality).filter_map(|(read, index)| {
-            let regions: Vec<_> = index.index.into_iter().filter(|x| x.1.is_target()).collect();
+            let regions: Vec<_> = index.index.into_iter()
+                .filter(|x| x.1.is_barcode() || x.1.is_target()).collect();
             if regions.is_empty() {
                 None
             } else {
@@ -296,7 +297,7 @@ impl FastqRecords {
         let mut read1 = false;
         let mut read2 = false;
         self.0.iter().for_each(|x| {
-            x.subregion.iter().for_each(|(region_type, _)| if region_type.is_dna() {
+            x.subregion.iter().for_each(|(region_type, _)| if region_type.is_target() {
                 if x.is_reverse {
                     read1 = true;
                 } else {
@@ -346,7 +347,7 @@ impl Iterator for FastqRecords {
                     } else {
                         barcode = Some(fq);
                     }
-                } else if region_type.is_dna() {
+                } else if region_type.is_target() {
                     if is_reverse {
                         read1 = Some(fq);
                     } else {
diff --git a/python/Cargo.toml b/python/Cargo.toml
index 370bce8..d795cee 100644
--- a/python/Cargo.toml
+++ b/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "precellar-py"
-version = "0.1.1-dev1"
+version = "0.1.1-dev2"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
diff --git a/seqspec/src/lib.rs b/seqspec/src/lib.rs
index 57c3a1c..866db52 100644
--- a/seqspec/src/lib.rs
+++ b/seqspec/src/lib.rs
@@ -5,6 +5,7 @@ pub mod utils;
 use log::warn;
 use read::{ReadSpan, RegionIndex};
 pub use read::{Read, File, UrlType, Strand};
+use read::ReadValidator;
 use region::LibSpec;
 pub use region::{Region, RegionType, SequenceType, Onlist};
 
@@ -132,6 +133,30 @@ impl Assay {
                     If this is not the desired behavior, please adjust the region lengths.", read.read_id, id);
                 }
             }
+
+            let mut validators: Vec<_> = index.index.iter().map(|(region_id, _, range)| {
+                let region = self.library_spec.get(region_id).unwrap();
+                ReadValidator::new(region)
+                    .with_range(range.start as usize ..range.end as usize)
+                    .with_strand(read.strand)
+            }).collect();
+            read.open("./").unwrap().records().take(500).try_for_each(|record| {
+                let record = record?;
+                for validator in &mut validators {
+                    validator.validate(record.sequence())?;
+                }
+                anyhow::Ok(())
+            })?;
+            for validator in validators {
+                let percent_matched = validator.frac_matched() * 100.0;
+                if percent_matched < 5.0 {
+                    bail!("Read '{}' failed validation for region '{}'. \
+                        Percentage of matched bases: {:.2}%", read.read_id, validator.id(), percent_matched);
+                } else if percent_matched < 50.0 {
+                    warn!("Read '{}' has low percentage of matched bases for region '{}'. \
+                        Percentage of matched bases: {:.2}%", read.read_id, validator.id(), percent_matched);
+                }
+            }
         }
         Ok(())
     }
diff --git a/seqspec/src/read.rs b/seqspec/src/read.rs
index 4f1fc58..e6ea1f4 100644
--- a/seqspec/src/read.rs
+++ b/seqspec/src/read.rs
@@ -5,6 +5,7 @@ use cached_path::Cache;
 use noodles::fastq;
 use indexmap::IndexMap;
 use serde::{Deserialize, Serialize, Serializer};
+use std::collections::HashSet;
 use std::ops::{Deref, DerefMut};
 use std::sync::Arc;
 use std::{io::{BufRead, BufReader}, ops::Range, path::PathBuf};
@@ -202,7 +203,7 @@ pub enum ReadSpan {
     MayReadThrough(String),  // Read may be longer than the target region
 }
 
-#[derive(Deserialize, Serialize, Debug, Clone, PartialEq)]
+#[derive(Deserialize, Serialize, Debug, Copy, Clone, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Strand {
     Pos,
@@ -266,4 +267,84 @@ pub enum UrlType {
     Ftp,
     Http,
     Https,
+}
+
+pub(crate) struct ReadValidator<'a> {
+    region: &'a Region,
+    range: Option<Range<usize>>,
+    n_total: usize,
+    n_matched: usize,
+    onlist: Option<HashSet<String>>,
+    strand: Strand,
+}
+
+impl<'a> ReadValidator<'a> {
+    pub fn id(&self) -> &str {
+        &self.region.region_id
+    }
+
+    pub fn new(region: &'a Region) -> Self {
+        let onlist = if let Some(onlist) = &region.onlist {
+            Some(onlist.read().unwrap())
+        } else if region.sequence_type == SequenceType::Fixed {
+            Some(HashSet::from([region.sequence.clone()]))
+        } else {
+            None
+        };
+        Self {
+            region,
+            range: None,
+            n_total: 0,
+            n_matched: 0,
+            onlist,
+            strand: Strand::Pos,
+        }
+    }
+
+    pub fn with_range(mut self, range: Range<usize>) -> Self {
+        self.range = Some(range);
+        self
+    }
+
+    pub fn with_strand(mut self, strand: Strand) -> Self {
+        self.strand = strand;
+        self
+    }
+
+    pub fn frac_matched(&self) -> f64 {
+        self.n_matched as f64 / self.n_total as f64
+    }
+
+    pub fn validate(&mut self, seq: &[u8]) -> Result<()> {
+        self.n_total += 1;
+        let seq = if let Some(range) = &self.range {
+            &seq[range.clone()]
+        } else {
+            seq
+        };
+        if seq.len() < self.region.min_len as usize {
+            return Err(anyhow::anyhow!("Sequence too short: {}", seq.len()));
+        }
+        if seq.len() > self.region.max_len as usize {
+            return Err(anyhow::anyhow!("Sequence too long: {}", seq.len()));
+        }
+        if let Some(onlist) = &self.onlist {
+            match self.strand {
+                Strand::Neg => {
+                    let seq = crate::utils::rev_compl(seq);
+                    if onlist.contains(std::str::from_utf8(&seq)?) {
+                        self.n_matched += 1;
+                    }
+                },
+                Strand::Pos => {
+                    if onlist.contains(std::str::from_utf8(seq)?) {
+                        self.n_matched += 1;
+                    }
+                },
+            }
+        } else {
+            self.n_matched += 1;
+        }
+        Ok(())
+    }
 }
\ No newline at end of file
diff --git a/seqspec/src/region.rs b/seqspec/src/region.rs
index 0b8819d..20ddc30 100644
--- a/seqspec/src/region.rs
+++ b/seqspec/src/region.rs
@@ -4,49 +4,56 @@ use crate::read::UrlType;
 use cached_path::Cache;
 use indexmap::IndexMap;
 use serde::{Deserialize, Serialize};
-use std::{io::BufRead, sync::Arc};
+use std::{collections::{HashMap, HashSet}, io::BufRead, sync::Arc};
 use anyhow::Result;
 
 #[derive(Debug, Clone, PartialEq)]
 pub struct LibSpec {
     modalities: IndexMap<Modality, Arc<Region>>,
-    region_map: IndexMap<String, Arc<Region>>,
-    parent_map: IndexMap<String, Arc<Region>>,
+    parent_map: HashMap<String, Arc<Region>>,
+    region_map: HashMap<String, Arc<Region>>,
 }
 
 impl Serialize for LibSpec {
     fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
-        self.region_map.values().collect::<Vec<_>>().serialize(serializer)
+        self.modalities.values().collect::<Vec<_>>().serialize(serializer)
     }
 }
 
 impl<'de> Deserialize<'de> for LibSpec {
     fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
         let regions = Vec::<Region>::deserialize(deserializer)?;
-        Ok(LibSpec::new(regions))
+        Ok(LibSpec::new(regions).unwrap())
     }
 }
 
 impl LibSpec {
-    pub fn new<I: IntoIterator<Item = Region>>(regions: I) -> Self {
+    /// Create a new library specification from top-level regions. The only allowed
+    /// region types are modality types.
+    pub fn new<I: IntoIterator<Item = Region>>(regions: I) -> Result<Self> {
         let mut modalities = IndexMap::new();
-        let mut region_map = IndexMap::new();
-        let mut parent_map = IndexMap::new();
+        let mut region_map = HashMap::new();
+        let mut parent_map = HashMap::new();
         for region in regions {
-            let region = Arc::new(region);
-            if region_map.insert(region.region_id.clone(), region.clone()).is_some() {
-                panic!("Duplicate region id: {}", region.region_id);
-            }
             if let RegionType::Modality(modality) = region.region_type {
+                let region = Arc::new(region);
                 if modalities.insert(modality, region.clone()).is_some() {
-                    panic!("Duplicate modality: {:?}", modality);
+                    return Err(anyhow::anyhow!("Duplicate modality: {:?}", modality));
                 }
-            }
-            for subregion in region.subregions.iter() {
-                parent_map.insert(subregion.region_id.clone(), region.clone());
-            }
+                if region_map.insert(region.region_id.clone(), region.clone()).is_some() {
+                    return Err(anyhow::anyhow!("Duplicate region id: {}", region.region_id));
+                }
+                for subregion in region.subregions.iter() {
+                    if region_map.insert(subregion.region_id.clone(), subregion.clone()).is_some() {
+                        return Err(anyhow::anyhow!("Duplicate region id: {}", subregion.region_id));
+                    }
+                    parent_map.insert(subregion.region_id.clone(), region.clone());
+                }
+            } else {
+                return Err(anyhow::anyhow!("Top-level regions must be modalities"));
+            };
         }
-        Self { modalities, region_map, parent_map }
+        Ok(Self { modalities, region_map, parent_map })
     }
 
     /// Iterate over all regions with modality type in the library.
@@ -165,9 +172,11 @@ pub enum RegionType {
 }
 
 impl RegionType {
-    /// Either a barcode, UMI, or DNA/cDNA region.
-    pub fn is_target(&self) -> bool {
-        self.is_barcode() || self.is_umi() || self.is_dna()
+    pub fn is_modality(&self) -> bool {
+        match self {
+            RegionType::Modality(_) => true,
+            _ => false,
+        }
     }
 
     pub fn is_barcode(&self) -> bool {
@@ -184,8 +193,8 @@
         }
     }
-    /// Check if the region contains genomic sequences.
-    pub fn is_dna(&self) -> bool {
+    /// Check if the region contains the region of interest (insertion).
+    pub fn is_target(&self) -> bool {
         match self {
             RegionType::Gdna | RegionType::Cdna => true,
             _ => false,
         }
@@ -225,7 +234,7 @@ pub struct Onlist {
 }
 
 impl Onlist {
-    pub fn read(&self) -> Result<Vec<String>> {
+    pub fn read(&self) -> Result<HashSet<String>> {
         let cache = Cache::new()?;
         let file = cache.cached_path(&self.url)?;
         let reader = std::io::BufReader::new(crate::utils::open_file_for_read(file));
@@ -238,4 +247,4 @@
 pub enum Location {
     Local,
     Remote,
-}
\ No newline at end of file
+}
diff --git a/seqspec/src/utils.rs b/seqspec/src/utils.rs
index 1af315e..073400e 100644
--- a/seqspec/src/utils.rs
+++ b/seqspec/src/utils.rs
@@ -80,4 +80,14 @@ pub fn open_file_for_write<P: AsRef<Path>>(
         },
     };
     Ok(writer)
+}
+
+pub fn rev_compl(seq: &[u8]) -> Vec<u8> {
+    seq.iter().rev().map(|&x| match x {
+        b'A' => b'T',
+        b'T' => b'A',
+        b'C' => b'G',
+        b'G' => b'C',
+        _ => x,
+    }).collect()
 }
\ No newline at end of file
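
A quick illustration of the rev_compl helper added in seqspec/src/utils.rs, which the new ReadValidator uses to match negative-strand reads against an onlist. This is a standalone sketch, not part of the patch; the input sequence "ACGTN" is a made-up example.

    // rev_compl walks the sequence in reverse and complements each base;
    // bytes other than A/C/G/T (e.g. b'N') are left unchanged by the `_ => x` arm.
    fn main() {
        let rc = seqspec::utils::rev_compl(b"ACGTN");
        assert_eq!(rc, b"NACGT".to_vec());
    }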