Skip to content

Commit

Permalink
refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Oct 4, 2024
1 parent 2d24f86 commit b1ba0f8
Show file tree
Hide file tree
Showing 10 changed files with 248 additions and 131 deletions.
4 changes: 2 additions & 2 deletions docs/tutorials/generic.ipynb
Git LFS file not shown
4 changes: 2 additions & 2 deletions docs/tutorials/txg_atac.ipynb
Git LFS file not shown
4 changes: 2 additions & 2 deletions docs/tutorials/txg_multiome.ipynb
Git LFS file not shown
20 changes: 10 additions & 10 deletions precellar/src/align.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::barcode::{BarcodeCorrector, Whitelist};
use seqspec::{Assay, Modality, Read, Region, RegionType, SequenceType};
use seqspec::{Assay, Modality, Read, RegionType, SequenceType};
use crate::qc::{AlignQC, Metrics};

use bstr::BString;
Expand Down Expand Up @@ -206,8 +206,8 @@ impl<A: Alinger> FastqProcessor<A> {

pub fn gen_raw_fastq_records(&self) -> FastqRecords<impl BufRead> {
let modality = self.modality();
let data = self.assay.get_index_of(modality).filter_map(|(read, regions)| {
let regions: Vec<_> = regions.into_iter().filter(|x| x.0.region_type.is_target()).collect();
let data = self.assay.get_index_by_modality(modality).filter_map(|(read, index)| {
let regions: Vec<_> = index.index.into_iter().filter(|x| x.1.is_target()).collect();
if regions.is_empty() {
None
} else {
Expand All @@ -221,10 +221,10 @@ impl<A: Alinger> FastqProcessor<A> {
let modality = self.modality();
let mut whitelist = self.get_whitelist()?;

let (read, index) = self.assay.get_index_of(modality).into_iter()
.find(|(_, index)| index.into_iter().any(|x| x.0.region_type.is_barcode()))
let (read, index) = self.assay.get_index_by_modality(modality).into_iter()
.find(|(_, index)| index.index.iter().any(|x| x.1.is_barcode()))
.expect("No barcode region found");
let range = index.into_iter().find(|x| x.0.region_type.is_barcode()).unwrap().1;
let range = index.index.into_iter().find(|x| x.1.is_barcode()).unwrap().2;

read.open(&self.base_dir).unwrap().records().for_each(|record| {
let mut record = record.unwrap();
Expand All @@ -242,8 +242,8 @@ impl<A: Alinger> FastqProcessor<A> {
}

fn get_whitelist(&self) -> Result<Whitelist> {
let regions: Vec<_> = self.assay.get_region_by_modality(self.modality()).unwrap()
.iter_regions().filter(|r| r.region_type.is_barcode()).collect();
let regions: Vec<_> = self.assay.library_spec.get_modality(&self.modality()).unwrap()
.subregions.iter().filter(|r| r.region_type.is_barcode()).collect();
if regions.len() != 1 {
bail!("Expecting exactly one barcode region, found {}", regions.len());
}
Expand Down Expand Up @@ -273,13 +273,13 @@ pub type UMI = fastq::Record;
impl<R: BufRead> FastqRecords<R> {
fn new<'a, I>(iter: I) -> Self
where
I: Iterator<Item = (&'a Read, Vec<(&'a Region, Range<u32>)>, fastq::Reader<R>)>,
I: Iterator<Item = (&'a Read, Vec<(String, RegionType, Range<u32>)>, fastq::Reader<R>)>,
{
let records = iter.map(|(read, regions, reader)|
FastqRecord {
id: read.read_id.to_string(),
is_reverse: read.is_reverse(),
subregion: regions.into_iter().map(|x| (x.0.region_type, x.1)).collect(),
subregion: regions.into_iter().map(|x| (x.1, x.2)).collect(),
reader,
min_len: read.min_len as usize,
max_len: read.max_len as usize,
Expand Down
10 changes: 5 additions & 5 deletions python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ use ::precellar::{
fragment::FragmentGenerator,
qc::{FragmentQC, Metrics, AlignQC},
};
use pyseqspec::SeqSpec;
use seqspec::{Assay, Modality, utils::{open_file_for_write, Compression}};
use pyseqspec::Assay;
use seqspec::{Modality, utils::{open_file_for_write, Compression}};

#[cfg(not(target_env = "msvc"))]
use tikv_jemallocator::Jemalloc;
Expand Down Expand Up @@ -118,11 +118,11 @@ fn align(
let base_dir;
match seqspec.extract::<PathBuf>() {
Ok(p) => {
spec = Assay::from_path(&p).unwrap();
spec = seqspec::Assay::from_path(&p).unwrap();
base_dir = p.parent().unwrap().to_path_buf();
},
_ => {
let s: PyRef<SeqSpec> = seqspec.extract()?;
let s: PyRef<Assay> = seqspec.extract()?;
spec = s.0.clone();
base_dir = ".".into();
}
Expand Down Expand Up @@ -270,7 +270,7 @@ fn precellar(m: &Bound<'_, PyModule>) -> PyResult<()> {

m.add("__version__", env!("CARGO_PKG_VERSION"))?;

m.add_class::<pyseqspec::SeqSpec>().unwrap();
m.add_class::<pyseqspec::Assay>().unwrap();

m.add_function(wrap_pyfunction!(make_genome_index, m)?)?;
m.add_function(wrap_pyfunction!(align, m)?)?;
Expand Down
21 changes: 10 additions & 11 deletions python/src/pyseqspec.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
use std::collections::HashMap;

use pyo3::prelude::*;
use seqspec::{Assay, Read, Region};
use seqspec::{Read, Region};
use anyhow::Result;
use termtree::Tree;
use cached_path::Cache;

/** A SeqSpec object.
/** A Assay object.
A SeqSpec object is used to annotate sequencing libraries produced by genomics assays.
A Assay object is used to annotate sequencing libraries produced by genomics assays.
Genomic library structure depends on both the assay and sequencer (and kits) used to
generate and bind the assay-specific construct to the sequencing adapters to generate
a sequencing library. SeqSpec is specific to both a genomics assay and sequencer
a sequencing library. Assay is specific to both a genomics assay and sequencer
and provides a standardized format for describing the structure of sequencing
libraries and the resulting sequencing reads. See https://github.com/pachterlab/seqspec for more details.
Expand All @@ -26,17 +26,17 @@ use cached_path::Cache;
*/
#[pyclass]
#[repr(transparent)]
pub struct SeqSpec(pub(crate) Assay);
pub struct Assay(pub(crate) seqspec::Assay);

#[pymethods]
impl SeqSpec {
impl Assay {
#[new]
#[pyo3(signature = (path))]
pub fn new(path: &str) -> Result<Self> {
let cache = Cache::new()?;
let file = cache.cached_path(path)?;
let assay = Assay::from_path(file)?;
Ok(SeqSpec(assay))
let assay = seqspec::Assay::from_path(file)?;
Ok(Assay(assay))
}

/// Update read information in the SeqSpec object.
Expand Down Expand Up @@ -119,7 +119,7 @@ impl SeqSpec {
}

let tree = Tree::new("".to_string()).with_leaves(
assay.library_spec.iter().map(|region| build_tree(region, &read_list))
assay.library_spec.modalities().map(|region| build_tree(region, &read_list))
);
format!("{}", tree)
}
Expand All @@ -143,8 +143,7 @@ fn build_tree(region: &Region, read_list: &HashMap<String, Vec<&Read>>) -> Tree<
format!("{}({})", id, len)
};
Tree::new(label)
.with_leaves(region.regions.as_ref().unwrap_or(&Vec::new())
.iter().map(|child| build_tree(child, read_list)))
.with_leaves(region.subregions.iter().map(|child| build_tree(child, read_list)))
}

fn format_read(read: &Read) -> String {
Expand Down
2 changes: 1 addition & 1 deletion seqspec/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ flate2 = "1.0"
indexmap = "2.5"
log = "0.4"
noodles = { version = "0.80", features = ["core", "fastq", "async"] }
serde = { version = "1.0", features = ["derive"] }
serde = { version = "1.0", features = ["derive", "rc"] }
serde_yaml = "0.9"
zstd = { version = "0.13", features = ["zstdmt"] }
multi_reader = "0.1"
107 changes: 61 additions & 46 deletions seqspec/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@ mod read;
mod region;
pub mod utils;

use log::warn;
use read::{ReadSpan, RegionIndex};
pub use read::{Read, File, UrlType, Strand};
use region::LibSpec;
pub use region::{Region, RegionType, SequenceType, Onlist};

use serde::{Deserialize, Deserializer, Serialize};
use serde_yaml::{self, Value};
use std::{fs, ops::Range, str::FromStr};
use anyhow::{bail, Result};
use std::{fs, str::FromStr};
use anyhow::{bail, anyhow, Result};
use std::path::Path;

#[derive(Deserialize, Serialize, Debug, Clone, PartialEq)]
Expand All @@ -25,8 +28,8 @@ pub struct Assay {
pub library_kit: LibraryKit,
pub sequence_protocol: SequenceProtocol,
pub sequence_kit: SequenceKit,
pub sequence_spec: read::Sequences,
pub library_spec: Vec<Region>,
pub sequence_spec: read::SeqSpec,
pub library_spec: LibSpec,
}

impl Assay {
Expand All @@ -46,49 +49,39 @@ impl Assay {
min_len: Option<usize>,
max_len: Option<usize>,
) -> Result<()> {
let all_reads = &mut self.sequence_spec;
let mut read_exist = false;
let mut read_buffer = Read::default();
read_buffer.read_id = read_id.to_string();

let read = if let Some(r) = all_reads.get_mut(read_id) {
read_exist = true;
r
let mut read_buffer = if let Some(r) = self.sequence_spec.get(read_id) {
r.clone()
} else {
&mut read_buffer
};

if !read_exist {
assert!(modality.is_some(), "modality must be provided for a new read");
assert!(primer_id.is_some(), "primer_id must be provided for a new read");
assert!(is_reverse.is_some(), "is_reverse must be provided for a new read");
}
Read { read_id: read_id.to_string(), ..Default::default() }
};

if let Some(rev) = is_reverse {
read.strand = if rev { Strand::Neg } else { Strand::Pos };
read_buffer.strand = if rev { Strand::Neg } else { Strand::Pos };
}
if let Some(modality) = modality {
read.modality = Modality::from_str(modality)?;
read_buffer.modality = Modality::from_str(modality)?;
}
if let Some(primer_id) = primer_id {
read.primer_id = primer_id.to_string();
read_buffer.primer_id = primer_id.to_string();
}
if let Some(fastq) = fastqs {
read.files = Some(fastq.iter().map(|path| File::from_fastq(path)).collect::<Result<Vec<File>>>()?);
read_buffer.files = Some(fastq.iter().map(|path| File::from_fastq(path)).collect::<Result<Vec<File>>>()?);
}

if (min_len.is_none() || max_len.is_none()) && read.files.is_some() {
let len = read.actual_len("./")?;
read.min_len = min_len.unwrap_or(len) as u32;
read.max_len = max_len.unwrap_or(len) as u32;
if (min_len.is_none() || max_len.is_none()) && read_buffer.files.is_some() {
let len = read_buffer.actual_len("./")?;
read_buffer.min_len = min_len.unwrap_or(len) as u32;
read_buffer.max_len = max_len.unwrap_or(len) as u32;
} else {
read.min_len = min_len.unwrap() as u32;
read.max_len = max_len.unwrap() as u32;
read_buffer.min_len = min_len.unwrap() as u32;
read_buffer.max_len = max_len.unwrap() as u32;
}

if !read_exist {
all_reads.insert(read_buffer.read_id.clone(), read_buffer);
}
self.validate_reads(&read_buffer)?;
self.sequence_spec.insert(read_id.to_string(), read_buffer);

Ok(())
}
Expand All @@ -98,27 +91,49 @@ impl Assay {
}

/// Get the index of atomic regions of each read in the sequence spec.
pub fn get_index_of(&self, modality: Modality) -> impl Iterator<Item = (&Read, Vec<(&Region, Range<u32>)>)> {
pub fn get_index_by_modality(&self, modality: Modality) -> impl Iterator<Item = (&Read, RegionIndex)> {
self.sequence_spec.values().filter_map(move |read| if read.modality == modality {
let region = self.get_region_by_modality(modality)?;
let index = read.get_index(region)
.expect(&format!("Region: {} does not have Primer: {}", region.region_id, read.primer_id));
let index = self.get_index(&read.read_id)
.expect(&format!("Cannot find index for Read: {}", read.read_id));
Some((read, index))
} else {
None
})
}

/// Get the index of atomic regions of a read in the sequence spec.
pub fn get_index(&self, read_id: &str) -> Option<RegionIndex> {
let read = self.sequence_spec.get(read_id)?;
let region = self.library_spec.get_parent(&read.primer_id)?;
read.get_index(region)
}

pub fn iter_reads(&self, modality: Modality) -> impl Iterator<Item = &Read> {
self.sequence_spec.values().filter(move |read| read.modality == modality)
}

/// Get the top-level region for a given modality, i.e., the region that contains all other regions.
pub fn get_region_by_modality(&self, modality: Modality) -> Option<&Region> {
self.library_spec.iter().find(|region| {
region.sequence_type == SequenceType::Joined &&
region.region_type == RegionType::Modality(modality)
})
pub fn validate_reads(&self, read: &Read) -> Result<()> {
let region = self.library_spec.get_parent(&read.primer_id)
.ok_or_else(|| anyhow!("Primer not found: {}", read.primer_id))?;
// Check if the primer exists
if let Some(index) = read.get_index(region) {
match index.readlen_info {
ReadSpan::Covered | ReadSpan::Span(_) => {},
ReadSpan::NotEnough => {
warn!("'{}' does not cover the region", read.read_id);
},
ReadSpan::MayReadThrough(id) => {
warn!("'{}' may read through and contain sequences from: '{}'", read.read_id, id);
},
ReadSpan::ReadThrough(id) => {
warn!("'{}' length exceeds maximum length of the variable-length region (insertion), \
truncating the reads to the maximum length of the region. \
Read reads through and contains sequences from: '{}'.
If this is not the desired behavior, please adjust the region lengths.", read.read_id, id);
}
}
}
Ok(())
}
}

Expand Down Expand Up @@ -351,14 +366,14 @@ mod tests {
let yaml_str = fs::read_to_string(YAML_FILE).expect("Failed to read file");

let assay: Assay = serde_yaml::from_str(&yaml_str).expect("Failed to parse YAML");
for (read, regions) in assay.get_index_of(Modality::RNA) {
println!("{}: {:?}", read.read_id, regions.into_iter().map(|x| (x.0.region_type, x.1)).collect::<Vec<_>>());
for (read, index) in assay.get_index_by_modality(Modality::RNA) {
println!("{}: {:?}", read.read_id, index.index.into_iter().map(|x| (x.1, x.2)).collect::<Vec<_>>());
}
for (read, regions) in assay.get_index_of(Modality::ATAC) {
println!("{}: {:?}", read.read_id, regions.into_iter().map(|x| (x.0.region_type, x.1)).collect::<Vec<_>>());
for (read, index) in assay.get_index_by_modality(Modality::ATAC) {
println!("{}: {:?}", read.read_id, index.index.into_iter().map(|x| (x.1, x.2)).collect::<Vec<_>>());
}
for (read, regions) in assay.get_index_of(Modality::Protein) {
println!("{}: {:?}", read.read_id, regions.into_iter().map(|x| (x.0.region_type, x.1)).collect::<Vec<_>>());
for (read, index) in assay.get_index_by_modality(Modality::Protein) {
println!("{}: {:?}", read.read_id, index.index.into_iter().map(|x| (x.1, x.2)).collect::<Vec<_>>());
}
}
}
Loading

0 comments on commit b1ba0f8

Please sign in to comment.