Skip to content

Commit

Permalink
add add_illumina_reads
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Oct 18, 2024
1 parent dde1547 commit 6b8ae88
Show file tree
Hide file tree
Showing 3 changed files with 187 additions and 27 deletions.
26 changes: 24 additions & 2 deletions python/src/pyseqspec.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::{collections::HashMap, path::PathBuf};
use std::{collections::HashMap, path::PathBuf, str::FromStr};

use pyo3::prelude::*;
use seqspec::{Read, Region};
use seqspec::{Modality, Read, Region};
use anyhow::Result;
use termtree::Tree;

Expand Down Expand Up @@ -50,6 +50,27 @@ impl Assay {
self.0.file.clone()
}

/// Add default Illumina reads to the Assay object.
///
/// This method adds default Illumina reads to the Assay object.
///
/// Parameters
/// ----------
/// modality: str
/// The modality of the read.
/// length: int
/// The length of the read.
/// forward_strand_workflow: bool
/// Whether the reads are sequenced in the forward strand workflow.
#[pyo3(
signature = (modality, *, length=150, forward_strand_workflow=false),
text_signature = "($self, modality, *, length=150, forward_strand_workflow=False)",
)]
pub fn add_illumina_reads(&mut self, modality: &str, length: usize, forward_strand_workflow: bool) -> Result<()> {
let modality = Modality::from_str(modality)?;
self.0.add_illumina_reads(modality, length, forward_strand_workflow)
}

/// Update read information in the Assay object.
///
/// This method updates the read information in the Assay object.
Expand Down Expand Up @@ -90,6 +111,7 @@ impl Assay {
} else {
vec![f.extract::<String>().unwrap()]
});
let modality = modality.map(|x| Modality::from_str(x)).transpose()?;

self.0.update_read(
read_id, modality, primer_id, is_reverse,
Expand Down
180 changes: 155 additions & 25 deletions seqspec/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub use region::{Region, RegionType, SequenceType, Onlist};

use serde::{Deserialize, Deserializer, Serialize};
use serde_yaml::{self, Value};
use std::{fs, path::PathBuf, str::FromStr};
use std::{fs, path::PathBuf, str::FromStr, sync::Arc};
use anyhow::{bail, anyhow, Result};
use std::path::Path;

Expand Down Expand Up @@ -77,11 +77,115 @@ impl Assay {
});
}

/// Add default Illumina reads to the sequence spec.
pub fn add_illumina_reads(&mut self, modality: Modality, read_len: usize, forward_strand_workflow: bool) -> Result<()> {
fn advance_until(iterator: &mut std::slice::Iter<'_, Arc<Region>>, f: fn(&Region) -> bool) -> Option<(Arc<Region>, Vec<Arc<Region>>)> {
let mut regions = Vec::new();
while let Some(next_region) = iterator.next() {
if f(next_region) {
return Some((next_region.clone(), regions))
} else {
regions.push(next_region.clone());
}
}
None
}

fn get_length(regions: &[Arc<Region>], reverse: bool) -> usize {
if reverse {
regions.iter()
.skip_while(|region| region.sequence_type == SequenceType::Fixed)
.map(|region| region.len().unwrap() as usize).sum()
} else {
regions.iter().rev()
.skip_while(|region| region.sequence_type == SequenceType::Fixed)
.map(|region| region.len().unwrap() as usize).sum()
}
}

fn is_read1(region: &Region) -> bool {
region.region_type == RegionType::NexteraRead1 || region.region_type == RegionType::TruseqRead1
}

fn is_read2(region: &Region) -> bool {
region.region_type == RegionType::NexteraRead2 || region.region_type == RegionType::TruseqRead2
}

fn is_p5(region: &Region) -> bool {
region.region_type == RegionType::IlluminaP5
}

fn is_p7(region: &Region) -> bool {
region.region_type == RegionType::IlluminaP7
}

self.delete_all_reads(modality);
let regions = self.library_spec.get_modality(&modality).ok_or_else(|| anyhow!("Modality not found: {:?}", modality))?.clone();
let mut regions = regions.subregions.iter();
while let Some(current_region) = regions.next() {
if is_p5(&current_region) {
if let Some((next_region, acc)) = advance_until(&mut regions, is_read1) {
self.update_read::<PathBuf>(
&format!("{}-R1", modality.to_string()),
Some(modality),
Some(&next_region.region_id),
Some(false),
None, Some(read_len), Some(read_len)
)?;
if forward_strand_workflow {
let acc_len = get_length(acc.as_slice(), false);
if acc_len > 0 {
self.update_read::<PathBuf>(
&format!("{}-I1", modality.to_string()),
Some(modality),
Some(&current_region.region_id),
Some(false),
None, Some(acc_len), Some(acc_len)
)?;
}
} else {
let acc_len = get_length(acc.as_slice(), true);
if acc_len > 0 {
self.update_read::<PathBuf>(
&format!("{}-I1", modality.to_string()),
Some(modality),
Some(&next_region.region_id),
Some(true),
None, Some(acc_len), Some(acc_len)
)?;
}
}
}
} else if is_read2(&current_region) {
if let Some((_, acc)) = advance_until(&mut regions, is_p7) {
let acc_len = get_length(acc.as_slice(), false);
self.update_read::<PathBuf>(
&format!("{}-R2", modality.to_string()),
Some(modality),
Some(&current_region.region_id),
Some(true),
None, Some(read_len), Some(read_len)
)?;
if acc_len > 0 {
self.update_read::<PathBuf>(
&format!("{}-I2", modality.to_string()),
Some(modality),
Some(&current_region.region_id),
Some(false),
None, Some(acc_len), Some(acc_len)
)?;
}
}
}
}
Ok(())
}

/// Update read information. If the read does not exist, it will be created.
pub fn update_read<P: AsRef<Path>>(
&mut self,
read_id: &str,
modality: Option<&str>,
modality: Option<Modality>,
primer_id: Option<&str>,
is_reverse: Option<bool>,
fastqs: Option<&[P]>,
Expand All @@ -101,7 +205,7 @@ impl Assay {
read_buffer.strand = if rev { Strand::Neg } else { Strand::Pos };
}
if let Some(modality) = modality {
read_buffer.modality = Modality::from_str(modality)?;
read_buffer.modality = modality;
}
if let Some(primer_id) = primer_id {
read_buffer.primer_id = primer_id.to_string();
Expand Down Expand Up @@ -129,6 +233,15 @@ impl Assay {
self.sequence_spec.shift_remove(read_id)
}

pub fn delete_all_reads(&mut self, modality: Modality) {
let ids: Vec<_> = self.iter_reads(modality).map(|read| {
read.read_id.to_string()
}).collect();
ids.into_iter().for_each(|id| {
self.delete_read(&id);
});
}

/// Get the index of atomic regions of each read in the sequence spec.
pub fn get_index_by_modality(&self, modality: Modality) -> impl Iterator<Item = (&Read, RegionIndex)> {
self.sequence_spec.values().filter_map(move |read| if read.modality == modality {
Expand Down Expand Up @@ -171,28 +284,32 @@ impl Assay {
If this is not the desired behavior, please adjust the region lengths.", read.read_id, id);
}
}
let mut validators: Vec<_> = index.index.iter().map(|(region_id, _, range)| {
let region = self.library_spec.get(region_id).unwrap();
ReadValidator::new(region)
.with_range(range.start as usize ..range.end as usize)
.with_strand(read.strand)
}).collect();

read.open().unwrap().records().take(500).try_for_each(|record| {
let record = record?;
for validator in &mut validators {
validator.validate(record.sequence())?;
}
anyhow::Ok(())
})?;
for validator in validators {
let percent_matched = validator.frac_matched() * 100.0;
if percent_matched < 5.0 {
bail!("Read '{}' failed validation for region '{}'. \
Percentage of matched bases: {:.2}%", read.read_id, validator.id(), percent_matched);
} else if percent_matched < 50.0 {
warn!("Read '{}' has low percentage of matched bases for region '{}'. \
Percentage of matched bases: {:.2}%", read.read_id, validator.id(), percent_matched);

if let Some(mut reader) = read.open() {
let mut validators: Vec<_> = index.index.iter().map(|(region_id, _, range)| {
let region = self.library_spec.get(region_id).unwrap();
ReadValidator::new(region)
.with_range(range.start as usize ..range.end as usize)
.with_strand(read.strand)
}).collect();

reader.records().take(500).try_for_each(|record| {
let record = record?;
for validator in &mut validators {
validator.validate(record.sequence())?;
}
anyhow::Ok(())
})?;

for validator in validators {
let percent_matched = validator.frac_matched() * 100.0;
if percent_matched < 5.0 {
bail!("Read '{}' failed validation for region '{}'. \
Percentage of matched bases: {:.2}%", read.read_id, validator.id(), percent_matched);
} else if percent_matched < 50.0 {
warn!("Read '{}' has low percentage of matched bases for region '{}'. \
Percentage of matched bases: {:.2}%", read.read_id, validator.id(), percent_matched);
}
}
}
}
Expand Down Expand Up @@ -240,6 +357,19 @@ impl FromStr for Modality {
}
}

impl ToString for Modality {
fn to_string(&self) -> String {
match self {
Modality::DNA => "dna".to_string(),
Modality::RNA => "rna".to_string(),
Modality::Tag => "tag".to_string(),
Modality::Protein => "protein".to_string(),
Modality::ATAC => "atac".to_string(),
Modality::Crispr => "crispr".to_string(),
}
}
}

#[derive(Debug, Clone, PartialEq)]
pub enum LibraryProtocol {
Standard(String),
Expand Down
8 changes: 8 additions & 0 deletions seqspec/src/region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,14 @@ impl Region {
false
}
}

pub fn len(&self) -> Option<u32> {
if self.min_len == self.max_len {
Some(self.min_len)
} else {
None
}
}
}

fn deserialize_regions<'de, D>(deserializer: D) -> Result<Vec<Arc<Region>>, D::Error>
Expand Down

0 comments on commit 6b8ae88

Please sign in to comment.