Skip to content

Commit

Permalink
work on seqspec
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Sep 30, 2024
1 parent 1028fb0 commit e7f7177
Show file tree
Hide file tree
Showing 12 changed files with 314 additions and 93 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
*/target
target
*.gz

# Byte-compiled / optimized / DLL files
Expand Down
7 changes: 7 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[workspace]
members = ["seqspec", "precellar", "python"]
resolver = "2"

[workspace.dependencies]
seqspec = { version = "0.1", path = "seqspec" }
precellar = { version = "0.1", path = "precellar" }
2 changes: 2 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ methods.
.. autosummary::
:toctree: _autosummary

SeqSpec

make_genome_index
align
make_fragment
Expand Down
3 changes: 1 addition & 2 deletions precellar/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ edition = "2021"
[dependencies]
anyhow = "1.0"
bed-utils = "0.5.1"
#bwa = { git = "https://github.com/regulatory-genomics/bwa-rust.git", rev = "69d482501956039588f94ce9f87367d7ae8f19af" }
bwa-mem2 = { git = "https://github.com/regulatory-genomics/bwa-mem2-rust.git", rev = "07eda9b9c2815ae52b3fa30b01de0e19fae31fe0" }
bstr = "1.0"
cached-path = "0.6"
Expand All @@ -22,6 +21,6 @@ kdam = "0.5.2"
rayon = "1.10"
smallvec = "1.13"
serde = "1.0"
serde_yaml = "0.9"
seqspec = { version = "0.1", workspace = true }
regex = "1.6"
zstd = { version = "0.13", features = ["zstdmt"] }
10 changes: 5 additions & 5 deletions precellar/src/align.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::barcode::{BarcodeCorrector, Whitelist};
use crate::seqspec::{Assay, Modality, Read, Region, RegionType, SequenceType};
use seqspec::{Assay, Modality, Read, Region, RegionType, SequenceType};
use crate::qc::{AlignQC, Metrics};

use bstr::BString;
Expand Down Expand Up @@ -207,7 +207,7 @@ impl<A: Alinger> FastqProcessor<A> {
pub fn gen_raw_fastq_records(&self) -> FastqRecords<impl BufRead> {
let modality = self.modality();
let data = self.assay.get_index_of(modality)
.map(|(read, regions)| (read, regions, read.read_fastq(self.base_dir.clone())));
.map(|(read, regions)| (read, regions, crate::io::read_fastq(read, self.base_dir.clone())));
FastqRecords::new(data)
}

Expand All @@ -219,7 +219,7 @@ impl<A: Alinger> FastqProcessor<A> {
.find(|(_, index)| index.into_iter().any(|x| x.0.region_type.is_barcode())).unwrap();
let range = index.into_iter().find(|x| x.0.region_type.is_barcode()).unwrap().1;

read.read_fastq(&self.base_dir).records().for_each(|record| {
crate::io::read_fastq(read, &self.base_dir).records().for_each(|record| {
let mut record = record.unwrap();
record = slice_fastq_record(&record, range.start as usize, range.end as usize);
if read.is_reverse() {
Expand All @@ -242,7 +242,7 @@ impl<A: Alinger> FastqProcessor<A> {
}
let region = regions[0];
if region.sequence_type == SequenceType::Onlist {
Ok(Whitelist::new(region.onlist.as_ref().unwrap().read()?))
Ok(Whitelist::new(crate::io::read_onlist(region.onlist.as_ref().unwrap())?))
} else {
Ok(Whitelist::empty())
}
Expand Down Expand Up @@ -270,7 +270,7 @@ impl<R: BufRead> FastqRecords<R> {
{
let records = iter.map(|(read, regions, reader)|
FastqRecord {
id: read.id().to_string(),
id: read.read_id.to_string(),
is_reverse: read.is_reverse(),
subregion: regions.into_iter().filter_map(|x| {
let region_type = x.0.region_type;
Expand Down
38 changes: 37 additions & 1 deletion precellar/src/io.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::{fs::File, io::{BufWriter, Write}, path::{Path, PathBuf}, str::FromStr};
use std::{fs::File, io::{BufRead, BufReader, BufWriter, Write}, path::{Path, PathBuf}, str::FromStr};
use anyhow::{Context, Result, anyhow};
use noodles::fastq;
use cached_path::Cache;

/// Open a file, possibly compressed. Supports gzip and zstd.
pub fn open_file_for_read<P: AsRef<Path>>(file: P) -> Box<dyn std::io::Read> {
Expand Down Expand Up @@ -80,4 +82,38 @@ pub fn open_file_for_write<P: AsRef<Path>>(
},
};
Ok(writer)
}

pub fn read_fastq<P: AsRef<Path>>(read: &seqspec::Read, base_dir: P) -> fastq::Reader<impl BufRead> {
let base_dir = base_dir.as_ref().to_path_buf();
let reader = multi_reader::MultiReader::new(
read.files.clone().unwrap().into_iter().map(move |file| open_file(&file, &base_dir))
);
fastq::Reader::new(BufReader::new(reader))
}

pub fn read_onlist(onlist: &seqspec::Onlist) -> Result<Vec<String>> {
let cache = Cache::new()?;
let file = cache.cached_path(&onlist.url)?;
let reader = std::io::BufReader::new(open_file_for_read(file));
Ok(reader.lines().map(|x| x.unwrap()).collect())
}

fn open_file<P: AsRef<Path>>(file: &seqspec::File, base_dir: P) -> Box<dyn std::io::Read> {
match file.urltype {
seqspec::UrlType::Local => {
let mut path = PathBuf::from(&file.url);
path = if path.is_absolute() {
path
} else {
base_dir.as_ref().join(path)
};
Box::new(open_file_for_read(path))
}
_ => {
let cache = Cache::new().unwrap();
let file = cache.cached_path(&file.url).unwrap();
Box::new(open_file_for_read(file))
}
}
}
1 change: 0 additions & 1 deletion precellar/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
pub mod seqspec;
pub mod barcode;
pub mod align;
pub mod fragment;
Expand Down
6 changes: 4 additions & 2 deletions python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ crate-type = ["cdylib"]

[dependencies]
anyhow = "1.0"
#bwa = { git = "https://github.com/regulatory-genomics/bwa-rust.git", rev = "69d482501956039588f94ce9f87367d7ae8f19af" }
bwa-mem2 = { git = "https://github.com/regulatory-genomics/bwa-mem2-rust.git", rev = "07eda9b9c2815ae52b3fa30b01de0e19fae31fe0" }
bstr = "1.0"
either = "1.13"
itertools = "0.13"
noodles = { version = "0.80", features = ["core", "fastq", "bam", "sam", "bgzf"] }
precellar = { path = "../precellar" }
seqspec = { version = "0.1", workspace = true }
serde_yaml = "0.9"
termtree = "0.5"
precellar = { version = "0.1", workspace = true }
regex = "1.6"
log = "0.4"
env_logger = "0.11"
Expand Down
34 changes: 26 additions & 8 deletions python/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod utils;
mod pyseqspec;

use std::{collections::HashMap, path::PathBuf, str::FromStr};
use bwa_mem2::{AlignerOpts, BurrowsWheelerAligner, FMIndex, PairedEndStats};
Expand All @@ -13,8 +14,10 @@ use ::precellar::{
align::{Alinger, FastqProcessor, NameCollatedRecords},
fragment::FragmentGenerator,
io::{open_file_for_write, Compression},
qc::{FragmentQC, Metrics, AlignQC}, seqspec::{Assay, Modality},
qc::{FragmentQC, Metrics, AlignQC},
};
use pyseqspec::SeqSpec;
use seqspec::{Assay, Modality};

#[cfg(not(target_env = "msvc"))]
use tikv_jemallocator::Jemalloc;
Expand Down Expand Up @@ -46,8 +49,9 @@ fn make_genome_index(
/// Parameters
/// ----------
///
/// seqspec: Path
/// File path to the sequencing specification, see https://github.com/pachterlab/seqspec.
/// seqspec: SeqSpec | Path
/// A SeqSpec object or file path to the yaml sequencing specification file, see
/// https://github.com/pachterlab/seqspec.
/// genom_index: Path
/// File path to the genome index. The genome index can be created by the `make_genome_index` function.
/// modality: str
Expand Down Expand Up @@ -95,7 +99,7 @@ fn make_genome_index(
)]
fn align(
py: Python<'_>,
seqspec: PathBuf,
seqspec: Bound<'_, PyAny>,
genome_index: PathBuf,
modality: &str,
output_bam: Option<PathBuf>,
Expand All @@ -108,11 +112,23 @@ fn align(
temp_dir: Option<PathBuf>,
num_threads: u32,
) -> Result<HashMap<String, f64>> {
let modality = Modality::from_str(modality).unwrap();

assert!(output_bam.is_some() || output_fragment.is_some(), "either output_bam or output_fragment must be provided");

let spec = Assay::from_path(&seqspec).unwrap();
let modality = Modality::from_str(modality).unwrap();
let spec;
let base_dir;
match seqspec.extract::<PathBuf>() {
Ok(p) => {
spec = Assay::from_path(&p).unwrap();
base_dir = p.parent().unwrap().to_path_buf();
},
_ => {
let s: PyRef<SeqSpec> = seqspec.extract()?;
spec = s.0.clone();
base_dir = ".".into();
}
}

let aligner = BurrowsWheelerAligner::new(
FMIndex::read(genome_index).unwrap(),
AlignerOpts::default().with_n_threads(num_threads as usize),
Expand All @@ -121,7 +137,7 @@ fn align(
let header = aligner.header();
let mut processor = FastqProcessor::new(spec, aligner).with_modality(modality)
.with_barcode_correct_prob(0.9)
.with_base_dir(seqspec.parent().unwrap());
.with_base_dir(base_dir);
let mut fragment_qc = FragmentQC::default();
mito_dna.into_iter().for_each(|x| {
processor.add_mito_dna(&x);
Expand Down Expand Up @@ -255,6 +271,8 @@ fn precellar(m: &Bound<'_, PyModule>) -> PyResult<()> {

m.add("__version__", env!("CARGO_PKG_VERSION"))?;

m.add_class::<pyseqspec::SeqSpec>().unwrap();

m.add_function(wrap_pyfunction!(make_genome_index, m)?)?;
m.add_function(wrap_pyfunction!(align, m)?)?;
m.add_function(wrap_pyfunction!(make_fragment, m)?)?;
Expand Down
135 changes: 135 additions & 0 deletions python/src/pyseqspec.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
use std::{path::PathBuf, str::FromStr};

use pyo3::prelude::*;
use seqspec::{Assay, File, Modality, Read, Region, Strand, UrlType};
use anyhow::Result;
use termtree::Tree;

/** A SeqSpec object.
A SeqSpec object is used to annotate sequencing libraries produced by genomics assays.
Genomic library structure depends on both the assay and sequencer (and kits) used to
generate and bind the assay-specific construct to the sequencing adapters to generate
a sequencing library. SeqSpec is specific to both a genomics assay and sequencer
and provides a standardized format for describing the structure of sequencing
libraries and the resulting sequencing reads. See https://github.com/pachterlab/seqspec for more details.
Parameters
----------
path
Path to the AnnData file.
See Also
--------
align
*/
#[pyclass]
#[repr(transparent)]
pub struct SeqSpec(pub(crate) Assay);

#[pymethods]
impl SeqSpec {
#[new]
#[pyo3(signature = (path))]
pub fn new(path: &str) -> Result<Self> {
let assay = Assay::from_path(path)?;
Ok(SeqSpec(assay))
}

/// Add a fastq file containing reads to the AnnData object.
///
/// Parameters
/// ----------
/// read_id: str
/// The id of the read.
/// modality: str
/// The modality of the read.
/// primer_id: str
/// The id of the primer.
/// is_reverse: bool
/// Whether the read is reverse.
/// fastq: Path | list[Path]
/// The path to the fastq file containing the reads.
#[pyo3(
signature = (read_id, *, modality, primer_id, is_reverse, fastq),
text_signature = "($self, read_id, *, modality, primer_id, is_reverse, fastq)",
)]
pub fn add_read(
&mut self,
read_id: &str,
modality: &str,
primer_id: &str,
is_reverse: bool,
fastq: Bound<'_, PyAny>,
) -> Result<()> {
let fastq = if fastq.is_instance_of::<pyo3::types::PyList>() {
fastq.extract::<Vec<PathBuf>>()?
} else {
vec![fastq.extract::<PathBuf>()?]
};

let assay = &mut self.0;

let mut reads = assay.sequence_spec.take().unwrap_or(Vec::new());
reads = reads.into_iter().filter(|r| r.read_id != read_id).collect();

let mut read = Read::default();
if is_reverse {
read.strand = Strand::Neg;
} else {
read.strand = Strand::Pos;
}
read.modality = Modality::from_str(modality)?;
read.primer_id = primer_id.to_string();
read.files = Some(fastq.into_iter().map(|path| make_file_path(path)).collect::<Result<Vec<File>>>()?);

reads.push(read);
assay.sequence_spec = Some(reads);
Ok(())
}

/*
/// Identify the position of elements in a spec.
///
/// Parameters
/// ----------
/// read_id: str
/// The id of the read.
#[pyo3(
signature = (modality=None),
text_signature = "($self, modality=None)",
)]
pub fn index(&mut self, modality: &str) -> Result<()> {
*/

#[pyo3(text_signature = "($self)")]
pub fn to_yaml(&self) -> String {
serde_yaml::to_string(&self.0).unwrap()
}

fn __repr__(&self) -> String {
let assay = &self.0;
let tree = Tree::new("".to_string())
.with_leaves(assay.library_spec.as_ref().unwrap_or(&Vec::new()).iter().map(|region| build_tree(region)));
format!("{}", tree)
}
}

fn build_tree(region: &Region) -> Tree<String> {
Tree::new(region.region_id.clone())
.with_leaves(region.regions.as_ref().unwrap_or(&Vec::new())
.iter().map(|child| build_tree(child)))
}

fn make_file_path(path: PathBuf) -> Result<File> {
let file = std::fs::File::open(&path)?;
Ok(File {
file_id: path.file_name().unwrap().to_str().unwrap().to_string(),
filename: path.file_name().unwrap().to_str().unwrap().to_string(),
filetype: "fastq".to_string(),
filesize: file.metadata()?.len(),
url: path.to_str().unwrap().to_string(),
urltype: UrlType::Local,
md5: "0".to_string(),
})
}
10 changes: 10 additions & 0 deletions seqspec/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "seqspec"
version = "0.1.0"
edition = "2021"

[dependencies]
anyhow = "1.0"
log = "0.4"
serde = { version = "1.0", features = ["derive"] }
serde_yaml = "0.9"
Loading

0 comments on commit e7f7177

Please sign in to comment.