From d6206c99c62f43c20ece6bc9602097cae5e3225a Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 17 Oct 2024 15:55:38 +0800 Subject: [PATCH] improve `strip_barcode_from_fastq` --- docs/_templates/autosummary/class.rst | 4 +- docs/api.rst | 4 +- docs/conf.py | 3 - precellar/src/barcode.rs | 7 +- precellar/src/utils.rs | 111 +++++++++++++++----- python/src/utils.rs | 143 +++++++++++++++++++------- 6 files changed, 204 insertions(+), 68 deletions(-) diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst index 5df72b1..d73094a 100644 --- a/docs/_templates/autosummary/class.rst +++ b/docs/_templates/autosummary/class.rst @@ -11,7 +11,7 @@ .. autosummary:: :toctree: . {% for item in attributes %} - {{ item }} + ~{{ objname }}.{{ item }} {%- endfor %} {% endif %} {% endblock %} @@ -24,7 +24,7 @@ :toctree: . {% for item in methods %} {%- if item != '__init__' %} - {{ objname }}.{{ item }} + ~{{ objname }}.{{ item }} {%- endif -%} {%- endfor %} {% endif %} diff --git a/docs/api.rst b/docs/api.rst index f810070..8c6e08a 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -7,8 +7,8 @@ methods. .. currentmodule:: precellar -SeqSpec -~~~~~~~ +Assay +~~~~~ .. autosummary:: :toctree: _autosummary diff --git a/docs/conf.py b/docs/conf.py index 8d5c641..08f23ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -2,9 +2,6 @@ import re import sys -import warnings -import os -import subprocess import precellar from precellar import utils diff --git a/precellar/src/barcode.rs b/precellar/src/barcode.rs index a8ad779..e2f60d1 100644 --- a/precellar/src/barcode.rs +++ b/precellar/src/barcode.rs @@ -178,8 +178,13 @@ impl BarcodeCorrector { } } +/// Barcode correction problem: Given a whitelist of barcodes, and a sequenced barcode with quality scores, +/// decide which barcode in the whitelist generated the sequenced barcode. + /// Convert quality scores to base-calling error probabilities. +/// This is interpreted as the likelihood of being a valid barcode base. +#[inline(always)] fn probability(qual: u8) -> f64 { - let offset = 33.0; + let offset = 33.0; // Illumina quality score offset 10f64.powf(-((qual as f64 - offset) / 10.0)) } \ No newline at end of file diff --git a/precellar/src/utils.rs b/precellar/src/utils.rs index 198d843..08ab4bd 100644 --- a/precellar/src/utils.rs +++ b/precellar/src/utils.rs @@ -1,36 +1,94 @@ -use log::warn; +use std::ops::Range; + use noodles::fastq; use regex::Regex; use anyhow::{Result, anyhow}; use bstr::ByteSlice; -pub fn strip_barcode_from_read_name( +#[derive(Debug, Clone)] +pub enum GroupIndex { + Position(usize), + Named(String), +} + +impl From for GroupIndex { + fn from(i: usize) -> Self { + GroupIndex::Position(i) + } +} + +impl From<&str> for GroupIndex { + fn from(s: &str) -> Self { + GroupIndex::Named(s.to_string()) + } +} + +impl From for GroupIndex { + fn from(s: String) -> Self { + GroupIndex::Named(s) + } +} + +pub fn strip_fastq( mut fq: fastq::Record, regex: &Regex, - left_add: usize, - right_add: usize, -) -> Result<(fastq::Record, String)> { - let read_name = fq.name().to_str()?; - let (barcode, name) = remove_barcode(read_name, regex, left_add, right_add)?; - *fq.name_mut() = name.into(); - Ok((fq, barcode)) + group_indices: Option<&[GroupIndex]>, + from_description: bool, +) -> Result<(fastq::Record, Option>)> { + let txt = if from_description { + fq.description().to_str()? + } else { + fq.name().to_str()? + }; + + let (new_name, matches) = strip_from(txt, regex, group_indices)?; + + if from_description { + *fq.description_mut() = new_name.into(); + } else { + *fq.name_mut() = new_name.into(); + } + + Ok((fq, matches)) } -fn remove_barcode(name: &str, re: &Regex, left_add: usize, right_add: usize) -> Result<(String, String)> { - let mut mat = re.captures(name) - .and_then(|x| x.get(1)) - .ok_or(anyhow!("The regex must contain exactly one capturing group matching the barcode"))? - .range(); - let barcode = name.get(mat.clone()).unwrap().to_string(); - if barcode.is_empty() { - warn!("regex match is empty for read name: {}", name); - Ok((barcode, name.to_string())) +fn strip_from( + txt: &str, + regex: &Regex, + group_indices: Option<&[GroupIndex]>, +) -> Result<(String, Option>)> { + let caps = regex.captures(txt).ok_or(anyhow!("No match found for: {}", txt))?; + let new_name = remove_substrings(txt, caps.iter().skip(1).map(|m| m.unwrap().range())); + let matches = if let Some(indices) = group_indices { + let matches = indices.iter().map(|idx| match idx { + GroupIndex::Position(i) => caps.get(*i+1).map(|m| m.as_str().to_string()), + GroupIndex::Named(name) => caps.name(name).map(|m| m.as_str().to_string()), + }).collect::>>().ok_or(anyhow!("Failed to extract barcodes"))?; + Some(matches) } else { - mat = (mat.start - left_add)..(mat.end + right_add); - let new_name = name.get(..mat.start).unwrap().to_string() + name.get(mat.end..).unwrap(); - Ok((barcode, new_name)) + None + }; + Ok((new_name, matches)) +} + +fn remove_substrings(s: &str, ranges: I) -> String +where + I: IntoIterator>, +{ + let mut result = String::with_capacity(s.len()); + let mut last_index = 0; + + for Range {start, end} in ranges.into_iter() { + // Ensure valid ranges and skip invalid ones + if start < end && end <= s.len() { + result.push_str(&s[last_index..start]); // Append part before the range + last_index = end; // Move past the removed range + } } + // Append remaining part after the last removed range + result.push_str(&s[last_index..]); + result } #[cfg(test)] @@ -40,9 +98,12 @@ mod tests { #[test] fn test_strip_barcode() { let name = "A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG"; - let re = Regex::new(r"(..:..:..:..):\w+$").unwrap(); - let (barcode, new_name) = remove_barcode(name, &re, 1, 1).unwrap(); - assert_eq!(barcode, "bd:69:Y6:10"); - assert_eq!(new_name, "A01535:24:HW2MMDSX2:2:1359:8513:3458TGATAGGTTG"); + let re = Regex::new(r"3458(:)(?..:..:..:..)(:)(?[ATCG]+)$").unwrap(); + let indices = ["barcode".into(), "umi".into()]; + let (new_name, matches) = strip_from(name, &re, Some(indices.as_slice())).unwrap(); + let matches = matches.unwrap(); + assert_eq!(matches[0], "bd:69:Y6:10"); + assert_eq!(matches[1], "TGATAGGTTG"); + assert_eq!(new_name, "A01535:24:HW2MMDSX2:2:1359:8513:3458"); } } \ No newline at end of file diff --git a/python/src/utils.rs b/python/src/utils.rs index 92c82cd..dbcdcc3 100644 --- a/python/src/utils.rs +++ b/python/src/utils.rs @@ -1,9 +1,9 @@ use std::{io::{BufReader, BufWriter}, path::PathBuf, str::FromStr}; -use precellar::utils::strip_barcode_from_read_name; +use precellar::utils::strip_fastq; use seqspec::utils::{open_file_for_read, open_file_for_write, Compression}; use noodles::fastq::{self, Reader, io::Writer}; use anyhow::Result; -use pyo3::prelude::*; +use pyo3::{prelude::*, types::PyDict}; use regex::Regex; /// Remove barcode from the read names of fastq records. @@ -19,34 +19,77 @@ use regex::Regex; /// out_fq: Path /// File path to the output fastq file. /// regex: str -/// Extract barcodes from read names of BAM records using regular expressions. -/// Reguler expressions should contain exactly one capturing group -/// (Parentheses group the regex between them) that matches -/// the barcodes. For example, `barcode_regex="(..:..:..:..):\\w+$"` -/// extracts `bd:69:Y6:10` from -/// `A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG`. -/// You can test your regex on this website: https://regex101.com/. -/// out_barcode: Path | None -/// File path to the output fastq file containing the extracted barcodes. -/// If None, the barcodes will not be written to a file. -/// left_add: int -/// Additional bases to strip from the left side of the barcode. -/// right_add: int -/// Additional bases to strip from the right side of the barcode. +/// Extract barcodes from read names or descriptions from the input fastq file using regular expressions. +/// We use [capturing groups](https://en.wikipedia.org/wiki/Regular_expression#POSIX_basic_and_extended) +/// to extract and remove the barcodes. For more details, please +/// read the examples below. You can test your regex on this website: https://regex101.com/. +/// out_barcode: dict[int | str, Path] | Path | None +/// A dictionary that maps the index or name of the capturing group to the output fastq file path. +/// If a single file path is provided, the first capturing group will be written to that file. +/// If None, none of the barcodes will be written to a file. +/// from_description: bool +/// If True, the barcode will be extracted from the description instead of the name. /// compression: str | None /// Compression algorithm to use. If None, the compression algorithm will be inferred from the file extension. /// compression_level: int | None /// Compression level to use. /// num_threads: int /// The number of threads to use. +/// +/// Examples +/// -------- +/// Suppose the read names in the fastq file contain barcodes and UMIs in the following format: +/// `A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG`, +/// where `bd:69:Y6:10` corresponds to the barcode and `TGATAGGTTG` corresponds to the UMI. +/// We can extract the barcode using the following regular expression: +/// `"(..:..:..:..)(:)\\w+$"`. Here we use two capturing groups. The first group captures the barcode, +/// and the second group captures the colon that separates the barcode from the UMI. +/// Texts matched by the capturing groups are removed from the read names. +/// So the resultant read name will be `A01535:24:HW2MMDSX2:2:1359:8513:3458:TGATAGGTTG`. +/// Here we use `out_barcode="test_out.fq.zst"` to write the first capturing group to a separate fastq file. +/// +/// >>> from precellar.utils import strip_barcode_from_fastq +/// >>> strip_barcode_from_fastq( +/// ... "test.fq.gz", +/// ... "test_out.fq.zst", +/// ... regex="(..:..:..:..)(:)\\w+$", +/// ... out_barcode="barcode.fq.zst", +/// ... ) +/// +/// It is possible to extract both the barcode and UMI at the same time, using the following regular expression: +/// `"(..:..:..:..)(:)([ACGTN]+)$"`. Here we use three capturing groups. The first group captures the barcode, +/// the second group captures the colon that separates the barcode from the UMI, and the third group captures the UMI. +/// To write the barcode and UMI to separate files, we can use a dictionary: +/// `out_barcode={0: "barcode.fq.zst", 2: "umi.fq.zst"}`. The integer keys correspond +/// to the index of the capturing groups. +/// +/// >>> strip_barcode_from_fastq( +/// ... "test.fq.gz", +/// ... "test_out.fq.zst", +/// ... regex="(..:..:..:..)(:)([ACGTN]+)$", +/// ... out_barcode={0: "barcode.fq.zst", 2: "umi.fq.zst"}, +/// ... ) +/// +/// We can use names to refer to the capturing groups instead of indices: +/// `"(?..:..:..:..)(:)(?[ACGTN]+)$"`. +/// +/// >>> strip_barcode_from_fastq( +/// ... "test.fq.gz", +/// ... "test_out.fq.zst", +/// ... regex="(?..:..:..:..)(:)(?[ACGTN]+)$", +/// ... out_barcode={'barcode': "barcode.fq.zst", 'umi': "umi.fq.zst"}, +/// ... ) +/// +/// If the barcode is stored in the description instead of the name, we can set `from_description=True`. +/// #[pyfunction] #[pyo3( signature = (in_fq, out_fq, - *, regex, out_barcode=None, left_add=0, right_add=0, + *, regex, out_barcode=None, from_description=false, compression=None, compression_level=None, num_threads=8, ), text_signature = "(in_fq, out_fq, - *, regex, out_barcode, left_add=0, right_add=0, + *, regex, out_barcode, from_description=False, compression=None, compression_level=None, num_threads=8)", )] fn strip_barcode_from_fastq( @@ -54,9 +97,9 @@ fn strip_barcode_from_fastq( in_fq: PathBuf, out_fq: PathBuf, regex: &str, - out_barcode: Option, - left_add: usize, - right_add: usize, + //out_barcode: Option, + out_barcode: Option>, + from_description: bool, compression: Option<&str>, compression_level: Option, num_threads: u32, @@ -68,11 +111,37 @@ fn strip_barcode_from_fastq( .or((&out_fq).try_into().ok()); Writer::new(BufWriter::new(open_file_for_write(out_fq, compression, compression_level, num_threads)?)) }; - let mut barcode_writer = out_barcode.as_ref().map(|output| { - let compression = compression.map(|x| Compression::from_str(x).unwrap()) - .or(output.try_into().ok()); - Writer::new(BufWriter::new(open_file_for_write(output, compression, compression_level, num_threads).unwrap())) - }); + + let (barcode_keys, mut barcode_writers) = if let Some(output) = out_barcode { + let mut keys = Vec::new(); + let mut files = Vec::new(); + match output.extract::() { + Ok(p) => { + keys.push(0.into()); + files.push(p); + }, + _ => { + let dict = output.downcast::().unwrap(); + dict.iter().for_each(|(k, v)| { + if let Ok(k) = k.extract::() { + keys.push(k.into()); + } else { + keys.push(k.extract::().unwrap().into()); + } + + files.push(v.extract::().unwrap()); + }); + }, + }; + let writers = files.into_iter().map(|file| { + let compression = compression.map(|x| Compression::from_str(x).unwrap()) + .or((&file).try_into().ok()); + Writer::new(BufWriter::new(open_file_for_write(file, compression, compression_level, num_threads).unwrap())) + }).collect(); + (Some(keys), writers) + } else { + (None, Vec::new()) + }; let re = Regex::new(regex)?; reader.records().enumerate().try_for_each(|(i, record)| { @@ -80,17 +149,21 @@ fn strip_barcode_from_fastq( py.check_signals().unwrap(); } let record = record?; - let (record, barcode) = strip_barcode_from_read_name(record, &re, left_add, right_add)?; + let (record, barcodes) = strip_fastq( + record, &re, barcode_keys.as_ref().map(|x| x.as_slice()), from_description + )?; fq_writer.write_record(&record)?; - if let Some(barcode_writer) = &mut barcode_writer { - let qual_score = vec![b'~'; barcode.len()]; - let fq = fastq::Record::new( - fastq::record::Definition::new(format!("BARCODE.{}", i), ""), - barcode.into_bytes(), - qual_score, - ); - barcode_writer.write_record(&fq)?; + if let Some(barcodes) = barcodes { + barcodes.into_iter().enumerate().for_each(|(i, barcode)| { + let qual_score = vec![b'~'; barcode.len()]; + let fq = fastq::Record::new( + fastq::record::Definition::new(record.name(), ""), + barcode.into_bytes(), + qual_score, + ); + barcode_writers.get_mut(i).unwrap().write_record(&fq).unwrap(); + }) } anyhow::Ok(())