Skip to content

Commit

Permalink
improve strip_barcode_from_fastq
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Oct 17, 2024
1 parent f48e51a commit d6206c9
Show file tree
Hide file tree
Showing 6 changed files with 204 additions and 68 deletions.
4 changes: 2 additions & 2 deletions docs/_templates/autosummary/class.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
.. autosummary::
:toctree: .
{% for item in attributes %}
{{ item }}
~{{ objname }}.{{ item }}
{%- endfor %}
{% endif %}
{% endblock %}
Expand All @@ -24,7 +24,7 @@
:toctree: .
{% for item in methods %}
{%- if item != '__init__' %}
{{ objname }}.{{ item }}
~{{ objname }}.{{ item }}
{%- endif -%}
{%- endfor %}
{% endif %}
Expand Down
4 changes: 2 additions & 2 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ methods.

.. currentmodule:: precellar

SeqSpec
~~~~~~~
Assay
~~~~~

.. autosummary::
:toctree: _autosummary
Expand Down
3 changes: 0 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@

import re
import sys
import warnings
import os
import subprocess

import precellar
from precellar import utils
Expand Down
7 changes: 6 additions & 1 deletion precellar/src/barcode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,13 @@ impl BarcodeCorrector {
}
}

/// Barcode correction problem: Given a whitelist of barcodes, and a sequenced barcode with quality scores,
/// decide which barcode in the whitelist generated the sequenced barcode.

/// Convert quality scores to base-calling error probabilities.
/// This is interpreted as the likelihood of being a valid barcode base.
#[inline(always)]
fn probability(qual: u8) -> f64 {
let offset = 33.0;
let offset = 33.0; // Illumina quality score offset
10f64.powf(-((qual as f64 - offset) / 10.0))
}
111 changes: 86 additions & 25 deletions precellar/src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,36 +1,94 @@
use log::warn;
use std::ops::Range;

use noodles::fastq;
use regex::Regex;
use anyhow::{Result, anyhow};
use bstr::ByteSlice;

pub fn strip_barcode_from_read_name(
#[derive(Debug, Clone)]
pub enum GroupIndex {
Position(usize),
Named(String),
}

impl From<usize> for GroupIndex {
fn from(i: usize) -> Self {
GroupIndex::Position(i)
}
}

impl From<&str> for GroupIndex {
fn from(s: &str) -> Self {
GroupIndex::Named(s.to_string())
}
}

impl From<String> for GroupIndex {
fn from(s: String) -> Self {
GroupIndex::Named(s)
}
}

pub fn strip_fastq(
mut fq: fastq::Record,
regex: &Regex,
left_add: usize,
right_add: usize,
) -> Result<(fastq::Record, String)> {
let read_name = fq.name().to_str()?;
let (barcode, name) = remove_barcode(read_name, regex, left_add, right_add)?;
*fq.name_mut() = name.into();
Ok((fq, barcode))
group_indices: Option<&[GroupIndex]>,
from_description: bool,
) -> Result<(fastq::Record, Option<Vec<String>>)> {
let txt = if from_description {
fq.description().to_str()?
} else {
fq.name().to_str()?
};

let (new_name, matches) = strip_from(txt, regex, group_indices)?;

if from_description {
*fq.description_mut() = new_name.into();
} else {
*fq.name_mut() = new_name.into();
}

Ok((fq, matches))
}

fn remove_barcode(name: &str, re: &Regex, left_add: usize, right_add: usize) -> Result<(String, String)> {
let mut mat = re.captures(name)
.and_then(|x| x.get(1))
.ok_or(anyhow!("The regex must contain exactly one capturing group matching the barcode"))?
.range();
let barcode = name.get(mat.clone()).unwrap().to_string();
if barcode.is_empty() {
warn!("regex match is empty for read name: {}", name);
Ok((barcode, name.to_string()))
fn strip_from(
txt: &str,
regex: &Regex,
group_indices: Option<&[GroupIndex]>,
) -> Result<(String, Option<Vec<String>>)> {
let caps = regex.captures(txt).ok_or(anyhow!("No match found for: {}", txt))?;
let new_name = remove_substrings(txt, caps.iter().skip(1).map(|m| m.unwrap().range()));
let matches = if let Some(indices) = group_indices {
let matches = indices.iter().map(|idx| match idx {
GroupIndex::Position(i) => caps.get(*i+1).map(|m| m.as_str().to_string()),
GroupIndex::Named(name) => caps.name(name).map(|m| m.as_str().to_string()),
}).collect::<Option<Vec<_>>>().ok_or(anyhow!("Failed to extract barcodes"))?;
Some(matches)
} else {
mat = (mat.start - left_add)..(mat.end + right_add);
let new_name = name.get(..mat.start).unwrap().to_string() + name.get(mat.end..).unwrap();
Ok((barcode, new_name))
None
};
Ok((new_name, matches))
}

fn remove_substrings<I>(s: &str, ranges: I) -> String
where
I: IntoIterator<Item = Range<usize>>,
{
let mut result = String::with_capacity(s.len());
let mut last_index = 0;

for Range {start, end} in ranges.into_iter() {
// Ensure valid ranges and skip invalid ones
if start < end && end <= s.len() {
result.push_str(&s[last_index..start]); // Append part before the range
last_index = end; // Move past the removed range
}
}

// Append remaining part after the last removed range
result.push_str(&s[last_index..]);
result
}

#[cfg(test)]
Expand All @@ -40,9 +98,12 @@ mod tests {
#[test]
fn test_strip_barcode() {
let name = "A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG";
let re = Regex::new(r"(..:..:..:..):\w+$").unwrap();
let (barcode, new_name) = remove_barcode(name, &re, 1, 1).unwrap();
assert_eq!(barcode, "bd:69:Y6:10");
assert_eq!(new_name, "A01535:24:HW2MMDSX2:2:1359:8513:3458TGATAGGTTG");
let re = Regex::new(r"3458(:)(?<barcode>..:..:..:..)(:)(?<umi>[ATCG]+)$").unwrap();
let indices = ["barcode".into(), "umi".into()];
let (new_name, matches) = strip_from(name, &re, Some(indices.as_slice())).unwrap();
let matches = matches.unwrap();
assert_eq!(matches[0], "bd:69:Y6:10");
assert_eq!(matches[1], "TGATAGGTTG");
assert_eq!(new_name, "A01535:24:HW2MMDSX2:2:1359:8513:3458");
}
}
143 changes: 108 additions & 35 deletions python/src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use std::{io::{BufReader, BufWriter}, path::PathBuf, str::FromStr};
use precellar::utils::strip_barcode_from_read_name;
use precellar::utils::strip_fastq;
use seqspec::utils::{open_file_for_read, open_file_for_write, Compression};
use noodles::fastq::{self, Reader, io::Writer};
use anyhow::Result;
use pyo3::prelude::*;
use pyo3::{prelude::*, types::PyDict};
use regex::Regex;

/// Remove barcode from the read names of fastq records.
Expand All @@ -19,44 +19,87 @@ use regex::Regex;
/// out_fq: Path
/// File path to the output fastq file.
/// regex: str
/// Extract barcodes from read names of BAM records using regular expressions.
/// Reguler expressions should contain exactly one capturing group
/// (Parentheses group the regex between them) that matches
/// the barcodes. For example, `barcode_regex="(..:..:..:..):\\w+$"`
/// extracts `bd:69:Y6:10` from
/// `A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG`.
/// You can test your regex on this website: https://regex101.com/.
/// out_barcode: Path | None
/// File path to the output fastq file containing the extracted barcodes.
/// If None, the barcodes will not be written to a file.
/// left_add: int
/// Additional bases to strip from the left side of the barcode.
/// right_add: int
/// Additional bases to strip from the right side of the barcode.
/// Extract barcodes from read names or descriptions from the input fastq file using regular expressions.
/// We use [capturing groups](https://en.wikipedia.org/wiki/Regular_expression#POSIX_basic_and_extended)
/// to extract and remove the barcodes. For more details, please
/// read the examples below. You can test your regex on this website: https://regex101.com/.
/// out_barcode: dict[int | str, Path] | Path | None
/// A dictionary that maps the index or name of the capturing group to the output fastq file path.
/// If a single file path is provided, the first capturing group will be written to that file.
/// If None, none of the barcodes will be written to a file.
/// from_description: bool
/// If True, the barcode will be extracted from the description instead of the name.
/// compression: str | None
/// Compression algorithm to use. If None, the compression algorithm will be inferred from the file extension.
/// compression_level: int | None
/// Compression level to use.
/// num_threads: int
/// The number of threads to use.
///
/// Examples
/// --------
/// Suppose the read names in the fastq file contain barcodes and UMIs in the following format:
/// `A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG`,
/// where `bd:69:Y6:10` corresponds to the barcode and `TGATAGGTTG` corresponds to the UMI.
/// We can extract the barcode using the following regular expression:
/// `"(..:..:..:..)(:)\\w+$"`. Here we use two capturing groups. The first group captures the barcode,
/// and the second group captures the colon that separates the barcode from the UMI.
/// Texts matched by the capturing groups are removed from the read names.
/// So the resultant read name will be `A01535:24:HW2MMDSX2:2:1359:8513:3458:TGATAGGTTG`.
/// Here we use `out_barcode="test_out.fq.zst"` to write the first capturing group to a separate fastq file.
///
/// >>> from precellar.utils import strip_barcode_from_fastq
/// >>> strip_barcode_from_fastq(
/// ... "test.fq.gz",
/// ... "test_out.fq.zst",
/// ... regex="(..:..:..:..)(:)\\w+$",
/// ... out_barcode="barcode.fq.zst",
/// ... )
///
/// It is possible to extract both the barcode and UMI at the same time, using the following regular expression:
/// `"(..:..:..:..)(:)([ACGTN]+)$"`. Here we use three capturing groups. The first group captures the barcode,
/// the second group captures the colon that separates the barcode from the UMI, and the third group captures the UMI.
/// To write the barcode and UMI to separate files, we can use a dictionary:
/// `out_barcode={0: "barcode.fq.zst", 2: "umi.fq.zst"}`. The integer keys correspond
/// to the index of the capturing groups.
///
/// >>> strip_barcode_from_fastq(
/// ... "test.fq.gz",
/// ... "test_out.fq.zst",
/// ... regex="(..:..:..:..)(:)([ACGTN]+)$",
/// ... out_barcode={0: "barcode.fq.zst", 2: "umi.fq.zst"},
/// ... )
///
/// We can use names to refer to the capturing groups instead of indices:
/// `"(?<barcode>..:..:..:..)(:)(?<umi>[ACGTN]+)$"`.
///
/// >>> strip_barcode_from_fastq(
/// ... "test.fq.gz",
/// ... "test_out.fq.zst",
/// ... regex="(?<barcode>..:..:..:..)(:)(?<umi>[ACGTN]+)$",
/// ... out_barcode={'barcode': "barcode.fq.zst", 'umi': "umi.fq.zst"},
/// ... )
///
/// If the barcode is stored in the description instead of the name, we can set `from_description=True`.
///
#[pyfunction]
#[pyo3(
signature = (in_fq, out_fq,
*, regex, out_barcode=None, left_add=0, right_add=0,
*, regex, out_barcode=None, from_description=false,
compression=None, compression_level=None, num_threads=8,
),
text_signature = "(in_fq, out_fq,
*, regex, out_barcode, left_add=0, right_add=0,
*, regex, out_barcode, from_description=False,
compression=None, compression_level=None, num_threads=8)",
)]
fn strip_barcode_from_fastq(
py: Python<'_>,
in_fq: PathBuf,
out_fq: PathBuf,
regex: &str,
out_barcode: Option<PathBuf>,
left_add: usize,
right_add: usize,
//out_barcode: Option<PathBuf>,
out_barcode: Option<Bound<'_, PyAny>>,
from_description: bool,
compression: Option<&str>,
compression_level: Option<u32>,
num_threads: u32,
Expand All @@ -68,29 +111,59 @@ fn strip_barcode_from_fastq(
.or((&out_fq).try_into().ok());
Writer::new(BufWriter::new(open_file_for_write(out_fq, compression, compression_level, num_threads)?))
};
let mut barcode_writer = out_barcode.as_ref().map(|output| {
let compression = compression.map(|x| Compression::from_str(x).unwrap())
.or(output.try_into().ok());
Writer::new(BufWriter::new(open_file_for_write(output, compression, compression_level, num_threads).unwrap()))
});

let (barcode_keys, mut barcode_writers) = if let Some(output) = out_barcode {
let mut keys = Vec::new();
let mut files = Vec::new();
match output.extract::<PathBuf>() {
Ok(p) => {
keys.push(0.into());
files.push(p);
},
_ => {
let dict = output.downcast::<PyDict>().unwrap();
dict.iter().for_each(|(k, v)| {
if let Ok(k) = k.extract::<usize>() {
keys.push(k.into());
} else {
keys.push(k.extract::<String>().unwrap().into());
}

files.push(v.extract::<PathBuf>().unwrap());
});
},
};
let writers = files.into_iter().map(|file| {
let compression = compression.map(|x| Compression::from_str(x).unwrap())
.or((&file).try_into().ok());
Writer::new(BufWriter::new(open_file_for_write(file, compression, compression_level, num_threads).unwrap()))
}).collect();
(Some(keys), writers)
} else {
(None, Vec::new())
};

let re = Regex::new(regex)?;
reader.records().enumerate().try_for_each(|(i, record)| {
if i % 1000000 == 0 {
py.check_signals().unwrap();
}
let record = record?;
let (record, barcode) = strip_barcode_from_read_name(record, &re, left_add, right_add)?;
let (record, barcodes) = strip_fastq(
record, &re, barcode_keys.as_ref().map(|x| x.as_slice()), from_description
)?;

fq_writer.write_record(&record)?;
if let Some(barcode_writer) = &mut barcode_writer {
let qual_score = vec![b'~'; barcode.len()];
let fq = fastq::Record::new(
fastq::record::Definition::new(format!("BARCODE.{}", i), ""),
barcode.into_bytes(),
qual_score,
);
barcode_writer.write_record(&fq)?;
if let Some(barcodes) = barcodes {
barcodes.into_iter().enumerate().for_each(|(i, barcode)| {
let qual_score = vec![b'~'; barcode.len()];
let fq = fastq::Record::new(
fastq::record::Definition::new(record.name(), ""),
barcode.into_bytes(),
qual_score,
);
barcode_writers.get_mut(i).unwrap().write_record(&fq).unwrap();
})
}

anyhow::Ok(())
Expand Down

0 comments on commit d6206c9

Please sign in to comment.