From d6206c99c62f43c20ece6bc9602097cae5e3225a Mon Sep 17 00:00:00 2001
From: Kai Zhang <zhangkai33@westlake.edu.cn>
Date: Thu, 17 Oct 2024 15:55:38 +0800
Subject: [PATCH] improve `strip_barcode_from_fastq`

---
 docs/_templates/autosummary/class.rst |   4 +-
 docs/api.rst                          |   4 +-
 docs/conf.py                          |   3 -
 precellar/src/barcode.rs              |   7 +-
 precellar/src/utils.rs                | 111 +++++++++++++++-----
 python/src/utils.rs                   | 143 +++++++++++++++++++-------
 6 files changed, 204 insertions(+), 68 deletions(-)
diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst
index 5df72b1..d73094a 100644
--- a/docs/_templates/autosummary/class.rst
+++ b/docs/_templates/autosummary/class.rst
@@ -11,7 +11,7 @@
    .. autosummary::
       :toctree: .
    {% for item in attributes %}
-      {{ item }}
+      ~{{ objname }}.{{ item }}
    {%- endfor %}
    {% endif %}
    {% endblock %}
@@ -24,7 +24,7 @@
       :toctree: .
    {% for item in methods %}
       {%- if item != '__init__' %}
-      {{ objname }}.{{ item }}
+      ~{{ objname }}.{{ item }}
       {%- endif -%}
    {%- endfor %}
    {% endif %}
diff --git a/docs/api.rst b/docs/api.rst
index f810070..8c6e08a 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -7,8 +7,8 @@ methods.
 
 .. currentmodule:: precellar
 
-SeqSpec
-~~~~~~~
+Assay
+~~~~~
 
 .. autosummary::
     :toctree: _autosummary
diff --git a/docs/conf.py b/docs/conf.py
index 8d5c641..08f23ba 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -2,9 +2,6 @@
 
 import re
 import sys
-import warnings
-import os
-import subprocess
 
 import precellar
 from precellar import utils
diff --git a/precellar/src/barcode.rs b/precellar/src/barcode.rs
index a8ad779..e2f60d1 100644
--- a/precellar/src/barcode.rs
+++ b/precellar/src/barcode.rs
@@ -178,8 +178,13 @@ impl BarcodeCorrector {
     }
 }
 
+/// Barcode correction problem: Given a whitelist of barcodes, and a sequenced barcode with quality scores, 
+/// decide which barcode in the whitelist generated the sequenced barcode.
+
 /// Convert quality scores to base-calling error probabilities.
+/// This is interpreted as the likelihood of being a valid barcode base.
+#[inline(always)]
 fn probability(qual: u8) -> f64 {
-    let offset = 33.0;
+    let offset = 33.0;  // Illumina quality score offset
     10f64.powf(-((qual as f64 - offset) / 10.0))
 }
\ No newline at end of file
diff --git a/precellar/src/utils.rs b/precellar/src/utils.rs
index 198d843..08ab4bd 100644
--- a/precellar/src/utils.rs
+++ b/precellar/src/utils.rs
@@ -1,36 +1,94 @@
-use log::warn;
+use std::ops::Range;
+
 use noodles::fastq;
 use regex::Regex;
 use anyhow::{Result, anyhow};
 use bstr::ByteSlice;
 
-pub fn strip_barcode_from_read_name(
+#[derive(Debug, Clone)]
+pub enum GroupIndex {
+    Position(usize),
+    Named(String),
+}
+
+impl From<usize> for GroupIndex {
+    fn from(i: usize) -> Self {
+        GroupIndex::Position(i)
+    }
+}
+
+impl From<&str> for GroupIndex {
+    fn from(s: &str) -> Self {
+        GroupIndex::Named(s.to_string())
+    }
+}
+
+impl From<String> for GroupIndex {
+    fn from(s: String) -> Self {
+        GroupIndex::Named(s)
+    }
+}
+
+pub fn strip_fastq(
     mut fq: fastq::Record,
     regex: &Regex,
-    left_add: usize,
-    right_add: usize,
-) -> Result<(fastq::Record, String)> {
-    let read_name = fq.name().to_str()?;
-    let (barcode, name) = remove_barcode(read_name, regex, left_add, right_add)?;
-    *fq.name_mut() = name.into();
-    Ok((fq, barcode))
+    group_indices: Option<&[GroupIndex]>,
+    from_description: bool,
+) -> Result<(fastq::Record, Option<Vec<String>>)> {
+    let txt = if from_description {
+        fq.description().to_str()?
+    } else {
+        fq.name().to_str()?
+    };
+
+    let (new_name, matches) = strip_from(txt, regex, group_indices)?;
+
+    if from_description {
+        *fq.description_mut() = new_name.into();
+    } else {
+        *fq.name_mut() = new_name.into();
+    }
+
+    Ok((fq, matches))
 }
 
-fn remove_barcode(name: &str, re: &Regex, left_add: usize, right_add: usize) -> Result<(String, String)> {
-    let mut mat = re.captures(name)
-        .and_then(|x| x.get(1))
-        .ok_or(anyhow!("The regex must contain exactly one capturing group matching the barcode"))?
-        .range();
-    let barcode = name.get(mat.clone()).unwrap().to_string();
-    if barcode.is_empty() {
-        warn!("regex match is empty for read name: {}", name);
-        Ok((barcode, name.to_string()))
+fn strip_from(
+    txt: &str,
+    regex: &Regex,
+    group_indices: Option<&[GroupIndex]>,
+) -> Result<(String, Option<Vec<String>>)> {
+    let caps = regex.captures(txt).ok_or(anyhow!("No match found for: {}", txt))?;
+    let new_name = remove_substrings(txt, caps.iter().skip(1).map(|m| m.unwrap().range()));
+    let matches = if let Some(indices) = group_indices {
+        let matches = indices.iter().map(|idx| match idx {
+            GroupIndex::Position(i) => caps.get(*i+1).map(|m| m.as_str().to_string()),
+            GroupIndex::Named(name) => caps.name(name).map(|m| m.as_str().to_string()),
+        }).collect::<Option<Vec<_>>>().ok_or(anyhow!("Failed to extract barcodes"))?;
+        Some(matches)
     } else {
-        mat = (mat.start - left_add)..(mat.end + right_add);
-        let new_name = name.get(..mat.start).unwrap().to_string() + name.get(mat.end..).unwrap();
-        Ok((barcode, new_name))
+        None
+    };
+    Ok((new_name, matches))
+}
+
+fn remove_substrings<I>(s: &str, ranges: I) -> String
+where
+    I: IntoIterator<Item = Range<usize>>,
+{
+    let mut result = String::with_capacity(s.len());
+    let mut last_index = 0;
+
+    for Range {start, end} in ranges.into_iter() {
+        // Ensure valid ranges and skip invalid ones
+        if start < end && end <= s.len() {
+            result.push_str(&s[last_index..start]); // Append part before the range
+            last_index = end; // Move past the removed range
+        }
     }
 
+    // Append remaining part after the last removed range
+    result.push_str(&s[last_index..]);
+    result
 }
 
 #[cfg(test)]
@@ -40,9 +98,12 @@ mod tests {
     #[test]
     fn test_strip_barcode() {
         let name = "A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG";
-        let re = Regex::new(r"(..:..:..:..):\w+$").unwrap();
-        let (barcode, new_name) = remove_barcode(name, &re, 1, 1).unwrap();
-        assert_eq!(barcode, "bd:69:Y6:10");
-        assert_eq!(new_name, "A01535:24:HW2MMDSX2:2:1359:8513:3458TGATAGGTTG");
+        let re = Regex::new(r"3458(:)(?<barcode>..:..:..:..)(:)(?<umi>[ATCG]+)$").unwrap();
+        let indices = ["barcode".into(), "umi".into()];
+        let (new_name, matches) = strip_from(name, &re, Some(indices.as_slice())).unwrap();
+        let matches = matches.unwrap();
+        assert_eq!(matches[0], "bd:69:Y6:10");
+        assert_eq!(matches[1], "TGATAGGTTG");
+        assert_eq!(new_name, "A01535:24:HW2MMDSX2:2:1359:8513:3458");
     }
 }
\ No newline at end of file
diff --git a/python/src/utils.rs b/python/src/utils.rs
index 92c82cd..dbcdcc3 100644
--- a/python/src/utils.rs
+++ b/python/src/utils.rs
@@ -1,9 +1,9 @@
 use std::{io::{BufReader, BufWriter}, path::PathBuf, str::FromStr};
-use precellar::utils::strip_barcode_from_read_name;
+use precellar::utils::strip_fastq;
 use seqspec::utils::{open_file_for_read, open_file_for_write, Compression};
 use noodles::fastq::{self, Reader, io::Writer};
 use anyhow::Result;
-use pyo3::prelude::*;
+use pyo3::{prelude::*, types::PyDict};
 use regex::Regex;
 
 /// Remove barcode from the read names of fastq records.
@@ -19,34 +19,77 @@ use regex::Regex;
 /// out_fq: Path
 ///     File path to the output fastq file.
 /// regex: str
-///     Extract barcodes from read names of BAM records using regular expressions.
-///     Reguler expressions should contain exactly one capturing group 
-///     (Parentheses group the regex between them) that matches
-///     the barcodes. For example, `barcode_regex="(..:..:..:..):\\w+$"`
-///     extracts `bd:69:Y6:10` from
-///     `A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG`.
-///     You can test your regex on this website: https://regex101.com/.
-/// out_barcode: Path | None
-///     File path to the output fastq file containing the extracted barcodes.
-///     If None, the barcodes will not be written to a file.
-/// left_add: int
-///     Additional bases to strip from the left side of the barcode.
-/// right_add: int
-///     Additional bases to strip from the right side of the barcode.
+///     Extract barcodes from read names or descriptions from the input fastq file using regular expressions.
+///     We use [capturing groups](https://en.wikipedia.org/wiki/Regular_expression#POSIX_basic_and_extended)
+///     to extract and remove the barcodes. For more details, please
+///     read the examples below. You can test your regex on this website: https://regex101.com/.
+/// out_barcode: dict[int | str, Path] | Path | None
+///     A dictionary that maps the index or name of the capturing group to the output fastq file path.
+///     If a single file path is provided, the first capturing group will be written to that file.
+///     If None, none of the barcodes will be written to a file.
+/// from_description: bool
+///    If True, the barcode will be extracted from the description instead of the name.
 /// compression: str | None
 ///     Compression algorithm to use. If None, the compression algorithm will be inferred from the file extension.
 /// compression_level: int | None
 ///     Compression level to use.
 /// num_threads: int
 ///     The number of threads to use.
+/// 
+/// Examples
+/// --------
+/// Suppose the read names in the fastq file contain barcodes and UMIs in the following format:
+/// `A01535:24:HW2MMDSX2:2:1359:8513:3458:bd:69:Y6:10:TGATAGGTTG`,
+/// where `bd:69:Y6:10` corresponds to the barcode and `TGATAGGTTG` corresponds to the UMI.
+/// We can extract the barcode using the following regular expression:
+/// `"(..:..:..:..)(:)\\w+$"`. Here we use two capturing groups. The first group captures the barcode,
+/// and the second group captures the colon that separates the barcode from the UMI.
+/// Texts matched by the capturing groups are removed from the read names.
+/// So the resultant read name will be `A01535:24:HW2MMDSX2:2:1359:8513:3458:TGATAGGTTG`.
+/// Here we use `out_barcode="test_out.fq.zst"` to write the first capturing group to a separate fastq file.
+/// 
+/// >>> from precellar.utils import strip_barcode_from_fastq
+/// >>> strip_barcode_from_fastq(
+/// ...     "test.fq.gz",
+/// ...     "test_out.fq.zst",
+/// ...     regex="(..:..:..:..)(:)\\w+$",
+/// ...     out_barcode="barcode.fq.zst",
+/// ... )
+/// 
+/// It is possible to extract both the barcode and UMI at the same time, using the following regular expression:
+/// `"(..:..:..:..)(:)([ACGTN]+)$"`. Here we use three capturing groups. The first group captures the barcode,
+/// the second group captures the colon that separates the barcode from the UMI, and the third group captures the UMI.
+/// To write the barcode and UMI to separate files, we can use a dictionary:
+/// `out_barcode={0: "barcode.fq.zst", 2: "umi.fq.zst"}`. The integer keys correspond
+/// to the index of the capturing groups.
+/// 
+/// >>> strip_barcode_from_fastq(
+/// ...     "test.fq.gz",
+/// ...     "test_out.fq.zst",
+/// ...     regex="(..:..:..:..)(:)([ACGTN]+)$",
+/// ...     out_barcode={0: "barcode.fq.zst", 2: "umi.fq.zst"},
+/// ... )
+/// 
+/// We can use names to refer to the capturing groups instead of indices:
+/// `"(?<barcode>..:..:..:..)(:)(?<umi>[ACGTN]+)$"`.
+/// 
+/// >>> strip_barcode_from_fastq(
+/// ...     "test.fq.gz",
+/// ...     "test_out.fq.zst",
+/// ...     regex="(?<barcode>..:..:..:..)(:)(?<umi>[ACGTN]+)$",
+/// ...     out_barcode={'barcode': "barcode.fq.zst", 'umi': "umi.fq.zst"},
+/// ... )
+/// 
+/// If the barcode is stored in the description instead of the name, we can set `from_description=True`.
+/// 
 #[pyfunction]
 #[pyo3(
     signature = (in_fq, out_fq,
-        *, regex, out_barcode=None, left_add=0, right_add=0,
+        *, regex, out_barcode=None, from_description=false,
         compression=None, compression_level=None, num_threads=8,
     ),
     text_signature = "(in_fq, out_fq,
-        *, regex, out_barcode, left_add=0, right_add=0,
+        *, regex, out_barcode, from_description=False,
         compression=None, compression_level=None, num_threads=8)",
 )]
 fn strip_barcode_from_fastq(
@@ -54,9 +97,9 @@ fn strip_barcode_from_fastq(
     in_fq: PathBuf,
     out_fq: PathBuf,
     regex: &str,
-    out_barcode: Option<PathBuf>,
-    left_add: usize,
-    right_add: usize,
+    //out_barcode: Option<PathBuf>,
+    out_barcode: Option<Bound<'_, PyAny>>,
+    from_description: bool,
     compression: Option<&str>,
     compression_level: Option<u32>,
     num_threads: u32,
@@ -68,11 +111,37 @@ fn strip_barcode_from_fastq(
             .or((&out_fq).try_into().ok());
         Writer::new(BufWriter::new(open_file_for_write(out_fq, compression, compression_level, num_threads)?))
     };
-    let mut barcode_writer = out_barcode.as_ref().map(|output| {
-        let compression = compression.map(|x| Compression::from_str(x).unwrap())
-            .or(output.try_into().ok());
-        Writer::new(BufWriter::new(open_file_for_write(output, compression, compression_level, num_threads).unwrap()))
-    });
+
+    let (barcode_keys, mut barcode_writers) = if let Some(output) = out_barcode {
+        let mut keys = Vec::new();
+        let mut files = Vec::new();
+        match output.extract::<PathBuf>() {
+            Ok(p) => {
+                keys.push(0.into());
+                files.push(p);
+            },
+            _ => {
+                let dict = output.downcast::<PyDict>().unwrap();
+                dict.iter().for_each(|(k, v)| {
+                    if let Ok(k) = k.extract::<usize>() {
+                        keys.push(k.into());
+                    } else {
+                        keys.push(k.extract::<String>().unwrap().into());
+                    }
+
+                    files.push(v.extract::<PathBuf>().unwrap());
+                });
+            },
+        };
+        let writers = files.into_iter().map(|file| {
+            let compression = compression.map(|x| Compression::from_str(x).unwrap())
+                .or((&file).try_into().ok());
+            Writer::new(BufWriter::new(open_file_for_write(file, compression, compression_level, num_threads).unwrap()))
+        }).collect();
+        (Some(keys), writers)
+    } else {
+        (None, Vec::new())
+    };
 
     let re = Regex::new(regex)?;
     reader.records().enumerate().try_for_each(|(i, record)| {
@@ -80,17 +149,21 @@ fn strip_barcode_from_fastq(
             py.check_signals().unwrap();
         }
         let record = record?;
-        let (record, barcode) = strip_barcode_from_read_name(record, &re, left_add, right_add)?;
+        let (record, barcodes) = strip_fastq(
+            record, &re, barcode_keys.as_ref().map(|x| x.as_slice()), from_description
+        )?;
 
         fq_writer.write_record(&record)?;
-        if let Some(barcode_writer) = &mut barcode_writer {
-            let qual_score = vec![b'~'; barcode.len()];
-            let fq = fastq::Record::new(
-                fastq::record::Definition::new(format!("BARCODE.{}", i), ""),
-                barcode.into_bytes(),
-                qual_score,
-            );
-            barcode_writer.write_record(&fq)?;
+        if let Some(barcodes) = barcodes {
+            barcodes.into_iter().enumerate().for_each(|(i, barcode)| {
+                let qual_score = vec![b'~'; barcode.len()];
+                let fq = fastq::Record::new(
+                    fastq::record::Definition::new(record.name(), ""),
+                    barcode.into_bytes(),
+                    qual_score,
+                );
+                barcode_writers.get_mut(i).unwrap().write_record(&fq).unwrap();
+            })
         }
 
         anyhow::Ok(())