update tutorial

regulatory-genomics · Oct 1, 2024 · 0ee24fb · 0ee24fb
1 parent d4d4883
commit 0ee24fb
Show file tree

Hide file tree

Showing 5 changed files with 74 additions and 110 deletions.
diff --git a/docs/tutorials/generic.ipynb b/docs/tutorials/generic.ipynb
@@ -86,12 +86,14 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[90m[\u001b[0m2024-10-01T05:23:33Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Treating ../../seqspec_templates/generic_atac.yaml as local file\n"
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Starting download of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Downloaded 2643 bytes\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m New version of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml cached\n"
      ]
     }
    ],
    "source": [
-    "assay = precellar.SeqSpec(\"../../seqspec_templates/generic_atac.yaml\")"
+    "assay = precellar.SeqSpec(\"https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\")"
    ]
   },
   {
@@ -105,9 +107,9 @@
        "\n",
        "└── atac(153-1150)\n",
        "    ├── atac-illumina_p5(29)\n",
-       "    ├── atac-read1(34)\n",
+       "    ├── atac-read1(34) [↓R1(1-98)]\n",
        "    ├── gDNA(1-1000)\n",
-       "    ├── atac-read2(34)\n",
+       "    ├── atac-read2(34) [↑R2(1-98), ↓I1(22)]\n",
        "    ├── atac-cell_barcode(22)\n",
        "    └── atac-illumina_p7(24)"
       ]
@@ -127,32 +129,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Read 1\n",
-    "assay.add_read(\n",
-    "    read_id=\"atac-R1\",\n",
-    "    fastq=\"R1_processed.fq.zst\",\n",
-    "    modality=\"atac\",\n",
-    "    is_reverse=False,\n",
-    "    primer_id=\"atac-read1\",\n",
-    ")\n",
-    "\n",
-    "# Index Read\n",
-    "assay.add_read(\n",
-    "    read_id=\"atac-I1\",\n",
-    "    fastq=\"I1.fq.zst\",\n",
-    "    modality=\"atac\",\n",
-    "    is_reverse=False,\n",
-    "    primer_id=\"atac-read2\",\n",
-    ")\n",
-    "\n",
-    "# Read 2\n",
-    "assay.add_read(\n",
-    "    read_id=\"atac-R2\",\n",
-    "    fastq=\"R2_processed.fq.zst\",\n",
-    "    modality=\"atac\",\n",
-    "    is_reverse=True,\n",
-    "    primer_id=\"atac-read2\",\n",
-    ")"
+    "assay.update_read(\"R1\", fastq=\"R1_processed.fq.zst\")\n",
+    "assay.update_read(\"I1\", fastq=\"I1.fq.zst\")\n",
+    "assay.update_read(\"R2\", fastq=\"R2_processed.fq.zst\")"
    ]
   },
   {
@@ -166,9 +145,9 @@
        "\n",
        "└── atac(153-1150)\n",
        "    ├── atac-illumina_p5(29)\n",
-       "    ├── atac-read1(34) [↓atac-R1(51)]\n",
+       "    ├── atac-read1(34) [↓R1(51)]\n",
        "    ├── gDNA(1-1000)\n",
-       "    ├── atac-read2(34) [↓atac-I1(22), ↑atac-R2(51)]\n",
+       "    ├── atac-read2(34) [↑R2(51), ↓I1(22)]\n",
        "    ├── atac-cell_barcode(22)\n",
        "    └── atac-illumina_p7(24)"
       ]
@@ -184,20 +163,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
-      "\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
-      "\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 2500 barcodes. 100.00% of them have an exact match in whitelist\n",
-      "\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
-      "\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
-      "\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
-      "100%|██████████| 2500/2500 [00:00<00:00, 15087.34it/s]"
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 2500 barcodes. 100.00% of them have an exact match in whitelist\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "100%|██████████| 2500/2500 [00:00<00:00, 15545.42it/s]"
      ]
     }
    ],
@@ -212,27 +192,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'frac_q30_bases_read2': 0.9442745098039216,\n",
+       "{'frac_q30_bases_read1': 0.8179764705882353,\n",
+       " 'frac_valid_barcode': 1.0,\n",
+       " 'sequenced_read_pairs': 2500.0,\n",
+       " 'frac_q30_bases_barcode': 1.0,\n",
+       " 'frac_unmapped': 0.07640000000000002,\n",
        " 'sequenced_reads': 5000.0,\n",
        " 'frac_fragment_flanking_single_nucleosome': 0.0029791459781529296,\n",
-       " 'frac_q30_bases_read1': 0.8179764705882353,\n",
        " 'frac_confidently_mapped': 0.8524,\n",
-       " 'frac_nonnuclear': 0.0128,\n",
        " 'frac_fragment_in_nucleosome_free_region': 0.010427010923535254,\n",
-       " 'frac_unmapped': 0.07640000000000002,\n",
-       " 'frac_valid_barcode': 1.0,\n",
-       " 'sequenced_read_pairs': 2500.0,\n",
-       " 'frac_duplicates': 0.004940711462450593,\n",
-       " 'frac_q30_bases_barcode': 1.0}"
+       " 'frac_q30_bases_read2': 0.9442745098039216,\n",
+       " 'frac_nonnuclear': 0.0128,\n",
+       " 'frac_duplicates': 0.004940711462450593}"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }

diff --git a/docs/tutorials/txg_multiome.ipynb b/docs/tutorials/txg_multiome.ipynb
@@ -39,7 +39,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[90m[\u001b[0m2024-09-30T16:24:31Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Cached version of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/10x_rna_atac.yaml is up-to-date\n"
+      "\u001b[90m[\u001b[0m2024-10-01T15:16:56Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Cached version of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/10x_rna_atac.yaml is up-to-date\n"
      ]
     }
    ],
@@ -65,20 +65,20 @@
        "\n",
        "├── rna(153-1150)\n",
        "│   ├── rna-illumina_p5(29)\n",
-       "│   ├── rna-truseq_read1(29)\n",
+       "│   ├── rna-truseq_read1(29) [↓rna-R1(28)]\n",
        "│   ├── rna-cell_barcode(16)\n",
        "│   ├── rna-umi(12)\n",
        "│   ├── rna-cDNA(1-1000)\n",
-       "│   ├── rna-truseq_read2(34)\n",
+       "│   ├── rna-truseq_read2(34) [↓rna-I1(8), ↑rna-R2(1-98)]\n",
        "│   ├── rna-index7(8)\n",
        "│   └── rna-illumina_p7(24)\n",
        "└── atac(153-1150)\n",
        "    ├── atac-illumina_p5(29)\n",
        "    ├── atac-cell_barcode(16)\n",
        "    ├── atac-linker(8)\n",
-       "    ├── atac-nextera_read1(33)\n",
+       "    ├── atac-nextera_read1(33) [↓atac-R1(1-98), ↑atac-I2(24)]\n",
        "    ├── atac-gDNA(1-1000)\n",
-       "    ├── atac-nextera_read2(34)\n",
+       "    ├── atac-nextera_read2(34) [↓atac-I1(8), ↑atac-R2(1-98)]\n",
        "    ├── atac-index7(8)\n",
        "    └── atac-illumina_p7(24)"
       ]
@@ -107,39 +107,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Read 1\n",
-    "assay.add_read(\n",
-    "    read_id=\"atac-R1\",\n",
-    "    fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R1_001.fq.zst\",\n",
-    "    modality=\"atac\",\n",
-    "    is_reverse=False,\n",
-    "    primer_id=\"atac-nextera_read1\",\n",
-    ")\n",
-    "\n",
-    "# Index Read\n",
-    "assay.add_read(\n",
-    "    read_id=\"atac-I1\",\n",
-    "    fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R2_001.fq.zst\",\n",
-    "    modality=\"atac\",\n",
-    "    is_reverse=True,\n",
-    "    primer_id=\"atac-nextera_read1\",\n",
-    ")\n",
-    "\n",
-    "# Read 2\n",
-    "assay.add_read(\n",
-    "    read_id=\"atac-R2\",\n",
-    "    fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R3_001.fq.zst\",\n",
-    "    modality=\"atac\",\n",
-    "    is_reverse=True,\n",
-    "    primer_id=\"atac-nextera_read2\",\n",
-    ")"
+    "assay.update_read(\"atac-R1\", fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R1_001.fq.zst\")\n",
+    "assay.update_read(\"atac-I2\", fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R2_001.fq.zst\")\n",
+    "assay.update_read(\"atac-R2\", fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R3_001.fq.zst\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can visualize the SeqSpec object again after adding the ATAC data. Note that read information is added to the tree."
+    "We can visualize the SeqSpec object again after adding the ATAC data. Note that read information has been updated, reflecting the real read length of the fastq files."
    ]
   },
   {
@@ -153,20 +130,20 @@
        "\n",
        "├── rna(153-1150)\n",
        "│   ├── rna-illumina_p5(29)\n",
-       "│   ├── rna-truseq_read1(29)\n",
+       "│   ├── rna-truseq_read1(29) [↓rna-R1(28)]\n",
        "│   ├── rna-cell_barcode(16)\n",
        "│   ├── rna-umi(12)\n",
        "│   ├── rna-cDNA(1-1000)\n",
-       "│   ├── rna-truseq_read2(34)\n",
+       "│   ├── rna-truseq_read2(34) [↓rna-I1(8), ↑rna-R2(1-98)]\n",
        "│   ├── rna-index7(8)\n",
        "│   └── rna-illumina_p7(24)\n",
        "└── atac(153-1150)\n",
        "    ├── atac-illumina_p5(29)\n",
        "    ├── atac-cell_barcode(16)\n",
        "    ├── atac-linker(8)\n",
-       "    ├── atac-nextera_read1(33) [↓atac-R1(150), ↑atac-I1(24)]\n",
+       "    ├── atac-nextera_read1(33) [↓atac-R1(150), ↑atac-I2(24)]\n",
        "    ├── atac-gDNA(1-1000)\n",
-       "    ├── atac-nextera_read2(34) [↑atac-R2(150)]\n",
+       "    ├── atac-nextera_read2(34) [↓atac-I1(8), ↑atac-R2(150)]\n",
        "    ├── atac-index7(8)\n",
        "    └── atac-illumina_p7(24)"
       ]
@@ -215,14 +192,14 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[90m[\u001b[0m2024-09-30T16:24:39Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
-      "\u001b[90m[\u001b[0m2024-09-30T16:24:39Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Cached version of https://teichlab.github.io/scg_lib_structs/data/10X-Genomics/atac_737K-arc-v1.txt.gz is up-to-date\n",
-      "\u001b[90m[\u001b[0m2024-09-30T16:24:39Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
-      "\u001b[90m[\u001b[0m2024-09-30T16:24:46Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 12387063 barcodes. 94.19% of them have an exact match in whitelist\n",
-      "\u001b[90m[\u001b[0m2024-09-30T16:24:46Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
-      "\u001b[90m[\u001b[0m2024-09-30T16:24:46Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
-      "\u001b[90m[\u001b[0m2024-09-30T16:24:46Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
-      "100%|██████████| 12387063/12387063 [05:29<00:00, 37567.71it/s]"
+      "\u001b[90m[\u001b[0m2024-10-01T15:17:03Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:17:04Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Cached version of https://teichlab.github.io/scg_lib_structs/data/10X-Genomics/atac_737K-arc-v1.txt.gz is up-to-date\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:17:04Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:17:11Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 12387063 barcodes. 94.19% of them have an exact match in whitelist\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:17:11Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:17:11Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:17:11Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "100%|██████████| 12387063/12387063 [05:33<00:00, 37127.51it/s]"
      ]
     }
    ],
@@ -244,14 +221,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'frac_q30_bases_barcode': 0.9219054690365263, 'frac_nonnuclear': 7.443249461151526e-05, 'frac_valid_barcode': 0.9815396111249293, 'frac_unmapped': 0.008819422106194463, 'frac_confidently_mapped': 0.9292046064511015, 'frac_fragment_in_nucleosome_free_region': 4.518862917979508e-05, 'frac_duplicates': 0.33183332695206896, 'sequenced_reads': 24774126.0, 'frac_q30_bases_read1': 0.9173089515515771, 'frac_q30_bases_read2': 0.9229901653577338, 'sequenced_read_pairs': 12387063.0, 'frac_fragment_flanking_single_nucleosome': 2.912761037492815e-05}\n"
+      "{'frac_fragment_flanking_single_nucleosome': 2.912761037492815e-05, 'frac_confidently_mapped': 0.9292046064511015, 'frac_q30_bases_read1': 0.9173089515515771, 'frac_valid_barcode': 0.9815396111249293, 'frac_duplicates': 0.33183332695206896, 'frac_unmapped': 0.008819422106194463, 'frac_q30_bases_barcode': 0.9219054690365263, 'frac_nonnuclear': 7.443249461151526e-05, 'frac_q30_bases_read2': 0.9229901653577338, 'sequenced_read_pairs': 12387063.0, 'sequenced_reads': 24774126.0, 'frac_fragment_in_nucleosome_free_region': 4.518862917979508e-05}\n"
      ]
     }
    ],

diff --git a/precellar/src/align.rs b/precellar/src/align.rs
@@ -206,8 +206,14 @@ impl<A: Alinger> FastqProcessor<A> {
 
     pub fn gen_raw_fastq_records(&self) -> FastqRecords<impl BufRead> {
         let modality = self.modality();
-        let data = self.assay.get_index_of(modality)
-            .map(|(read, regions)| (read, regions, crate::io::read_fastq(read, self.base_dir.clone())));
+        let data = self.assay.get_index_of(modality).filter_map(|(read, regions)| {
+            let regions: Vec<_> = regions.into_iter().filter(|x| x.0.region_type.is_target()).collect();
+            if regions.is_empty() {
+                None
+            } else {
+                Some((read, regions, crate::io::read_fastq(read, self.base_dir.clone()).unwrap()))
+            }
+        });
         FastqRecords::new(data)
     }
 
@@ -220,7 +226,7 @@ impl<A: Alinger> FastqProcessor<A> {
             .expect("No barcode region found");
         let range = index.into_iter().find(|x| x.0.region_type.is_barcode()).unwrap().1;
 
-        crate::io::read_fastq(read, &self.base_dir).records().for_each(|record| {
+        crate::io::read_fastq(read, &self.base_dir).unwrap().records().for_each(|record| {
             let mut record = record.unwrap();
             record = slice_fastq_record(&record, range.start as usize, range.end as usize);
             if read.is_reverse() {
@@ -273,14 +279,7 @@ impl<R: BufRead> FastqRecords<R> {
             FastqRecord {
                 id: read.read_id.to_string(),
                 is_reverse: read.is_reverse(),
-                subregion: regions.into_iter().filter_map(|x| {
-                    let region_type = x.0.region_type;
-                    if region_type.is_barcode() || region_type.is_dna() {
-                        Some((region_type, x.1))
-                    } else {
-                        None
-                    }
-                }).collect(),
+                subregion: regions.into_iter().map(|x| (x.0.region_type, x.1)).collect(),
                 reader,
                 min_len: read.min_len as usize,
                 max_len: read.max_len as usize,

diff --git a/precellar/src/io.rs b/precellar/src/io.rs
@@ -84,16 +84,19 @@ pub fn open_file_for_write<P: AsRef<Path>>(
     Ok(writer)
 }
 
-pub fn read_fastq<P: AsRef<Path>>(read: &seqspec::Read, base_dir: P) -> fastq::Reader<impl BufRead> {
+pub fn read_fastq<P: AsRef<Path>>(read: &seqspec::Read, base_dir: P) -> Option<fastq::Reader<impl BufRead>> {
+    if read.files.is_none() {
+        return None;
+    }
     let base_dir = base_dir.as_ref().to_path_buf();
     let reader = multi_reader::MultiReader::new(
         read.files.clone().unwrap().into_iter().map(move |file| open_file(&file, &base_dir))
     );
-    fastq::Reader::new(BufReader::new(reader))
+    Some(fastq::Reader::new(BufReader::new(reader)))
 }
 
 pub fn get_read_length<P: AsRef<Path>>(read: &seqspec::Read, base_dir: P) -> Result<usize> {
-    let mut reader = read_fastq(read, base_dir);
+    let mut reader = read_fastq(read, base_dir).unwrap();
     let mut record = fastq::Record::default();
     reader.read_record(&mut record)?;
     Ok(record.sequence().len())

diff --git a/seqspec/src/lib.rs b/seqspec/src/lib.rs
@@ -489,6 +489,11 @@ pub enum RegionType {
 }
 
 impl RegionType {
+    /// Either a barcode, UMI, or DNA/cDNA region.
+    pub fn is_target(&self) -> bool {
+        self.is_barcode() || self.is_umi() || self.is_dna()
+    }
+
     pub fn is_barcode(&self) -> bool {
         match self {
             RegionType::Barcode => true,