Skip to content

Commit

Permalink
update tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Oct 1, 2024
1 parent d4d4883 commit 0ee24fb
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 110 deletions.
80 changes: 30 additions & 50 deletions docs/tutorials/generic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[90m[\u001b[0m2024-10-01T05:23:33Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Treating ../../seqspec_templates/generic_atac.yaml as local file\n"
"\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Starting download of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Downloaded 2643 bytes\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m New version of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml cached\n"
]
}
],
"source": [
"assay = precellar.SeqSpec(\"../../seqspec_templates/generic_atac.yaml\")"
"assay = precellar.SeqSpec(\"https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\")"
]
},
{
Expand All @@ -105,9 +107,9 @@
"\n",
"└── atac(153-1150)\n",
" ├── atac-illumina_p5(29)\n",
" ├── atac-read1(34)\n",
" ├── atac-read1(34) [↓R1(1-98)]\n",
" ├── gDNA(1-1000)\n",
" ├── atac-read2(34)\n",
" ├── atac-read2(34) [↑R2(1-98), ↓I1(22)]\n",
" ├── atac-cell_barcode(22)\n",
" └── atac-illumina_p7(24)"
]
Expand All @@ -127,32 +129,9 @@
"metadata": {},
"outputs": [],
"source": [
"# Read 1\n",
"assay.add_read(\n",
" read_id=\"atac-R1\",\n",
" fastq=\"R1_processed.fq.zst\",\n",
" modality=\"atac\",\n",
" is_reverse=False,\n",
" primer_id=\"atac-read1\",\n",
")\n",
"\n",
"# Index Read\n",
"assay.add_read(\n",
" read_id=\"atac-I1\",\n",
" fastq=\"I1.fq.zst\",\n",
" modality=\"atac\",\n",
" is_reverse=False,\n",
" primer_id=\"atac-read2\",\n",
")\n",
"\n",
"# Read 2\n",
"assay.add_read(\n",
" read_id=\"atac-R2\",\n",
" fastq=\"R2_processed.fq.zst\",\n",
" modality=\"atac\",\n",
" is_reverse=True,\n",
" primer_id=\"atac-read2\",\n",
")"
"assay.update_read(\"R1\", fastq=\"R1_processed.fq.zst\")\n",
"assay.update_read(\"I1\", fastq=\"I1.fq.zst\")\n",
"assay.update_read(\"R2\", fastq=\"R2_processed.fq.zst\")"
]
},
{
Expand All @@ -166,9 +145,9 @@
"\n",
"└── atac(153-1150)\n",
" ├── atac-illumina_p5(29)\n",
" ├── atac-read1(34) [↓atac-R1(51)]\n",
" ├── atac-read1(34) [↓R1(51)]\n",
" ├── gDNA(1-1000)\n",
" ├── atac-read2(34) [↓atac-I1(22), ↑atac-R2(51)]\n",
" ├── atac-read2(34) [↑R2(51), ↓I1(22)]\n",
" ├── atac-cell_barcode(22)\n",
" └── atac-illumina_p7(24)"
]
Expand All @@ -184,20 +163,21 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
"\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 2500 barcodes. 100.00% of them have an exact match in whitelist\n",
"\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
"\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-10-01T05:24:23Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"100%|██████████| 2500/2500 [00:00<00:00, 15087.34it/s]"
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 2500 barcodes. 100.00% of them have an exact match in whitelist\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"100%|██████████| 2500/2500 [00:00<00:00, 15545.42it/s]"
]
}
],
Expand All @@ -212,27 +192,27 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'frac_q30_bases_read2': 0.9442745098039216,\n",
"{'frac_q30_bases_read1': 0.8179764705882353,\n",
" 'frac_valid_barcode': 1.0,\n",
" 'sequenced_read_pairs': 2500.0,\n",
" 'frac_q30_bases_barcode': 1.0,\n",
" 'frac_unmapped': 0.07640000000000002,\n",
" 'sequenced_reads': 5000.0,\n",
" 'frac_fragment_flanking_single_nucleosome': 0.0029791459781529296,\n",
" 'frac_q30_bases_read1': 0.8179764705882353,\n",
" 'frac_confidently_mapped': 0.8524,\n",
" 'frac_nonnuclear': 0.0128,\n",
" 'frac_fragment_in_nucleosome_free_region': 0.010427010923535254,\n",
" 'frac_unmapped': 0.07640000000000002,\n",
" 'frac_valid_barcode': 1.0,\n",
" 'sequenced_read_pairs': 2500.0,\n",
" 'frac_duplicates': 0.004940711462450593,\n",
" 'frac_q30_bases_barcode': 1.0}"
" 'frac_q30_bases_read2': 0.9442745098039216,\n",
" 'frac_nonnuclear': 0.0128,\n",
" 'frac_duplicates': 0.004940711462450593}"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
69 changes: 23 additions & 46 deletions docs/tutorials/txg_multiome.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[90m[\u001b[0m2024-09-30T16:24:31Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Cached version of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/10x_rna_atac.yaml is up-to-date\n"
"\u001b[90m[\u001b[0m2024-10-01T15:16:56Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Cached version of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/10x_rna_atac.yaml is up-to-date\n"
]
}
],
Expand All @@ -65,20 +65,20 @@
"\n",
"├── rna(153-1150)\n",
"│ ├── rna-illumina_p5(29)\n",
"│ ├── rna-truseq_read1(29)\n",
"│ ├── rna-truseq_read1(29) [↓rna-R1(28)]\n",
"│ ├── rna-cell_barcode(16)\n",
"│ ├── rna-umi(12)\n",
"│ ├── rna-cDNA(1-1000)\n",
"│ ├── rna-truseq_read2(34)\n",
"│ ├── rna-truseq_read2(34) [↓rna-I1(8), ↑rna-R2(1-98)]\n",
"│ ├── rna-index7(8)\n",
"│ └── rna-illumina_p7(24)\n",
"└── atac(153-1150)\n",
" ├── atac-illumina_p5(29)\n",
" ├── atac-cell_barcode(16)\n",
" ├── atac-linker(8)\n",
" ├── atac-nextera_read1(33)\n",
" ├── atac-nextera_read1(33) [↓atac-R1(1-98), ↑atac-I2(24)]\n",
" ├── atac-gDNA(1-1000)\n",
" ├── atac-nextera_read2(34)\n",
" ├── atac-nextera_read2(34) [↓atac-I1(8), ↑atac-R2(1-98)]\n",
" ├── atac-index7(8)\n",
" └── atac-illumina_p7(24)"
]
Expand Down Expand Up @@ -107,39 +107,16 @@
"metadata": {},
"outputs": [],
"source": [
"# Read 1\n",
"assay.add_read(\n",
" read_id=\"atac-R1\",\n",
" fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R1_001.fq.zst\",\n",
" modality=\"atac\",\n",
" is_reverse=False,\n",
" primer_id=\"atac-nextera_read1\",\n",
")\n",
"\n",
"# Index Read\n",
"assay.add_read(\n",
" read_id=\"atac-I1\",\n",
" fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R2_001.fq.zst\",\n",
" modality=\"atac\",\n",
" is_reverse=True,\n",
" primer_id=\"atac-nextera_read1\",\n",
")\n",
"\n",
"# Read 2\n",
"assay.add_read(\n",
" read_id=\"atac-R2\",\n",
" fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R3_001.fq.zst\",\n",
" modality=\"atac\",\n",
" is_reverse=True,\n",
" primer_id=\"atac-nextera_read2\",\n",
")"
"assay.update_read(\"atac-R1\", fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R1_001.fq.zst\")\n",
"assay.update_read(\"atac-I2\", fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R2_001.fq.zst\")\n",
"assay.update_read(\"atac-R2\", fastq=\"/data/kzhang/projects/workshop/data/PS_DNA01-1_S2_L001_R3_001.fq.zst\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can visualize the SeqSpec object again after adding the ATAC data. Note that read information is added to the tree."
"We can visualize the SeqSpec object again after adding the ATAC data. Note that read information has been updated, reflecting the real read length of the fastq files."
]
},
{
Expand All @@ -153,20 +130,20 @@
"\n",
"├── rna(153-1150)\n",
"│ ├── rna-illumina_p5(29)\n",
"│ ├── rna-truseq_read1(29)\n",
"│ ├── rna-truseq_read1(29) [↓rna-R1(28)]\n",
"│ ├── rna-cell_barcode(16)\n",
"│ ├── rna-umi(12)\n",
"│ ├── rna-cDNA(1-1000)\n",
"│ ├── rna-truseq_read2(34)\n",
"│ ├── rna-truseq_read2(34) [↓rna-I1(8), ↑rna-R2(1-98)]\n",
"│ ├── rna-index7(8)\n",
"│ └── rna-illumina_p7(24)\n",
"└── atac(153-1150)\n",
" ├── atac-illumina_p5(29)\n",
" ├── atac-cell_barcode(16)\n",
" ├── atac-linker(8)\n",
" ├── atac-nextera_read1(33) [↓atac-R1(150), ↑atac-I1(24)]\n",
" ├── atac-nextera_read1(33) [↓atac-R1(150), ↑atac-I2(24)]\n",
" ├── atac-gDNA(1-1000)\n",
" ├── atac-nextera_read2(34) [↑atac-R2(150)]\n",
" ├── atac-nextera_read2(34) [↓atac-I1(8), ↑atac-R2(150)]\n",
" ├── atac-index7(8)\n",
" └── atac-illumina_p7(24)"
]
Expand Down Expand Up @@ -215,14 +192,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[90m[\u001b[0m2024-09-30T16:24:39Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
"\u001b[90m[\u001b[0m2024-09-30T16:24:39Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Cached version of https://teichlab.github.io/scg_lib_structs/data/10X-Genomics/atac_737K-arc-v1.txt.gz is up-to-date\n",
"\u001b[90m[\u001b[0m2024-09-30T16:24:39Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-09-30T16:24:46Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 12387063 barcodes. 94.19% of them have an exact match in whitelist\n",
"\u001b[90m[\u001b[0m2024-09-30T16:24:46Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
"\u001b[90m[\u001b[0m2024-09-30T16:24:46Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-09-30T16:24:46Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"100%|██████████| 12387063/12387063 [05:29<00:00, 37567.71it/s]"
"\u001b[90m[\u001b[0m2024-10-01T15:17:03Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
"\u001b[90m[\u001b[0m2024-10-01T15:17:04Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Cached version of https://teichlab.github.io/scg_lib_structs/data/10X-Genomics/atac_737K-arc-v1.txt.gz is up-to-date\n",
"\u001b[90m[\u001b[0m2024-10-01T15:17:04Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-10-01T15:17:11Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 12387063 barcodes. 94.19% of them have an exact match in whitelist\n",
"\u001b[90m[\u001b[0m2024-10-01T15:17:11Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
"\u001b[90m[\u001b[0m2024-10-01T15:17:11Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-10-01T15:17:11Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (atac-R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"100%|██████████| 12387063/12387063 [05:33<00:00, 37127.51it/s]"
]
}
],
Expand All @@ -244,14 +221,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'frac_q30_bases_barcode': 0.9219054690365263, 'frac_nonnuclear': 7.443249461151526e-05, 'frac_valid_barcode': 0.9815396111249293, 'frac_unmapped': 0.008819422106194463, 'frac_confidently_mapped': 0.9292046064511015, 'frac_fragment_in_nucleosome_free_region': 4.518862917979508e-05, 'frac_duplicates': 0.33183332695206896, 'sequenced_reads': 24774126.0, 'frac_q30_bases_read1': 0.9173089515515771, 'frac_q30_bases_read2': 0.9229901653577338, 'sequenced_read_pairs': 12387063.0, 'frac_fragment_flanking_single_nucleosome': 2.912761037492815e-05}\n"
"{'frac_fragment_flanking_single_nucleosome': 2.912761037492815e-05, 'frac_confidently_mapped': 0.9292046064511015, 'frac_q30_bases_read1': 0.9173089515515771, 'frac_valid_barcode': 0.9815396111249293, 'frac_duplicates': 0.33183332695206896, 'frac_unmapped': 0.008819422106194463, 'frac_q30_bases_barcode': 0.9219054690365263, 'frac_nonnuclear': 7.443249461151526e-05, 'frac_q30_bases_read2': 0.9229901653577338, 'sequenced_read_pairs': 12387063.0, 'sequenced_reads': 24774126.0, 'frac_fragment_in_nucleosome_free_region': 4.518862917979508e-05}\n"
]
}
],
Expand Down
21 changes: 10 additions & 11 deletions precellar/src/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,14 @@ impl<A: Alinger> FastqProcessor<A> {

pub fn gen_raw_fastq_records(&self) -> FastqRecords<impl BufRead> {
let modality = self.modality();
let data = self.assay.get_index_of(modality)
.map(|(read, regions)| (read, regions, crate::io::read_fastq(read, self.base_dir.clone())));
let data = self.assay.get_index_of(modality).filter_map(|(read, regions)| {
let regions: Vec<_> = regions.into_iter().filter(|x| x.0.region_type.is_target()).collect();
if regions.is_empty() {
None
} else {
Some((read, regions, crate::io::read_fastq(read, self.base_dir.clone()).unwrap()))
}
});
FastqRecords::new(data)
}

Expand All @@ -220,7 +226,7 @@ impl<A: Alinger> FastqProcessor<A> {
.expect("No barcode region found");
let range = index.into_iter().find(|x| x.0.region_type.is_barcode()).unwrap().1;

crate::io::read_fastq(read, &self.base_dir).records().for_each(|record| {
crate::io::read_fastq(read, &self.base_dir).unwrap().records().for_each(|record| {
let mut record = record.unwrap();
record = slice_fastq_record(&record, range.start as usize, range.end as usize);
if read.is_reverse() {
Expand Down Expand Up @@ -273,14 +279,7 @@ impl<R: BufRead> FastqRecords<R> {
FastqRecord {
id: read.read_id.to_string(),
is_reverse: read.is_reverse(),
subregion: regions.into_iter().filter_map(|x| {
let region_type = x.0.region_type;
if region_type.is_barcode() || region_type.is_dna() {
Some((region_type, x.1))
} else {
None
}
}).collect(),
subregion: regions.into_iter().map(|x| (x.0.region_type, x.1)).collect(),
reader,
min_len: read.min_len as usize,
max_len: read.max_len as usize,
Expand Down
9 changes: 6 additions & 3 deletions precellar/src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,16 +84,19 @@ pub fn open_file_for_write<P: AsRef<Path>>(
Ok(writer)
}

pub fn read_fastq<P: AsRef<Path>>(read: &seqspec::Read, base_dir: P) -> fastq::Reader<impl BufRead> {
pub fn read_fastq<P: AsRef<Path>>(read: &seqspec::Read, base_dir: P) -> Option<fastq::Reader<impl BufRead>> {
if read.files.is_none() {
return None;
}
let base_dir = base_dir.as_ref().to_path_buf();
let reader = multi_reader::MultiReader::new(
read.files.clone().unwrap().into_iter().map(move |file| open_file(&file, &base_dir))
);
fastq::Reader::new(BufReader::new(reader))
Some(fastq::Reader::new(BufReader::new(reader)))
}

pub fn get_read_length<P: AsRef<Path>>(read: &seqspec::Read, base_dir: P) -> Result<usize> {
let mut reader = read_fastq(read, base_dir);
let mut reader = read_fastq(read, base_dir).unwrap();
let mut record = fastq::Record::default();
reader.read_record(&mut record)?;
Ok(record.sequence().len())
Expand Down
5 changes: 5 additions & 0 deletions seqspec/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,11 @@ pub enum RegionType {
}

impl RegionType {
/// Either a barcode, UMI, or DNA/cDNA region.
pub fn is_target(&self) -> bool {
self.is_barcode() || self.is_umi() || self.is_dna()
}

pub fn is_barcode(&self) -> bool {
match self {
RegionType::Barcode => true,
Expand Down

0 comments on commit 0ee24fb

Please sign in to comment.