Skip to content

Commit

Permalink
Deploying to gh-pages from @ 6b8ae88 🚀
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Oct 18, 2024
0 parents commit 132987d
Show file tree
Hide file tree
Showing 290 changed files with 85,908 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .buildinfo
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: e0eb6db499003519cf5a02c49f3f533e
tags: 645f666f9bcd5a90fca523b33c5a78b7
Binary file not shown.
Binary file added .doctrees/_autosummary/precellar.SeqSpec.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added .doctrees/_autosummary/precellar.align.doctree
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added .doctrees/api.doctree
Binary file not shown.
Binary file added .doctrees/environment.pickle
Binary file not shown.
Binary file added .doctrees/index.doctree
Binary file not shown.
Binary file added .doctrees/install.doctree
Binary file not shown.
246 changes: 246 additions & 0 deletions .doctrees/nbsphinx/tutorials/generic.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Processing barcoded Fastq files\n",
"\n",
"You will likely encounter barcoded fastq files when working with single-cell ATAC-seq data.\n",
"In the early days of single-cell ATAC-seq, cell barcodes were usually added to the read names of the fastq files.\n",
"This notebook demonstrates how to process these barcoded fastq files."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import precellar"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extracting cell barcodes from read names"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"@CCAGCACAAGCCATCCTATCGT:A00953:155:HVCHLDRXX:1:1101:1036:1031 1:N:0:1\n",
"ANCTTGGATCATCAGGTTTGTCTGTAGCTGATTTATTTCTTTAAGTTTCCC\n",
"+\n",
"F#FFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF\n",
"@TAACCACTACGAATGACTGACA:A00953:155:HVCHLDRXX:1:1101:1127:1031 1:N:0:1\n",
"TNCCAGGACCAGTGACCGTCACCCGCAGTAAGGATCGGGGCGGCTCCGCCA\n",
"+\n",
"F#:FFFFFFFFF:FFFFF:FF,F,FFFFFFFF,FFF:FFFF:FFFFFF,FF\n",
"@CGATATGTAGGGGACTAATTCC:A00953:155:HVCHLDRXX:1:1101:1145:1031 1:N:0:1\n",
"GNCGGATCACAAGGTCAGGAGTTCGAGACCTGGCTGGCCAACACGGTGAAA\n",
"\n",
"gzip: stdout: Broken pipe\n"
]
}
],
"source": [
"!zcat R1.fq.gz | head"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"precellar.utils.strip_barcode_from_fastq(\n",
" 'R1.fq.gz',\n",
" 'R1_processed.fq.zst',\n",
" out_barcode='I1.fq.zst',\n",
" regex=\"^([ACTG]+):\",\n",
" right_add=1,\n",
")\n",
"\n",
"precellar.utils.strip_barcode_from_fastq(\n",
" 'R2.fq.gz',\n",
" 'R2_processed.fq.zst',\n",
" regex=\"^([ACTG]+):\",\n",
" right_add=1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Starting download of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Downloaded 2643 bytes\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m New version of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml cached\n"
]
}
],
"source": [
"assay = precellar.SeqSpec(\"https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\n",
"└── atac(153-1150)\n",
" ├── atac-illumina_p5(29)\n",
" ├── atac-read1(34) [↓R1(1-98)]\n",
" ├── gDNA(1-1000)\n",
" ├── atac-read2(34) [↑R2(1-98), ↓I1(22)]\n",
" ├── atac-cell_barcode(22)\n",
" └── atac-illumina_p7(24)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"assay"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"assay.update_read(\"R1\", fastq=\"R1_processed.fq.zst\")\n",
"assay.update_read(\"I1\", fastq=\"I1.fq.zst\")\n",
"assay.update_read(\"R2\", fastq=\"R2_processed.fq.zst\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\n",
"└── atac(153-1150)\n",
" ├── atac-illumina_p5(29)\n",
" ├── atac-read1(34) [↓R1(51)]\n",
" ├── gDNA(1-1000)\n",
" ├── atac-read2(34) [↑R2(51), ↓I1(22)]\n",
" ├── atac-cell_barcode(22)\n",
" └── atac-illumina_p7(24)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"assay"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 2500 barcodes. 100.00% of them have an exact match in whitelist\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
"100%|██████████| 2500/2500 [00:00<00:00, 15545.42it/s]"
]
}
],
"source": [
"qc = precellar.align(\n",
" assay, \"/data/kzhang/GRCh38/hg38.fa.gz\",\n",
" modality=\"atac\",\n",
" output_fragment=\"atac_fragments.tsv.zst\",\n",
" num_threads=32,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'frac_q30_bases_read1': 0.8179764705882353,\n",
" 'frac_valid_barcode': 1.0,\n",
" 'sequenced_read_pairs': 2500.0,\n",
" 'frac_q30_bases_barcode': 1.0,\n",
" 'frac_unmapped': 0.07640000000000002,\n",
" 'sequenced_reads': 5000.0,\n",
" 'frac_fragment_flanking_single_nucleosome': 0.0029791459781529296,\n",
" 'frac_confidently_mapped': 0.8524,\n",
" 'frac_fragment_in_nucleosome_free_region': 0.010427010923535254,\n",
" 'frac_q30_bases_read2': 0.9442745098039216,\n",
" 'frac_nonnuclear': 0.0128,\n",
" 'frac_duplicates': 0.004940711462450593}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"qc"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 132987d

Please sign in to comment.