Skip to content

Commit

Permalink
Merge pull request #17 from hisplan/master
Browse files Browse the repository at this point in the history
v0.2.10
  • Loading branch information
hisplan authored Aug 26, 2021
2 parents e8eb418 + 0bfed70 commit 6df1e33
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 13 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ To process data locally using SEQC, you must install the <a href=https://github.
Once all dependencies have been installed, SEQC can be installed by running:

```bash
export SEQC_VERSION="0.2.9"
export SEQC_VERSION="0.2.10"
wget https://github.com/hisplan/seqc/archive/v${SEQC_VERSION}.tar.gz
tar xvzf v${SEQC_VERSION}.tar.gz
cd seqc-${SEQC_VERSION}
Expand Down Expand Up @@ -116,7 +116,6 @@ SEQC run ten_x_v3 \
--barcode-files ./whitelist/ \
--barcode-fastq ./barcode/ \
--genomic-fastq ./genomic/ \
--upload-prefix ./seqc-results/ \
--output-prefix PBMC \
--no-filter-low-coverage \
--min-poly-t 0 \
Expand Down
2 changes: 1 addition & 1 deletion docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

## Generating Reference Packages

This generates a reference package (STAR index and GTF) using SEQC v0.2.9.
This generates a reference package (STAR index and GTF) using SEQC v0.2.10.

- Ensembl 86
- Gene annotation file that contains only the reference chromosomes (no scaffolds, no patches)
Expand Down
6 changes: 3 additions & 3 deletions docs/install-SUSE.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ conda install -c bioconda star
## Install SEQC

```
wget https://github.com/dpeerlab/seqc/archive/v0.2.9.tar.gz
tar xvzf v0.2.9.tar.gz
cd seqc-0.2.9/
wget https://github.com/dpeerlab/seqc/archive/v0.2.10.tar.gz
tar xvzf v0.2.10.tar.gz
cd seqc-0.2.10/
pip install .
```
26 changes: 20 additions & 6 deletions src/seqc/core/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,22 @@ def merge_fastq_files(
:returns str merged_fastq: name of merged fastq file
"""

# hack:
# Due to the non-platform agnostic glob behavior,
# it is possible that L001_R1 is merged with L002_R2 (not L001_R2).
# to avoid this problem, we first sort.
# this is a temporary hacky solution
barcode_fastq = sorted(barcode_fastq)
genomic_fastq = sorted(genomic_fastq)

log.info("Merging genomic reads and barcode annotations.")
for bar_fq, gen_fq in zip(barcode_fastq, genomic_fastq):
log.info(
"Merge {} with {}".format(
os.path.basename(bar_fq), os.path.basename(gen_fq)
)
)

merged_fastq = fastq.merge_paired(
merge_function=technology_platform.merge_function,
fout=output_stem + "_merged.fastq",
Expand Down Expand Up @@ -285,7 +300,7 @@ def create_read_array(

if args.platform == "ten_x_v2" or args.platform == "ten_x_v3":
log.notify("Setting min_poly_t=0 for 10x v2 & v3")
args.min_poly_t = 0
args.min_poly_t = 0

max_insert_size = args.max_insert_size
if args.filter_mode == "scRNA-seq":
Expand Down Expand Up @@ -413,15 +428,11 @@ def create_read_array(
log.info("Resolving ambiguous alignments.")
mm_results = ra.resolve_ambiguous_alignments()



# 121319782799149 / 614086965 / pos=49492038 / AAACATAACG
# 121319782799149 / 512866590 / pos=49490848 / TCAATTAATC (1 hemming dist away from TCAATTAATT)
# ra.data["rmt"][91490] = 512866590
# ra.positions[91490] = 49492038



# correct errors
log.info("Identifying RMT errors.")
df_umi_correction = platform.apply_rmt_correction(ra, error_rate)
Expand Down Expand Up @@ -468,6 +479,10 @@ def create_read_array(
sparse_frame=True, genes_to_symbols=args.index + "annotations.gtf"
)

# generate 10x compatible count matrix
log.info("Creating 10x compatible counts matrix.")
ra.to_10x_count_matrix(genes_to_symbols=args.index + "annotations.gtf")

# Save sparse matrices
log.info("Saving sparse matrices")
scipy.io.mmwrite(args.output_prefix + "_sparse_read_counts.mtx", sp_reads.data)
Expand Down Expand Up @@ -663,4 +678,3 @@ def create_read_array(
)
email_body = email_body.replace("\n", "<br>").replace("\t", "&emsp;")
email_user(summary_archive, email_body, args.email)

81 changes: 81 additions & 0 deletions src/seqc/read_array.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os
import gzip
import numpy as np
import pandas as pd
from seqc.alignment import sam
Expand All @@ -9,7 +11,9 @@
from seqc import multialignment
from seqc.sparse_frame import SparseFrame
from seqc import log
from seqc.sequence.encodings import DNA3Bit
from scipy.stats import hypergeom
import scipy.io
from collections import OrderedDict


Expand Down Expand Up @@ -658,3 +662,80 @@ def to_count_matrix(
)

f.close()

def to_10x_count_matrix(self, genes_to_symbols):
"""
Convert the ReadArray into a 10x compatible count matrix.
"""

mols_mat = {}

for i, data, gene, pos in self.iter_active():
try:
rmt = data["rmt"]
if rmt not in mols_mat[data["cell"], gene]:
mols_mat[data["cell"], gene].append(rmt)
except KeyError:
mols_mat[data["cell"], gene] = [rmt]

# create a sparse frame without gene ID to gene name conversion
mols_mat1 = SparseFrame.from_dict(
{k: len(v) for k, v in mols_mat.items()},
genes_to_symbols=False,
)

# cell barcode in numeric format
cb_1234 = mols_mat1.index

# convert numeric cell barcode to ACGT format
dna3bit = DNA3Bit()
cb_acgt = list(map(lambda cb: dna3bit.decode(cb).decode(), cb_1234))

# gene IDs without any ENSG/ENSMUSG prefix
gene_ids = mols_mat1.columns

# fixmme: need to add the ENGS/ENSMUSG prefix, but
# no easy way to determine which species, so will use the "ENSX" prefix instead for the time being
# ENSG00000243485 ---> ENSX00000243485
# ENSMUSG00000051951 --> ENSX00000051951
# what about hybrid genome?
# SEQC's original way of handling genes without ENGS/ENSMUSG prefix can mess things up.
gene_ids = list(map(lambda id: "ENSX{0:011d}".format(id), gene_ids))

# create a sparse frame with gene ID to gene name conversion
mols_mat2 = SparseFrame.from_dict(
{k: len(v) for k, v in mols_mat.items()},
genes_to_symbols=genes_to_symbols,
)

# gene names (e.g. Xkr4)
gene_names = mols_mat2.columns.tolist()

# directory to store 10x compatible count matrix
path_out = "raw_feature_bc_matrix/"
os.makedirs(path_out, exist_ok=True)

# create .mtx
path_mtx = os.path.join(path_out, "matrix.mtx")
path_mtx_gz = path_mtx + ".gz"

# transpose to make it 10x compatiable (features x barcodes)
scipy.io.mmwrite(path_mtx, mols_mat1.data.T, comment="SEQC")

# gzip to .mtx.gz
with open(path_mtx, "rb") as fin:
with gzip.open(path_mtx_gz, "wb") as fout:
fout.write(fin.read())

# remove .mtx
os.remove(path_mtx)

# generate barcodes file: barcodes.tsv.gz
with gzip.open(os.path.join(path_out, "barcodes.tsv.gz"), "wt") as fout:
for cb in cb_acgt:
fout.write(cb + "\n")

# generate features file: features.tsv.gz
with gzip.open(os.path.join(path_out, "features.tsv.gz"), "wt") as fout:
for (id, name) in zip(gene_ids, gene_names):
fout.write(f"{id}\t{name}\tGene Expression\n")
2 changes: 1 addition & 1 deletion src/seqc/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.9"
__version__ = "0.2.10"

0 comments on commit 6df1e33

Please sign in to comment.