Merge pull request #17 from hisplan/master

v0.2.10
dpeerlab · Aug 26, 2021 · 6df1e33 · 6df1e33
2 parents e8eb418 + 0bfed70
commit 6df1e33
Show file tree

Hide file tree

Showing 6 changed files with 107 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -38,7 +38,7 @@ To process data locally using SEQC, you must install the <a href=https://github.
 Once all dependencies have been installed, SEQC can be installed by running:
 
 ```bash
-export SEQC_VERSION="0.2.9"
+export SEQC_VERSION="0.2.10"
 wget https://github.com/hisplan/seqc/archive/v${SEQC_VERSION}.tar.gz
 tar xvzf v${SEQC_VERSION}.tar.gz
 cd seqc-${SEQC_VERSION}
@@ -116,7 +116,6 @@ SEQC run ten_x_v3 \
   --barcode-files ./whitelist/ \
   --barcode-fastq ./barcode/ \
   --genomic-fastq ./genomic/ \
-  --upload-prefix ./seqc-results/ \
   --output-prefix PBMC \
   --no-filter-low-coverage \
   --min-poly-t 0 \

diff --git a/docs/README.md b/docs/README.md
@@ -8,7 +8,7 @@
 
 ## Generating Reference Packages
 
-This generates a reference package (STAR index and GTF) using SEQC v0.2.9.
+This generates a reference package (STAR index and GTF) using SEQC v0.2.10.
 
 - Ensembl 86
 - Gene annotation file that contains only the reference chromosomes (no scaffolds, no patches)

diff --git a/docs/install-SUSE.md b/docs/install-SUSE.md
@@ -43,8 +43,8 @@ conda install -c bioconda star
 ## Install SEQC
 
 ```
-wget https://github.com/dpeerlab/seqc/archive/v0.2.9.tar.gz
-tar xvzf v0.2.9.tar.gz
-cd seqc-0.2.9/
+wget https://github.com/dpeerlab/seqc/archive/v0.2.10.tar.gz
+tar xvzf v0.2.10.tar.gz
+cd seqc-0.2.10/
 pip install .
 ```
diff --git a/src/seqc/core/run.py b/src/seqc/core/run.py
@@ -137,7 +137,22 @@ def merge_fastq_files(
         :returns str merged_fastq: name of merged fastq file
         """
 
+        # hack:
+        # Due to the non-platform agnostic glob behavior,
+        # it is possible that L001_R1 is merged with L002_R2 (not L001_R2).
+        # to avoid this problem, we first sort.
+        # this is a temporary hacky solution
+        barcode_fastq = sorted(barcode_fastq)
+        genomic_fastq = sorted(genomic_fastq)
+
         log.info("Merging genomic reads and barcode annotations.")
+        for bar_fq, gen_fq in zip(barcode_fastq, genomic_fastq):
+            log.info(
+                "Merge {} with {}".format(
+                    os.path.basename(bar_fq), os.path.basename(gen_fq)
+                )
+            )
+
         merged_fastq = fastq.merge_paired(
             merge_function=technology_platform.merge_function,
             fout=output_stem + "_merged.fastq",
@@ -285,7 +300,7 @@ def create_read_array(
 
         if args.platform == "ten_x_v2" or args.platform == "ten_x_v3":
             log.notify("Setting min_poly_t=0 for 10x v2 & v3")
-            args.min_poly_t = 0            
+            args.min_poly_t = 0
 
         max_insert_size = args.max_insert_size
         if args.filter_mode == "scRNA-seq":
@@ -413,15 +428,11 @@ def create_read_array(
             log.info("Resolving ambiguous alignments.")
             mm_results = ra.resolve_ambiguous_alignments()
 
-
-
             # 121319782799149 / 614086965 / pos=49492038 / AAACATAACG
             # 121319782799149 / 512866590 / pos=49490848 / TCAATTAATC (1 hemming dist away from TCAATTAATT)
             # ra.data["rmt"][91490] = 512866590
             # ra.positions[91490] = 49492038
 
-
-
             # correct errors
             log.info("Identifying RMT errors.")
             df_umi_correction = platform.apply_rmt_correction(ra, error_rate)
@@ -468,6 +479,10 @@ def create_read_array(
             sparse_frame=True, genes_to_symbols=args.index + "annotations.gtf"
         )
 
+        # generate 10x compatible count matrix
+        log.info("Creating 10x compatible counts matrix.")
+        ra.to_10x_count_matrix(genes_to_symbols=args.index + "annotations.gtf")
+
         # Save sparse matrices
         log.info("Saving sparse matrices")
         scipy.io.mmwrite(args.output_prefix + "_sparse_read_counts.mtx", sp_reads.data)
@@ -663,4 +678,3 @@ def create_read_array(
             )
             email_body = email_body.replace("\n", "<br>").replace("\t", "&emsp;")
             email_user(summary_archive, email_body, args.email)
-
diff --git a/src/seqc/read_array.py b/src/seqc/read_array.py
@@ -1,3 +1,5 @@
+import os
+import gzip
 import numpy as np
 import pandas as pd
 from seqc.alignment import sam
@@ -9,7 +11,9 @@
 from seqc import multialignment
 from seqc.sparse_frame import SparseFrame
 from seqc import log
+from seqc.sequence.encodings import DNA3Bit
 from scipy.stats import hypergeom
+import scipy.io
 from collections import OrderedDict
 
 
@@ -658,3 +662,80 @@ def to_count_matrix(
             )
 
         f.close()
+
+    def to_10x_count_matrix(self, genes_to_symbols):
+        """
+        Convert the ReadArray into a 10x compatible count matrix.
+        """
+
+        mols_mat = {}
+
+        for i, data, gene, pos in self.iter_active():
+            try:
+                rmt = data["rmt"]
+                if rmt not in mols_mat[data["cell"], gene]:
+                    mols_mat[data["cell"], gene].append(rmt)
+            except KeyError:
+                mols_mat[data["cell"], gene] = [rmt]
+
+        # create a sparse frame without gene ID to gene name conversion
+        mols_mat1 = SparseFrame.from_dict(
+            {k: len(v) for k, v in mols_mat.items()},
+            genes_to_symbols=False,
+        )
+
+        # cell barcode in numeric format
+        cb_1234 = mols_mat1.index
+
+        # convert numeric cell barcode to ACGT format
+        dna3bit = DNA3Bit()
+        cb_acgt = list(map(lambda cb: dna3bit.decode(cb).decode(), cb_1234))
+
+        # gene IDs without any ENSG/ENSMUSG prefix
+        gene_ids = mols_mat1.columns
+
+        # fixmme: need to add the ENGS/ENSMUSG prefix, but
+        # no easy way to determine which species, so will use the "ENSX" prefix instead for the time being
+        # ENSG00000243485 ---> ENSX00000243485
+        # ENSMUSG00000051951 --> ENSX00000051951
+        # what about hybrid genome?
+        # SEQC's original way of handling genes without ENGS/ENSMUSG prefix can mess things up.
+        gene_ids = list(map(lambda id: "ENSX{0:011d}".format(id), gene_ids))
+
+        # create a sparse frame with gene ID to gene name conversion
+        mols_mat2 = SparseFrame.from_dict(
+            {k: len(v) for k, v in mols_mat.items()},
+            genes_to_symbols=genes_to_symbols,
+        )
+
+        # gene names (e.g. Xkr4)
+        gene_names = mols_mat2.columns.tolist()
+
+        # directory to store 10x compatible count matrix
+        path_out = "raw_feature_bc_matrix/"
+        os.makedirs(path_out, exist_ok=True)
+
+        # create .mtx
+        path_mtx = os.path.join(path_out, "matrix.mtx")
+        path_mtx_gz = path_mtx + ".gz"
+
+        # transpose to make it 10x compatiable (features x barcodes)
+        scipy.io.mmwrite(path_mtx, mols_mat1.data.T, comment="SEQC")
+
+        # gzip to .mtx.gz
+        with open(path_mtx, "rb") as fin:
+            with gzip.open(path_mtx_gz, "wb") as fout:
+                fout.write(fin.read())
+
+        # remove .mtx
+        os.remove(path_mtx)
+
+        # generate barcodes file: barcodes.tsv.gz
+        with gzip.open(os.path.join(path_out, "barcodes.tsv.gz"), "wt") as fout:
+            for cb in cb_acgt:
+                fout.write(cb + "\n")
+
+        # generate features file: features.tsv.gz
+        with gzip.open(os.path.join(path_out, "features.tsv.gz"), "wt") as fout:
+            for (id, name) in zip(gene_ids, gene_names):
+                fout.write(f"{id}\t{name}\tGene Expression\n")
diff --git a/src/seqc/version.py b/src/seqc/version.py
@@ -1 +1 @@
-__version__ = "0.2.9"
+__version__ = "0.2.10"