Modified abundance utility so that datasets get listed in the same order as in the datasets file
mortazavilab · Nov 1, 2019 · cbb8f2c · cbb8f2c
1 parent 79fdd40
commit cbb8f2c
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 7 deletions.
diff --git a/src/talon/post/create_abundance_file_from_database.py b/src/talon/post/create_abundance_file_from_database.py
@@ -70,14 +70,24 @@ def fetch_dataset_list(dataset_file, database):
 
     conn = sqlite3.connect(database)
     cursor = conn.cursor()
+    all_db_datasets = qutils.fetch_all_datasets(cursor)
+    conn.close()
+
+    if dataset_file == None:
+
+        return all_db_datasets
 
-    if dataset_file != None:
-        datasets = qutils.parse_datasets(dataset_file, cursor)
     else:
-        datasets = qutils.fetch_all_datasets(cursor)
+        datasets = []
+        with open(dataset_file, 'r') as f:
+            for line in f:
+                dataset = line.strip()
+                if dataset not in all_db_datasets:
+                    raise ValueError("Dataset name '%s' not found in database" \
+                                      % (dataset))
+                datasets.append(dataset)
 
-    conn.close()
-    return datasets
+        return datasets
 
 def create_abundance_dict(database, datasets):
     """Process the abundance table by dataset in order to create a dictionary

diff --git a/testing_suite/input_files/chr11_and_Tcf3/testing_datasets2.txt b/testing_suite/input_files/chr11_and_Tcf3/testing_datasets2.txt
@@ -0,0 +1,3 @@
+D12
+PB65_B018
+PB65_B017
diff --git a/testing_suite/test_abundance_utility.py b/testing_suite/test_abundance_utility.py
@@ -84,14 +84,43 @@ def test_with_dataset_list(self):
         data = pd.read_csv(abd, sep="\t", header = 0)
 
         print(data)
-        assert set(list(data.columns)) == set(["gene_ID", "transcript_ID",
+        assert list(data.columns) == ["gene_ID", "transcript_ID",
                                                "annot_gene_id",
                                                "annot_transcript_id", "annot_gene_name",
                                                "annot_transcript_name", "n_exons",
                                                "length", "gene_novelty",
                                                "transcript_novelty",
                                                "ISM_subtype",
-                                               "PB65_B018", "D12"])
+                                               "D12", "PB65_B018"]
         assert data.shape[0] == 8
         assert set(data.transcript_ID) == set([1744, 8437, 8453, 8456, 8457, 8458, 8459, 8460]) 
 
+    def test_dataset_order(self):
+        """ Make sure datasets appear in same order as dataset file """
+
+        database =  "scratch/chr11_and_Tcf3.db"
+        datasets = "input_files/chr11_and_Tcf3/testing_datasets2.txt"
+
+        for i in range(10):
+            try:
+                subprocess.check_output(
+                    ["talon_abundance", "--db", database,
+                     "-a", "gencode_vM7",
+                     "-b", "mm10",
+                     "--datasets", datasets,
+                     "--o", "scratch/chr11_and_Tcf3_dset2"])
+            except:
+                pytest.fail("Talon abundance crashed on whitelist case")
+
+            # Now check the correctness of the abundance file
+            abd = "scratch/chr11_and_Tcf3_dset2_talon_abundance.tsv"
+            data = pd.read_csv(abd, sep="\t", header = 0)
+
+            assert list(data.columns) == ["gene_ID", "transcript_ID",
+                                                   "annot_gene_id",
+                                                   "annot_transcript_id", "annot_gene_name",
+                                                   "annot_transcript_name", "n_exons",
+                                                   "length", "gene_novelty",
+                                                   "transcript_novelty",
+                                                   "ISM_subtype",
+                                                   "D12", "PB65_B018", "PB65_B017"]