Skip to content

Commit

Permalink
Modified abundance utility so that datasets get listed in the same order as the datasets file
Browse files Browse the repository at this point in the history
  • Loading branch information
Dana Elizabeth Wyman committed Nov 1, 2019
1 parent 79fdd40 commit cbb8f2c
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 7 deletions.
20 changes: 15 additions & 5 deletions src/talon/post/create_abundance_file_from_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,24 @@ def fetch_dataset_list(dataset_file, database):

conn = sqlite3.connect(database)
cursor = conn.cursor()
all_db_datasets = qutils.fetch_all_datasets(cursor)
conn.close()

if dataset_file == None:

return all_db_datasets

if dataset_file != None:
datasets = qutils.parse_datasets(dataset_file, cursor)
else:
datasets = qutils.fetch_all_datasets(cursor)
datasets = []
with open(dataset_file, 'r') as f:
for line in f:
dataset = line.strip()
if dataset not in all_db_datasets:
raise ValueError("Dataset name '%s' not found in database" \
% (dataset))
datasets.append(dataset)

conn.close()
return datasets
return datasets

def create_abundance_dict(database, datasets):
"""Process the abundance table by dataset in order to create a dictionary
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
D12
PB65_B018
PB65_B017
33 changes: 31 additions & 2 deletions testing_suite/test_abundance_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,43 @@ def test_with_dataset_list(self):
data = pd.read_csv(abd, sep="\t", header = 0)

print(data)
assert set(list(data.columns)) == set(["gene_ID", "transcript_ID",
assert list(data.columns) == ["gene_ID", "transcript_ID",
"annot_gene_id",
"annot_transcript_id", "annot_gene_name",
"annot_transcript_name", "n_exons",
"length", "gene_novelty",
"transcript_novelty",
"ISM_subtype",
"PB65_B018", "D12"])
"D12", "PB65_B018"]
assert data.shape[0] == 8
assert set(data.transcript_ID) == set([1744, 8437, 8453, 8456, 8457, 8458, 8459, 8460])

def test_dataset_order(self):
    """ Make sure datasets appear in same order as dataset file.

        Runs the talon_abundance command-line tool ten times against the
        chr11_and_Tcf3 scratch database with the testing_datasets2.txt
        dataset file, and after every run checks that the columns of the
        resulting abundance TSV are exactly the fixed annotation columns
        followed by D12, PB65_B018, PB65_B017, in that order.
    """

    database = "scratch/chr11_and_Tcf3.db"
    datasets = "input_files/chr11_and_Tcf3/testing_datasets2.txt"
    abd = "scratch/chr11_and_Tcf3_dset2_talon_abundance.tsv"

    # Compared as a list (not a set) because column order is the thing
    # under test here.
    expected_columns = ["gene_ID", "transcript_ID",
                        "annot_gene_id",
                        "annot_transcript_id", "annot_gene_name",
                        "annot_transcript_name", "n_exons",
                        "length", "gene_novelty",
                        "transcript_novelty",
                        "ISM_subtype",
                        "D12", "PB65_B018", "PB65_B017"]

    # NOTE(review): the repetition is presumably there to catch an
    # ordering that is only correct by chance on a single run -- confirm.
    for _ in range(10):
        try:
            subprocess.check_output(
                ["talon_abundance", "--db", database,
                 "-a", "gencode_vM7",
                 "-b", "mm10",
                 "--datasets", datasets,
                 "--o", "scratch/chr11_and_Tcf3_dset2"])
        except (subprocess.CalledProcessError, OSError):
            # CalledProcessError: the tool exited with a non-zero status.
            # OSError: the executable could not be launched at all.
            # Anything else (e.g. KeyboardInterrupt) is left to propagate.
            pytest.fail("Talon abundance crashed on whitelist case")

        # Now check the correctness of the abundance file written by
        # this run, so every iteration is verified.
        data = pd.read_csv(abd, sep="\t", header=0)
        assert list(data.columns) == expected_columns

0 comments on commit cbb8f2c

Please sign in to comment.