-
Notifications
You must be signed in to change notification settings - Fork 0
/
cyp_drugs.py
43 lines (34 loc) · 1.6 KB
/
cyp_drugs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import chembl_downloader
from textwrap import dedent
from typing import Optional
def chembl_drugs(*drug: str, file_name: Optional[str] = None):
    """
    Look up small molecules/drugs in ChEMBL by their preferred (generic) names.

    For every name that matches ``MOLECULE_DICTIONARY.pref_name`` exactly, the
    result holds the ChEMBL ID, preferred name, max phase and canonical SMILES.
    The query itself does not filter on ``max_phase``; inspect that column if
    only approved drugs are wanted. Molecules without a row in
    ``COMPOUND_STRUCTURES`` are dropped by the inner join.

    :param drug: One or more generic drug names to search for. Enter them in
        capital letters, e.g. "AMLODIPINE".
    :param file_name: Optional file name (without extension). When given, the
        dataframe is also written to ``<file_name>.tsv`` (tab-separated, no
        index column), relative to the current working directory.
    :return: The dataframe returned by ``chembl_downloader.query`` with the
        columns chembl_id, pref_name, max_phase and canonical_smiles.
    """
    # Render each name as a SQL string literal, doubling embedded single
    # quotes. Interpolating repr() of the argument tuple instead produced
    # invalid SQL for a single name ("IN ('X',)" has a trailing comma) and
    # mis-quoted names that contain an apostrophe.
    name_literals = ", ".join(
        "'" + str(name).replace("'", "''") + "'" for name in drug
    )

    # dedent to remove leading whitespaces from every line
    # https://docs.python.org/3/library/textwrap.html#textwrap.dedent
    # The names are substituted after dedent/strip so that their content can
    # never influence the whitespace handling of the template.
    sql = dedent(
        """\
        SELECT
            MOLECULE_DICTIONARY.chembl_id,
            MOLECULE_DICTIONARY.pref_name,
            MOLECULE_DICTIONARY.max_phase,
            COMPOUND_STRUCTURES.canonical_smiles
        FROM MOLECULE_DICTIONARY
        JOIN COMPOUND_STRUCTURES ON MOLECULE_DICTIONARY.molregno == COMPOUND_STRUCTURES.molregno
        WHERE molecule_dictionary.pref_name IN ({names})
        """
    ).strip().format(names=name_literals)

    # default query uses the latest ChEMBL version (this script uses version 34)
    df = chembl_downloader.query(sql, version="34")

    if file_name is not None:
        # save df as a .tsv file if a file name is added
        df.to_csv(f"{file_name}.tsv", sep='\t', index=False)
    return df