Skip to content

Commit

Permalink
Merge pull request #123 from monarch-initiative/io-misc-refactor-and-…
Browse files Browse the repository at this point in the history
…upgrade

IO misc refactoring & upgrades
  • Loading branch information
joeflack4 authored Sep 6, 2024
2 parents dc3c79a + 06494d1 commit db4ddd2
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 30 deletions.
10 changes: 5 additions & 5 deletions omim2obo/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def omim2obo(use_cache: bool = False):

# Parse mimTitles.txt
# - Get id's, titles, and type
omim_type_and_titles, omim_replaced = parse_mim_titles(get_mim_file('mimTitles.txt', download_files_tf))
omim_type_and_titles, omim_replaced = parse_mim_titles(get_mim_file('mimTitles', download_files_tf))
omim_ids = list(omim_type_and_titles.keys())

if CONFIG['verbose']:
Expand Down Expand Up @@ -216,8 +216,8 @@ def omim2obo(use_cache: bool = False):

# Gene ID
# Why is 'skos:exactMatch' appropriate for disease::gene relationships? - joeflack4 2022/06/06
get_mim_file('genemap2.txt', download_files_tf) # dl latest file
mim2gene_lines: List[str] = get_mim_file('mim2gene.txt', download_files_tf) # dl latest file & return
get_mim_file('genemap2', download_files_tf) # dl latest file
mim2gene_lines: List[str] = get_mim_file('mim2gene', download_files_tf) # dl latest file & return
gene_map, pheno_map, hgnc_map = parse_mim2gene(mim2gene_lines)
for mim_number, entrez_id in gene_map.items():
graph.add((OMIM[mim_number], SKOS.exactMatch, NCBIGENE[entrez_id]))
Expand All @@ -235,7 +235,7 @@ def omim2obo(use_cache: bool = False):
graph.add((OMIM[mim_number], SKOS.exactMatch, HGNC[hgnc_symbol_id_map[hgnc_symbol]]))

# Phenotypic Series
pheno_series = parse_phenotypic_series_titles(get_mim_file('phenotypicSeries.txt', download_files_tf))
pheno_series = parse_phenotypic_series_titles(get_mim_file('phenotypicSeries', download_files_tf))
for ps_id in pheno_series:
graph.add((OMIMPS[ps_id], RDF.type, OWL.Class))
graph.add((OMIMPS[ps_id], RDFS.label, Literal(pheno_series[ps_id][0])))
Expand All @@ -245,7 +245,7 @@ def omim2obo(use_cache: bool = False):
graph.add((OMIM[mim_number], RDFS.subClassOf, OMIMPS[ps_id]))

# Morbid map
morbid_map: Dict = parse_morbid_map(get_mim_file('morbidmap.txt', download_files_tf))
morbid_map: Dict = parse_morbid_map(get_mim_file('morbidmap', download_files_tf))
for mim_number, mim_data in morbid_map.items():
# todo?: unused `mim_data` keys. Should they be used?
# - phenotype_label: Similar to p_lab in 'assocs', but has more info
Expand Down
71 changes: 46 additions & 25 deletions omim2obo/parsers/omim_txt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,49 @@
}


def get_mim_file(file_name: str, download=False, return_df=False) -> Union[List[str], pd.DataFrame]:
def convert_txt_to_tsv(file_name: str):
    """Convert an OMIM text file in `DATA_DIR` to TSV, saving the TSV next to it.

    The leading block of '#' comment lines is dropped, except that its last line is used as the column header. The
    trailing block of '#' comment lines is dropped as well. Every line in between is split on tabs and written out.

    todo: Preserve comments at top and bottom. There's a func that does this somewhere for the ones at top.
    todo: If we refactor codebase to use pandas instead of lines, consider:
     - This converts 'NULL' to `nan`. Is this what we want?

    :param file_name: Name of the file inside `DATA_DIR`, with or without the '.txt' extension.
    """
    file_name = file_name if file_name.endswith('.txt') else file_name + '.txt'
    mim_file_path: PosixPath = DATA_DIR / file_name
    # Derived the same way as in `get_mim_file()`, which reads this path back; keep the two in sync.
    mim_file_tsv_path: str = str(mim_file_path).replace('.txt', '.tsv')
    with open(mim_file_path, 'r') as file:
        lines = file.readlines()

    # Find the header (last comment line at the top) and start of data
    header = None  # stays None if the file has no leading comments; pandas then numbers the columns
    data_start = 0
    for i, line in enumerate(lines):
        if not line.startswith('#'):
            break
        header = line.strip('#').strip().split('\t')
        data_start = i + 1

    # Find the end of data (first line of the comment block at the bottom)
    data_end = len(lines)
    while data_end > data_start and lines[data_end - 1].startswith('#'):
        data_end -= 1

    # Create DataFrame
    # - Only the line terminator is removed. A bare `.strip()` would also eat leading/trailing tabs, i.e. empty
    #   first/last fields, shifting the remaining values into the wrong columns.
    data = [line.rstrip('\r\n').split('\t') for line in lines[data_start:data_end]]
    df = pd.DataFrame(data, columns=header)
    df.to_csv(mim_file_tsv_path, sep='\t', index=False)


def get_mim_file(file_name: str, download=False, return_df=False) -> Union[List[str], pd.DataFrame]:
"""Retrieve OMIM downloadable text file from the OMIM download server
:param return_df: If False, returns List[str] of each line in the file, else a DataFrame.
"""
file_headers = {
'mim2gene.txt': '# MIM Number MIM Entry Type (see FAQ 1.3 at https://omim.org/help/faq) '
'Entrez Gene ID (NCBI) Approved Gene Symbol (HGNC) Ensembl Gene ID (Ensembl)',
'genemap2.txt': '# Chromosome Genomic Position Start Genomic Position End Cyto Location '
'Computed Cyto Location MIM Number Gene Symbols Gene Name Approved Gene Symbol '
'Entrez Gene ID Ensembl Gene ID Comments Phenotypes Mouse Gene Symbol/ID'
}
file_name = file_name if file_name.endswith('.txt') else file_name + '.txt'
mim_file_path: PosixPath = DATA_DIR / file_name
mim_file_tsv_path: str = str(mim_file_path).replace('.txt', '.tsv')

Expand All @@ -82,22 +113,10 @@ def get_mim_file(file_name: str, download=False, return_df=False) -> Union[List[
if resp.status_code == 200:
text = resp.text
if not text.startswith('<!DOCTYPE html>'):
# Save file
# Save
with open(mim_file_path, 'w') as fout:
fout.write(text)
# TODO: mim2gene.txt & genemap2.txt: Need to uncomment out the first line
# modify 'text'. what's the nature of it? how to edit just that one line?

# todo: This is brittle in that it depends on these headers not changing. Would be better, eventually,
# to read the lines into a list, then find the first line w/out a comment, get its index, then -1 to
# get index of prev line, then use that to remove the leading '# ' from that line.
# todo: also would be good to do this, because the other TSVs won't have their headers. It doesn't
# matter that much atm, because these files aren't used for anything programmatic.
if file_name in file_headers:
header = file_headers[file_name]
text = text.replace(header, header[2:]) # removes leading comment
with open(mim_file_tsv_path, 'w') as fout:
fout.write(text)
convert_txt_to_tsv(file_name)
LOG.info(f'{file_name} retrieved and updated')
else:
raise RuntimeError('Unexpected response: ' + text)
Expand All @@ -110,11 +129,13 @@ def get_mim_file(file_name: str, download=False, return_df=False) -> Union[List[
raise RuntimeError(msg)

if return_df:
df = pd.read_csv(mim_file_tsv_path, comment='#', sep='\t')
return df
return pd.read_csv(mim_file_tsv_path, comment='#', sep='\t')
else:
with open(mim_file_path, 'r') as fin:
lines: List[str] = fin.readlines()
# Remove comments
# - OMIM files always have comments at the top, and sometimes also at the bottom.
lines = [x for x in lines if not x.startswith('#')]
return lines


Expand Down

0 comments on commit db4ddd2

Please sign in to comment.