diff --git a/omim2obo/main.py b/omim2obo/main.py index 0b0064b..df495f2 100644 --- a/omim2obo/main.py +++ b/omim2obo/main.py @@ -129,7 +129,7 @@ def omim2obo(use_cache: bool = False): # Parse mimTitles.txt # - Get id's, titles, and type - omim_type_and_titles, omim_replaced = parse_mim_titles(get_mim_file('mimTitles.txt', download_files_tf)) + omim_type_and_titles, omim_replaced = parse_mim_titles(get_mim_file('mimTitles', download_files_tf)) omim_ids = list(omim_type_and_titles.keys()) if CONFIG['verbose']: @@ -216,8 +216,8 @@ def omim2obo(use_cache: bool = False): # Gene ID # Why is 'skos:exactMatch' appropriate for disease::gene relationships? - joeflack4 2022/06/06 - get_mim_file('genemap2.txt', download_files_tf) # dl latest file - mim2gene_lines: List[str] = get_mim_file('mim2gene.txt', download_files_tf) # dl latest file & return + get_mim_file('genemap2', download_files_tf) # dl latest file + mim2gene_lines: List[str] = get_mim_file('mim2gene', download_files_tf) # dl latest file & return gene_map, pheno_map, hgnc_map = parse_mim2gene(mim2gene_lines) for mim_number, entrez_id in gene_map.items(): graph.add((OMIM[mim_number], SKOS.exactMatch, NCBIGENE[entrez_id])) @@ -235,7 +235,7 @@ def omim2obo(use_cache: bool = False): graph.add((OMIM[mim_number], SKOS.exactMatch, HGNC[hgnc_symbol_id_map[hgnc_symbol]])) # Phenotypic Series - pheno_series = parse_phenotypic_series_titles(get_mim_file('phenotypicSeries.txt', download_files_tf)) + pheno_series = parse_phenotypic_series_titles(get_mim_file('phenotypicSeries', download_files_tf)) for ps_id in pheno_series: graph.add((OMIMPS[ps_id], RDF.type, OWL.Class)) graph.add((OMIMPS[ps_id], RDFS.label, Literal(pheno_series[ps_id][0]))) @@ -245,7 +245,7 @@ def omim2obo(use_cache: bool = False): graph.add((OMIM[mim_number], RDFS.subClassOf, OMIMPS[ps_id])) # Morbid map - morbid_map: Dict = parse_morbid_map(get_mim_file('morbidmap.txt', download_files_tf)) + morbid_map: Dict = parse_morbid_map(get_mim_file('morbidmap', 
def convert_txt_to_tsv(file_name: str, data_dir: Union[str, 'PosixPath', None] = None) -> None:
    """Convert OMIM text file to TSV, saving the TSV.

    The source file is expected to start with a block of '#' comment lines, the last of which holds the
    tab-separated column names, and may end with another block of '#' comment lines. Neither block is copied to
    the TSV; the column names become its header row. If there is no leading comment block, pandas' default
    integer column labels are written instead. The TSV is written next to the source file, with the '.txt'
    extension swapped for '.tsv'.

    todo: Preserve comments at top and bottom. There's a func that does this somewhere for the ones at top.
    todo: If we refactor codebase to use pandas instead of lines, consider:
      - This converts 'NULL' to `nan`. Is this what we want?

    :param file_name: Name of the OMIM file, with or without the '.txt' extension.
    :param data_dir: Directory that holds the file and receives the TSV. Defaults to `DATA_DIR`.
    :raises ValueError: If a data row is still wider than the header after its trailing empty fields are dropped.
    """
    from pathlib import Path  # function-scope import: keeps this block independent of the file's import list

    file_name = file_name if file_name.endswith('.txt') else file_name + '.txt'
    base_dir = DATA_DIR if data_dir is None else Path(data_dir)
    mim_file_path = base_dir / file_name
    # Swap only the file's own extension; str.replace() would also rewrite a '.txt' inside a directory name.
    mim_file_tsv_path = mim_file_path.with_suffix('.tsv')
    with open(mim_file_path, 'r') as file:
        # Remove just the line terminator. A bare strip() would also eat leading/trailing tabs, i.e. empty fields.
        lines = [line.rstrip('\r\n') for line in file]

    # Find the header (last comment line at the top) and start of data
    data_start = 0
    while data_start < len(lines) and lines[data_start].startswith('#'):
        data_start += 1
    header = lines[data_start - 1].lstrip('#').strip().split('\t') if data_start else None

    # Find the end of data: walk up from the bottom past comment lines (and blank lines, which would otherwise
    # stop the scan early and leave the trailing comments in the data).
    data_end = len(lines)
    while data_end > data_start and (lines[data_end - 1].startswith('#') or not lines[data_end - 1].strip()):
        data_end -= 1

    # Create DataFrame. Blank lines carry no record, so they are not turned into rows.
    rows = [line.split('\t') for line in lines[data_start:data_end] if line.strip()]
    if header is not None:
        rows = [_fit_row_to_header(row, len(header)) for row in rows]
    pd.DataFrame(rows, columns=header).to_csv(mim_file_tsv_path, sep='\t', index=False)


def _fit_row_to_header(fields: List[str], width: int) -> List[str]:
    """Return a copy of `fields` padded with '' or trimmed of surplus trailing empty fields to `width` items."""
    fitted = list(fields)
    while len(fitted) > width and fitted[-1] == '':
        fitted.pop()
    if len(fitted) > width:
        raise ValueError(f'Row has {len(fitted)} fields but header has {width} columns: {fields}')
    return fitted + [''] * (width - len(fitted))
download server + :param return_df: If False, returns List[str] of each line in the file, else a DataFrame. """ - file_headers = { - 'mim2gene.txt': '# MIM Number MIM Entry Type (see FAQ 1.3 at https://omim.org/help/faq) ' - 'Entrez Gene ID (NCBI) Approved Gene Symbol (HGNC) Ensembl Gene ID (Ensembl)', - 'genemap2.txt': '# Chromosome Genomic Position Start Genomic Position End Cyto Location ' - 'Computed Cyto Location MIM Number Gene Symbols Gene Name Approved Gene Symbol ' - 'Entrez Gene ID Ensembl Gene ID Comments Phenotypes Mouse Gene Symbol/ID' - } + file_name = file_name if file_name.endswith('.txt') else file_name + '.txt' mim_file_path: PosixPath = DATA_DIR / file_name mim_file_tsv_path: str = str(mim_file_path).replace('.txt', '.tsv') @@ -82,22 +113,10 @@ def get_mim_file(file_name: str, download=False, return_df=False) -> Union[List[ if resp.status_code == 200: text = resp.text if not text.startswith(''): - # Save file + # Save with open(mim_file_path, 'w') as fout: fout.write(text) - # TODO: mim2gene.txt & genemap2.txt: Need to uncomment out the first line - # modify 'text'. what's the nature of it? how to edit just that one line? - - # todo: This is brittle in that it depends on these headers not changing. Would be better, eventually, - # to read the lines into a list, then find the first line w/out a comment, get its index, then -1 to - # get index of prev line, then use that to remove the leading '# ' from that line. - # todo: also would be good to do this, because the other TSVs won't have their headers. It doesn't - # matter that much atm, because these files aren't used for anything programmatic. 
- if file_name in file_headers: - header = file_headers[file_name] - text = text.replace(header, header[2:]) # removes leading comment - with open(mim_file_tsv_path, 'w') as fout: - fout.write(text) + convert_txt_to_tsv(file_name) LOG.info(f'{file_name} retrieved and updated') else: raise RuntimeError('Unexpected response: ' + text) @@ -110,11 +129,13 @@ def get_mim_file(file_name: str, download=False, return_df=False) -> Union[List[ raise RuntimeError(msg) if return_df: - df = pd.read_csv(mim_file_tsv_path, comment='#', sep='\t') - return df + return pd.read_csv(mim_file_tsv_path, comment='#', sep='\t') else: with open(mim_file_path, 'r') as fin: lines: List[str] = fin.readlines() + # Remove comments + # - OMIM files always have comments at the top, and sometimes also at the bottom. + lines = [x for x in lines if not x.startswith('#')] return lines