Add NCBITaxon synonyms #167

Draft · wants to merge 41 commits into master

Changes from all commits · 41 commits
41144a2
Adds a `shortest_name_length` value to the synonyms file.
gaurav Jun 27, 2023
75bebc1
Removed some asserts with brackets because they're tuples.
gaurav May 16, 2023
e37dd99
Replaced pull_via_urllib() with requests.get().
gaurav May 16, 2023
f6c28a3
Fixed reference to local_filename.
gaurav May 16, 2023
ad91a1a
Modified icRDF.tsv to be downloaded directly from UberGraph.
gaurav May 5, 2023
38f3e23
Moved icrdf filename into Snakemake file.
gaurav May 16, 2023
d33cc6c
Added icrdf_filename as a required parameter to write_compendium().
gaurav May 16, 2023
79b6dbe
Added icrdf_filename to anatomy.
gaurav May 16, 2023
4173978
Added icrdf_filename to chemicals and diseasephenotype.
gaurav May 16, 2023
29ae048
Added icrdf_filename to remaining targets.
gaurav May 16, 2023
9ad765a
Fixed reference to icrdf_filename.
gaurav May 16, 2023
9f8006a
Fixed SPARQL queries.
gaurav May 16, 2023
cc01243
Added UMLS version to config.yaml.
gaurav May 14, 2023
918a729
Added UMLS download instructions to datacollect Snakemake.
gaurav May 14, 2023
68fc61a
Explicitly added MRSTY.RRF to write_umls_ids().
gaurav May 14, 2023
3cc146d
Removed UMLS files from README.
gaurav May 14, 2023
f34677a
Added MRCONSO as explicit input to umls.build_sets().
gaurav May 14, 2023
e17d2f4
First stab at downloading all the descriptions from UberGraph.
gaurav Mar 26, 2023
907d5a3
First stab at writing descriptions.
gaurav Mar 28, 2023
88cbad8
Make description downloads from Ubergraph optional.
gaurav Mar 28, 2023
4a28cb5
Make synonym files inputs to the final tasks.
gaurav Apr 18, 2023
5ab247c
Fixed some additional references to MacromolecularComplexMixin.
gaurav Mar 23, 2023
16fb6c9
Added `push` trigger for testing.
gaurav Feb 12, 2023
23a87f9
Fixed UMLS path.
gaurav May 16, 2023
57196d0
Fixed incorrect merge.
gaurav May 16, 2023
a46fa55
Fixed input parameter name in Snakemake rule.
gaurav May 16, 2023
cf14247
Included descriptions in the identifiers structure.
gaurav May 16, 2023
199442c
Fixed typo in pull_umls().
gaurav May 16, 2023
7fe5f97
Replace ftp:// with identical https:// URL.
gaurav May 17, 2023
0ffa6a8
Replaced other UniProtKB FTP URL with HTTPS URL.
gaurav May 17, 2023
9ea63db
Fixed icrdf_filename call to write_compendium() in build_protein_comp…
gaurav May 18, 2023
43e9529
Fixed incorrect GitHub action (incorrect commit?).
gaurav May 18, 2023
b220498
Merge branch 'add-lengths-to-synonym-output' into babel-1.0
gaurav Jun 27, 2023
c02a5da
Fixed incorrect order and duplication in datacollect.snakefile.
gaurav Jun 27, 2023
c9fa420
Protected label in case of unlabeled cliques.
gaurav Jun 27, 2023
e7e9751
Fixed duplications in babel_utils.py.
gaurav Jun 27, 2023
2175b1a
Removed incorrect merge.
gaurav Jun 27, 2023
bd5a420
Commented out unnecessary print statement.
gaurav Jun 27, 2023
966b971
Removed incorrectly duplicated code.
gaurav Jun 28, 2023
85ea210
Increased storage on Babel.
gaurav Jun 29, 2023
8d364e1
Added counts for next step.
gaurav Jul 10, 2023
10 changes: 2 additions & 8 deletions README.md

@@ -22,8 +22,6 @@ strong dependencies against the Babel code.

## Configuration

-Babel requires Python 3.11 or later.
-
Before running, edit `config.json` and set the `babel_downloads` and `babel_output` directories. Do not edit the
remaining items, which are used to control the build process.

@@ -121,14 +119,10 @@ You can also run Babel with [Docker](https://www.docker.com/). There are
two directories you need to bind or mount from outside the container:

```
-$ docker run -it --rm --mount type=bind,source=...,target=/home/runner/babel/input_data/private --mount type=bind,source=...,target=/home/runner/babel/babel_downloads --entrypoint /bin/bash ggvaidya/babel
+$ docker run -it --rm --mount type=bind,source=...,target=/home/runner/babel/babel_downloads --entrypoint /bin/bash ggvaidya/babel
```

-These two directories should be set up as following:
-* `babel/input_data/private` is used to store some input files
-  that you will need to download yourself:
-  * `MRCONSO.RRF` and `MRSTY.RRF`: parts of the UMLS release, need to be downloaded from [the UMLS download website](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html).
-* `babel/babel_downloads` is used to store data files downloaded during Babel assembly.
+The download directory (`babel/babel_downloads`) is used to store data files downloaded during Babel assembly.

The script `scripts/build-babel.sh` can be used to run `snakemake` with a few useful settings (although just running
`snakemake --cores 5` should work just fine).
2 changes: 2 additions & 0 deletions config.json
@@ -3,7 +3,9 @@
"download_directory": "babel_downloads",
"intermediate_directory": "babel_outputs/intermediate",
"output_directory": "babel_outputs",

"biolink_version": "3.3.3",
"umls_version": "2023AA",

"ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
"ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
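For reference, build steps can pick up the new `umls_version` the same way the rest of `config.json` is read; a minimal sketch (the helper name is hypothetical, not part of this PR):

```
import json

def get_umls_version(config_path="config.json"):
    # Read the build configuration and return the pinned UMLS release,
    # e.g. "2023AA", so download steps can refer to a single version.
    with open(config_path) as f:
        return json.load(f)["umls_version"]

print(get_umls_version())  # -> 2023AA
```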
2 changes: 1 addition & 1 deletion kubernetes/babel-downloads.k8s.yaml
@@ -13,5 +13,5 @@ spec:
- ReadWriteOnce
resources:
requests:
-storage: 500Gi
+storage: 600Gi
storageClassName: basic
2 changes: 1 addition & 1 deletion kubernetes/babel-outputs.k8s.yaml
@@ -15,5 +15,5 @@ spec:
- ReadWriteOnce
resources:
requests:
-storage: 300Gi
+storage: 400Gi
storageClassName: basic
30 changes: 28 additions & 2 deletions src/babel_utils.py
@@ -1,3 +1,4 @@
+import logging
from ftplib import FTP
from io import BytesIO
import gzip
@@ -202,14 +203,19 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None
# return the filename to the caller
return out_file_name

-def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[]):
+def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None):
"""
:param synonym_list:
:param ofname:
:param node_type:
:param labels:
:param extra_prefixes: We default to only allowing the prefixes allowed for a particular type in Biolink.
If you want to allow additional prefixes, list them here.
+:param icrdf_filename: (REQUIRED) The file to read the information content from (icRDF.tsv). Although this is a
+named parameter to make it easier to specify this when calling write_compendium(), it is REQUIRED, and
+write_compendium() will throw a RuntimeError if it is not specified. This is to ensure that it has been
+properly specified as a prerequisite in a Snakemake file, so that write_compendium() is not run until after
+icRDF.tsv has been generated.
:return:
"""
config = get_config()
@@ -218,7 +224,13 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[]):
node_factory = NodeFactory(make_local_name(''),biolink_version)
synonym_factory = SynonymFactory(make_local_name(''))
description_factory = DescriptionFactory(make_local_name(''))
-ic_factory = InformationContentFactory(f'{get_config()["input_directory"]}/icRDF.tsv')
+
+# Create an InformationContentFactory from the specified icRDF.tsv file. There is no
+# default: a missing icrdf_filename raises a RuntimeError below.
+if not icrdf_filename:
+raise RuntimeError("No icrdf_filename parameter provided to write_compendium() -- this is required!")
+ic_factory = InformationContentFactory(icrdf_filename)
+
node_test = node_factory.create_node(input_identifiers=[],node_type=node_type,labels={},extra_prefixes = extra_prefixes)
with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, jsonlines.open(os.path.join(cdir,'synonyms',ofname),'w') as sfile:
for slist in synonym_list:
@@ -253,8 +265,22 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[]):
document = {"curie": node["identifiers"][0]["identifier"],
"names": synonyms_list,
"types": [ t[8:] for t in node_factory.get_ancestors(node["type"])]} #remove biolink:

if "label" in node["identifiers"][0]:
document["preferred_name"] = node["identifiers"][0]["label"]

+# We previously used the shortest length of a name as a proxy for how good a match it is, i.e. given
+# two concepts that both have the word "acetaminophen" in them, we assume that the shorter one is the
+# more interesting one for users. I'm not sure if there's a better way to do that -- for instance,
+# could we consider the information content values? -- but in the interests of getting something
+# working quickly, this code restores that previous method.
+
+# Since synonyms_list is sorted by length, the first entry is the shortest name.
+if len(synonyms_list) == 0:
+logging.warning(f"Synonym list for {node} is empty: no valid name.")
+else:
+document["shortest_name_length"] = len(synonyms_list[0])
+
sfile.write( document )
except Exception as ex:
print(f"Exception thrown while write_compendium() was generating {ofname}: {ex}")
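The docstring above ties `icrdf_filename` to build scheduling: declaring icRDF.tsv as a rule input is what guarantees write_compendium() is not run until the file exists. A minimal Snakemake sketch of that pattern (rule name and paths are hypothetical, not taken from this PR):

```
rule build_anatomy_compendia:
    input:
        concords = "babel_downloads/anatomy/concords/UBERON",
        idlists = "babel_downloads/anatomy/ids/UBERON",
        # Declaring icRDF.tsv as an input makes it a prerequisite, so Snakemake
        # will not schedule this rule until the file has been generated.
        icrdf = "babel_downloads/icRDF.tsv",
    output:
        "babel_outputs/compendia/AnatomicalEntity.txt",
    run:
        anatomy.build_compendia([input.concords], [input.idlists], input.icrdf)
```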
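The new `shortest_name_length` logic depends on `synonyms_list` being sorted by length, so the first element is the shortest name. A standalone illustration of that assumption (not code from this PR):

```
# With a length-sorted synonym list, the shortest name is the first element.
synonyms_list = sorted(["acetaminophen", "APAP", "paracetamol"], key=len)
assert synonyms_list[0] == "APAP"
shortest_name_length = len(synonyms_list[0])  # 4
```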
12 changes: 6 additions & 6 deletions src/createcompendia/anatomy.py
@@ -62,7 +62,7 @@ def write_mesh_ids(outfile):
meshmap['A11.284'] = CELLULAR_COMPONENT
mesh.write_ids(meshmap,outfile)

-def write_umls_ids(outfile):
+def write_umls_ids(mrsty, outfile):
#UMLS categories:
#A1.2 Anatomical Structure
#A1.2.1 Embryonic Structure
@@ -77,7 +77,7 @@
umlsmap = {x: ANATOMICAL_ENTITY for x in ['A1.2', 'A1.2.1', 'A1.2.3.1', 'A1.2.3.2', 'A2.1.4.1', 'A2.1.5.1', 'A2.1.5.2']}
umlsmap['A1.2.3.3'] = CELL
umlsmap['A1.2.3.4'] = CELLULAR_COMPONENT
-umls.write_umls_ids(umlsmap,outfile)
+umls.write_umls_ids(mrsty, umlsmap, outfile)

#Ignore list notes:
#The BTO and BAMs and HTTP (braininfo) identifiers promote over-glommed nodes
@@ -96,10 +96,10 @@ def build_anatomy_obo_relationships(outdir):
build_sets(f'{UBERON}:0001062', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list)
build_sets(f'{GO}:0005575', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list)

-def build_anatomy_umls_relationships(idfile,outfile):
-umls.build_sets(idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT})
+def build_anatomy_umls_relationships(mrconso, idfile,outfile):
+umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT})

-def build_compendia(concordances, identifiers):
+def build_compendia(concordances, identifiers, icrdf_filename):
""":concordances: a list of files from which to read relationships
:identifiers: a list of files from which to read identifiers and optional categories"""
dicts = {}
@@ -122,7 +122,7 @@
typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types)
for biotype,sets in typed_sets.items():
baretype = biotype.split(':')[-1]
-write_compendium(sets,f'{baretype}.txt',biotype,{})
+write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename)

def create_typed_sets(eqsets,types):
"""Given a set of sets of equivalent identifiers, we want to type each one into
20 changes: 10 additions & 10 deletions src/createcompendia/chemicals.py
@@ -23,7 +23,7 @@ def get_type_from_smiles(smiles):
else:
return SMALL_MOLECULE

-def write_umls_ids(outfile):
+def write_umls_ids(mrsty, outfile):
groups = ['A1.4.1.1.1.1', #antibiotic
'A1.4.1.1.3.2', # Hormone
'A1.4.1.1.3.3',# Enzyme
@@ -40,11 +40,11 @@
#'A1.4.1.1.3.6',# Receptor
#'A1.4.1.2.1.7 Amino Acid, Peptide, or Protein
umlsmap = {a:CHEMICAL_ENTITY for a in groups}
-umls.write_umls_ids(umlsmap, outfile)
+umls.write_umls_ids(mrsty, umlsmap, outfile)


-def build_chemical_umls_relationships(idfile,outfile):
-umls.build_sets(idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK})
+def build_chemical_umls_relationships(mrconso, idfile, outfile):
+umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK})


def write_pubchem_ids(labelfile,smilesfile,outfile):
@@ -162,7 +162,7 @@ def write_drugbank_ids(infile,outfile):
written = set()
with open(infile,'r') as inf, open(outfile,'w') as outf:
header_line = inf.readline()
-assert(header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {infile}: {header_line}")
+assert header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {infile}: {header_line}"
for line in inf:
x = line.rstrip().split('\t')
if x[1] == drugbank_id:
@@ -241,11 +241,11 @@ def write_unichem_concords(structfile,reffile,outdir):
concfiles[num] = open(concname,'w')
with open(reffile,'rt') as inf:
header_line = inf.readline()
-assert(header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {reffile}: {header_line}")
+assert header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", f"Incorrect header line in {reffile}: {header_line}"
for line in inf:
x = line.rstrip().split('\t')
outf = concfiles[x[1]]
-assert(x[3] == '1') # Only '1' (current) assignments should be in this file
+assert x[3] == '1' # Only '1' (current) assignments should be in this file
# (see https://chembl.gitbook.io/unichem/definitions/what-is-an-assignment).
outf.write(f'{unichem_data_sources[x[1]]}:{x[2]}\toio:equivalent\t{inchikeys[x[0]]}\n')
for outf in concfiles.values():
@@ -256,7 +256,7 @@ def read_inchikeys(struct_file):
inchikeys = {}
with gzip.open(struct_file, 'rt') as inf:
header_line = inf.readline()
-assert(header_line == "UCI\tSTANDARDINCHI\tSTANDARDINCHIKEY\n", f"Unexpected header line in {struct_file}: {header_line}")
+assert header_line == "UCI\tSTANDARDINCHI\tSTANDARDINCHIKEY\n", f"Unexpected header line in {struct_file}: {header_line}"
for sline in inf:
line = sline.rstrip().split('\t')
if len(line) == 0:
@@ -511,7 +511,7 @@ def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_c
for s in untyped_sets:
outf.write(f'{set(s)}\n')

-def build_compendia(type_file,untyped_compendia_file):
+def build_compendia(type_file, untyped_compendia_file, icrdf_filename):
types = {}
with open(type_file,'r') as inf:
for line in inf:
@@ -525,7 +525,7 @@
typed_sets = create_typed_sets(untyped_sets, types)
for biotype, sets in typed_sets.items():
baretype = biotype.split(':')[-1]
-write_compendium(sets, f'{baretype}.txt', biotype, {})
+write_compendium(sets, f'{baretype}.txt', biotype, {}, icrdf_filename=icrdf_filename)

def create_typed_sets(eqsets, types):
"""
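Several commits above fix asserts written as `assert(condition, message)`. In Python that form asserts a two-element tuple, which is always truthy, so the check can never fail. A standalone demonstration:

```
header_line = "wrong header"

# Buggy form: asserts a non-empty tuple, so it always passes
# (CPython even emits a SyntaxWarning for this).
assert (header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", "Incorrect header line")

# Fixed form: evaluates the condition and raises AssertionError with the
# message when the condition is false.
try:
    assert header_line == "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT\n", "Incorrect header line"
except AssertionError as ex:
    print(ex)  # Incorrect header line
```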
18 changes: 9 additions & 9 deletions src/createcompendia/diseasephenotype.py
@@ -54,7 +54,7 @@ def write_mesh_ids(outfile):
meshmap['C23'] = PHENOTYPIC_FEATURE
mesh.write_ids(meshmap,outfile,order=[DISEASE,PHENOTYPIC_FEATURE])

-def write_umls_ids(outfile,badumlsfile):
+def write_umls_ids(mrsty, outfile,badumlsfile):
badumls=set()
with open(badumlsfile,'r') as inf:
for line in inf:
@@ -81,7 +81,7 @@
#A2.2.2 Sign or Symptom
umlsmap['A2.2.1'] = PHENOTYPIC_FEATURE
umlsmap['A2.2.2'] = PHENOTYPIC_FEATURE
-umls.write_umls_ids(umlsmap,outfile,blacklist=badumls)
+umls.write_umls_ids(mrsty, umlsmap, outfile, blacklist=badumls)


def build_disease_obo_relationships(outdir):
@@ -105,7 +105,7 @@
efo.make_concords(idfile, outfile)


-def build_disease_umls_relationships(idfile,outfile,omimfile,ncitfile):
+def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfile):
#UMLS contains xrefs between a disease UMLS and a gene OMIM. So here we are saying: if you are going to link to
# an omim identifier, make sure it's a disease omim, not some other thing.
good_ids = {}
@@ -115,15 +115,15 @@
for line in inf:
x = line.split()[0]
good_ids[prefix].add(x)
-umls.build_sets(idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM},acceptable_identifiers=good_ids)
+umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM},acceptable_identifiers=good_ids)

def build_disease_doid_relationships(idfile,outfile):
doid.build_xrefs(idfile, outfile, other_prefixes={'ICD10CM':ICD10, 'ICD9CM':ICD9, 'ICDO': ICD0, 'NCI': NCIT,
'SNOMEDCT_US_2018_03_01': SNOMEDCT, 'SNOMEDCT_US_2019_09_01': SNOMEDCT,
'SNOMEDCT_US_2020_03_01': SNOMEDCT, 'SNOMEDCT_US_2020_09_01': SNOMEDCT,
'UMLS_CUI': UMLS, 'KEGG': KEGGDISEASE})

-def build_compendium(concordances, identifiers, mondoclose, badxrefs):
+def build_compendium(concordances, identifiers, mondoclose, badxrefs, icrdf_filename):
""":concordances: a list of files from which to read relationships
:identifiers: a list of files from which to read identifiers and optional categories"""
dicts = {}
@@ -171,7 +171,7 @@
typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types)
for biotype,sets in typed_sets.items():
baretype = biotype.split(':')[-1]
-write_compendium(sets,f'{baretype}.txt',biotype,{})
+write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename)

def create_typed_sets(eqsets,types):
"""Given a set of sets of equivalent identifiers, we want to type each one into
@@ -228,7 +228,7 @@ def read_badxrefs(fn):
morebad.add( (x[0],x[1]) )
return morebad

-def load_diseases_and_phenotypes(concords,idlists,badhpos,badhpoxrefs):
+def load_diseases_and_phenotypes(concords,idlists,badhpos,badhpoxrefs, icrdf_filename):
#print('disease/phenotype')
#print('get and write hp sets')
#bad_mappings = read_bad_hp_mappings(badhpos)
@@ -299,8 +299,8 @@
print('dump it')
fs = set([frozenset(x) for x in dicts.values()])
diseases,phenotypes = create_typed_sets(fs)
-write_compendium(diseases,'disease.txt','biolink:Disease',labels)
-write_compendium(phenotypes,'phenotypes.txt','biolink:PhenotypicFeature',labels)
+write_compendium(diseases,'disease.txt','biolink:Disease',labels, icrdf_filename=icrdf_filename)
+write_compendium(phenotypes,'phenotypes.txt','biolink:PhenotypicFeature',labels, icrdf_filename=icrdf_filename)

if __name__ == '__main__':
with open('crapfile','w') as crapfile:
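The comment in build_disease_umls_relationships explains `acceptable_identifiers`: an OMIM xref should only be kept if it is a disease OMIM id. The actual check lives inside umls.build_sets (not shown in this diff); a hypothetical sketch of the filtering idea:

```
# Hypothetical illustration: keep an xref when its prefix is unrestricted,
# or when the id appears in the whitelist for that prefix.
def xref_is_acceptable(prefix, curie, acceptable_identifiers):
    if prefix not in acceptable_identifiers:
        return True
    return curie in acceptable_identifiers[prefix]

good_ids = {"OMIM": {"OMIM:104300"}}  # disease OMIM ids (example value)
print(xref_is_acceptable("OMIM", "OMIM:104300", good_ids))  # True
print(xref_is_acceptable("OMIM", "OMIM:999999", good_ids))  # False -- dropped
```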
13 changes: 6 additions & 7 deletions src/createcompendia/gene.py
@@ -1,5 +1,6 @@
import re

+from src import babel_utils
from src.prefixes import OMIM,ENSEMBL,NCBIGENE,WORMBASE, MGI, ZFIN, DICTYBASE, FLYBASE, RGD, SGD, HGNC, UMLS
from src.categories import GENE

@@ -97,7 +98,7 @@ def write_omim_ids(infile,outfile):
if chunks[1] == 'gene':
outf.write(f'{OMIM}:{chunks[0]}\n')

-def write_umls_ids(outfile):
+def write_umls_ids(mrconso, mrsty, outfile):
"""Find the UMLS entities that are genes. This is complicated by the fact that UMLS semantic type doesn't
have a corresponding GENE class. It has something (A1.2.3.5) which includes genes, but also includes genomes and
variants and gene properties and gene families. We can do some filtering by looking around in the MRCONSO as well
@@ -111,7 +112,6 @@
blacklist=set(['C0017361', #recessive genes
'C0017346', #Gag viral gene family
])
-mrsty = os.path.join('input_data', 'private', 'MRSTY.RRF')
umls_keepers = set()
with open(mrsty, 'r') as inf:
for line in inf:
@@ -121,7 +121,6 @@
umls_keepers.add(x[0])
umls_keepers.difference_update(blacklist)
#Now filter out OMIM variants
-mrconso = os.path.join('input_data', 'private', 'MRCONSO.RRF')
with open(mrconso,'r') as inf:
for line in inf:
x = line.strip().split('|')
@@ -248,11 +247,11 @@ def write_ensembl_ids(ensembl_dir, outfile):
outf.write(f'{gid}\n')


-def build_gene_umls_hgnc_relationships(umls_idfile,outfile):
+def build_gene_umls_hgnc_relationships(mrconso, umls_idfile, outfile):
#Could also add MESH, if that were a valid gene prefix
-umls.build_sets(umls_idfile, outfile, {'HGNC':HGNC})
+umls.build_sets(mrconso, umls_idfile, outfile, {'HGNC':HGNC})

-def build_gene_compendia(concordances, identifiers):
+def build_gene_compendia(concordances, identifiers, icrdf_filename):
""":concordances: a list of files from which to read relationships
:identifiers: a list of files from which to read identifiers and optional categories"""
dicts = {}
@@ -274,5 +273,5 @@
glom(dicts, pairs, unique_prefixes=uniques)
gene_sets = set([frozenset(x) for x in dicts.values()])
baretype = GENE.split(':')[-1]
-write_compendium(gene_sets, f'{baretype}.txt', GENE, {})
+write_compendium(gene_sets, f'{baretype}.txt', GENE, {}, icrdf_filename=icrdf_filename)
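The docstring for gene.py's write_umls_ids describes a two-pass filter: collect CUIs under semantic type A1.2.3.5 from MRSTY.RRF, then drop OMIM variant entries using MRCONSO.RRF. A sketch of the first pass (a hypothetical helper, assuming the standard pipe-separated MRSTY.RRF layout with the CUI in column 1 and the semantic tree number in column 3):

```
# Hypothetical sketch of the MRSTY pass: collect CUIs whose semantic tree
# number falls under A1.2.3.5, then remove explicitly blacklisted CUIs.
def candidate_gene_cuis(mrsty_path, blacklist=frozenset()):
    keep = set()
    with open(mrsty_path) as inf:
        for line in inf:
            fields = line.strip().split('|')
            cui, stn = fields[0], fields[2]
            if stn.startswith('A1.2.3.5'):
                keep.add(cui)
    return keep - blacklist
```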

4 changes: 2 additions & 2 deletions src/createcompendia/genefamily.py
@@ -2,7 +2,7 @@

from src.babel_utils import read_identifier_file,glom,write_compendium

-def build_compendia(identifiers):
+def build_compendia(identifiers, icrdf_filename):
""":concordances: a list of files from which to read relationships
:identifiers: a list of files from which to read identifiers and optional categories"""
dicts = {}
@@ -15,5 +15,5 @@
types.update(new_types)
genefam_sets = set([frozenset(x) for x in dicts.values()])
baretype = GENE_FAMILY.split(':')[-1]
-write_compendium(genefam_sets, f'{baretype}.txt', GENE_FAMILY, {})
+write_compendium(genefam_sets, f'{baretype}.txt', GENE_FAMILY, {}, icrdf_filename=icrdf_filename)
