Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce DrugChemical for load into SAPBERT #330

Open
wants to merge 19 commits into
base: load-drugchemical-into-duckdb
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,13 @@
"preferred_name_boost_prefixes": {
"biolink:ChemicalEntity": [
"DRUGBANK",
"GTOPDB",
"DrugCentral",
"CHEMBL.COMPOUND",
"RXCUI",
"CHEBI",
"MESH",
"CHEMBL.COMPOUND",
"GTOPDB",
"HMDB",
"RXCUI",
"PUBCHEM.COMPOUND"
]
},
Expand All @@ -92,5 +93,7 @@
],

"duckdb_config": {
}
},

"demote_labels_longer_than": 50
}
14 changes: 11 additions & 3 deletions src/babel_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,15 +436,17 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
"names": synonyms_list,
"types": [t[8:] for t in types]} # remove biolink:

# To pick a preferred label for this clique, we need to do three things:
# To pick a preferred label for this clique, we need to do four things:
# 1. We sort all labels in the preferred-name order. By default, this should be
# the preferred CURIE order, but if this clique is in one of the Biolink classes in
# preferred_name_boost_prefixes, we boost those prefixes in that order to the top of the list.
# 2. We filter out any suspicious labels.
# (If this simple filter doesn't work, and if prefixes are inconsistent, we can build upon the
# algorithm proposed by Jeff at
# https://github.com/NCATSTranslator/Feedback/issues/259#issuecomment-1605140850)
# 3. We choose the first label that isn't blank. If no labels remain, we generate a warning.
# 3. We filter out any labels longer than config['demote_labels_longer_than'], but only if there is
# at least one label shorter than this limit.
# 4. We choose the first label that isn't blank. If no labels remain, we generate a warning.
gaurav marked this conversation as resolved.
Show resolved Hide resolved

# Step 1.1. Sort labels in boosted prefix order if possible.
possible_labels = []
Expand All @@ -469,7 +471,13 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
not l.startswith('CHEMBL') # Some CHEMBL names are just the identifier again.
]

# Step 3. Pick the first label that isn't blank.
# Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at
# least one label shorter than this limit.
labels_shorter_than_limit = [l for l in filtered_possible_labels if l and len(l) <= config['demote_labels_longer_than']]
if labels_shorter_than_limit:
filtered_possible_labels = labels_shorter_than_limit

# Step 4. Pick the first label that isn't blank.
if filtered_possible_labels:
document["preferred_name"] = filtered_possible_labels[0]
else:
Expand Down
59 changes: 50 additions & 9 deletions src/exporters/sapbert.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from itertools import combinations

import logging

from src.node import get_config
from src.util import LoggingUtil

# Default logger for this file.
Expand All @@ -39,14 +41,29 @@ def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
:param sapbert_filename_gzipped: The SAPBERT training file to generate.
"""

config = get_config()

logger.info(f"convert_synonyms_to_sapbert({synonym_filename}, {sapbert_filename_gzipped})")

# For now, the simplest way to identify the DrugChemicalConflated file is by name.
# In this case we still generate DrugChemicalConflated.txt, but we also generate
# DrugChemicalConflatedSmaller.txt, which ignores labels longer than config['demote_labels_longer_than'].
generate_smaller_filename = None
if synonym_filename.endswith('/DrugChemicalConflated.txt'):
generate_smaller_filename = sapbert_filename_gzipped.replace('.txt.gz', 'Smaller.txt.gz')

# Make the output directories if they don't exist.
os.makedirs(os.path.dirname(sapbert_filename_gzipped), exist_ok=True)

# Open SmallerFile for writing if needed.
generate_smaller_file = None
if generate_smaller_filename:
generate_smaller_file = gzip.open(generate_smaller_filename, 'wt', encoding='utf-8')

# Go through all the synonyms in the input file.
count_entry = 0
count_training_text = 0
count_training_rows = 0
count_smaller_rows = 0
with open(synonym_filename, "r", encoding="utf-8") as synonymf, gzip.open(sapbert_filename_gzipped, "wt", encoding="utf-8") as sapbertf:
for line in synonymf:
count_entry += 1
Expand All @@ -59,6 +76,9 @@ def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
logging.warning(f"Unable to convert synonym entry for curie {curie}, skipping: {entry}")
continue

# Is the preferred name short enough that it should be included in generate_smaller_file?
is_preferred_name_short = len(preferred_name) <= config['demote_labels_longer_than']

# Collect and process the list of names.
names = entry['names']
if LOWERCASE_ALL_NAMES:
Expand All @@ -79,26 +99,47 @@ def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
# How many names do we have?
if len(names) == 0:
# This shouldn't happen, but let's anticipate this anyway.
sapbertf.write(
f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{preferred_name.lower()}\n"
)
count_training_text += 1
line = f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{preferred_name.lower()}\n"
sapbertf.write(line)
count_training_rows += 1
if generate_smaller_file and is_preferred_name_short:
generate_smaller_file.write(line)
count_smaller_rows += 1
elif len(names) == 1:
# If we have less than two names, we don't have anything to randomize.
line = f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{names[0]}\n"
sapbertf.write(line)
count_training_text += 1
count_training_rows += 1
if generate_smaller_file and is_preferred_name_short:
generate_smaller_file.write(line)
count_smaller_rows += 1
else:
name_pairs = list(itertools.combinations(set(names), 2))
is_any_name_short = any(map(lambda name: len(name) <= config['demote_labels_longer_than'], names))

if len(name_pairs) > MAX_SYNONYM_PAIRS:
# Randomly select 50 pairs.
name_pairs = random.sample(name_pairs, MAX_SYNONYM_PAIRS)

for name_pair in name_pairs:
sapbertf.write(f"biolink:{biolink_type}||{curie}||{preferred_name}||{name_pair[0]}||{name_pair[1]}\n")
count_training_text += 1
line = f"biolink:{biolink_type}||{curie}||{preferred_name}||{name_pair[0]}||{name_pair[1]}\n"
sapbertf.write(line)
count_training_rows += 1

# As long as any of the names is short enough, we should use this for training.
if generate_smaller_file and is_any_name_short:
generate_smaller_file.write(line)
count_smaller_rows += 1

logger.info(f"Converted {synonym_filename} to SAPBERT training file {sapbert_filename_gzipped}: " +
f"read {count_entry} entries and wrote out {count_training_text} training rows.")
f"read {count_entry} entries and wrote out {count_training_rows} training rows.")

# Close SmallerFile if needed.
if generate_smaller_file:
generate_smaller_file.close()
percentage = (count_smaller_rows / float(count_training_rows) * 100) if count_training_rows else 0.0
logger.info(f"Converted {synonym_filename} to smaller SAPBERT training file {generate_smaller_filename}: " +
f"read {count_entry} entries and wrote out {count_smaller_rows} training rows ({percentage:.2f}%).")