Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce DrugChemical for load into SAPBERT #330

Open
wants to merge 19 commits into
base: load-drugchemical-into-duckdb
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,13 @@
"preferred_name_boost_prefixes": {
"biolink:ChemicalEntity": [
"DRUGBANK",
"GTOPDB",
"DrugCentral",
"CHEMBL.COMPOUND",
"RXCUI",
"CHEBI",
"MESH",
"CHEMBL.COMPOUND",
"GTOPDB",
"HMDB",
"RXCUI",
"PUBCHEM.COMPOUND"
]
},
Expand All @@ -92,5 +93,7 @@
],

"duckdb_config": {
}
},

"demote_labels_longer_than": 50
}
14 changes: 11 additions & 3 deletions src/babel_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,15 +436,17 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
"names": synonyms_list,
"types": [t[8:] for t in types]} # remove biolink:

# To pick a preferred label for this clique, we need to do three things:
# To pick a preferred label for this clique, we need to do four things:
# 1. We sort all labels in the preferred-name order. By default, this should be
# the preferred CURIE order, but if this clique is in one of the Biolink classes in
# preferred_name_boost_prefixes, we boost those prefixes in that order to the top of the list.
# 2. We filter out any suspicious labels.
# (If this simple filter doesn't work, and if prefixes are inconsistent, we can build upon the
# algorithm proposed by Jeff at
# https://github.com/NCATSTranslator/Feedback/issues/259#issuecomment-1605140850)
# 3. We choose the first label that isn't blank. If no labels remain, we generate a warning.
# 3. We filter out any labels longer than config['demote_labels_longer_than'], but only if there is
# at least one label shorter than this limit.
# 4. We choose the first label that isn't blank. If no labels remain, we generate a warning.
gaurav marked this conversation as resolved.
Show resolved Hide resolved

# Step 1.1. Sort labels in boosted prefix order if possible.
possible_labels = []
Expand All @@ -469,7 +471,13 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
not l.startswith('CHEMBL') # Some CHEMBL names are just the identifier again.
]

# Step 3. Pick the first label that isn't blank.
# Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at
# least one label shorter than this limit.
labels_shorter_than_limit = [l for l in filtered_possible_labels if l and len(l) <= config['demote_labels_longer_than']]
if labels_shorter_than_limit:
filtered_possible_labels = labels_shorter_than_limit

# Step 4. Pick the first label that isn't blank.
if filtered_possible_labels:
document["preferred_name"] = filtered_possible_labels[0]
else:
Expand Down
59 changes: 50 additions & 9 deletions src/exporters/sapbert.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from itertools import combinations

import logging

from src.node import get_config
from src.util import LoggingUtil

# Default logger for this file.
Expand All @@ -39,14 +41,29 @@ def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
:param sapbert_filename_gzipped: The SAPBERT training file to generate.
"""

config = get_config()

logger.info(f"convert_synonyms_to_sapbert({synonym_filename}, {sapbert_filename_gzipped})")

# For now, the simplest way to identify the DrugChemicalConflated file is by name.
# In this case we still generate DrugChemicalConflated.txt, but we also generate
# DrugChemicalConflatedSmaller.txt, which ignores labels longer than config['demote_labels_longer_than'].
generate_smaller_filename = None
if synonym_filename.endswith('/DrugChemicalConflated.txt'):
generate_smaller_filename = sapbert_filename_gzipped.replace('.txt.gz', 'Smaller.txt.gz')

# Make the output directories if they don't exist.
os.makedirs(os.path.dirname(sapbert_filename_gzipped), exist_ok=True)

# Open SmallerFile for writing if needed.
generate_smaller_file = None
if generate_smaller_filename:
generate_smaller_file = gzip.open(generate_smaller_filename, 'wt', encoding='utf-8')

# Go through all the synonyms in the input file.
count_entry = 0
count_training_text = 0
count_training_rows = 0
count_smaller_rows = 0
with open(synonym_filename, "r", encoding="utf-8") as synonymf, gzip.open(sapbert_filename_gzipped, "wt", encoding="utf-8") as sapbertf:
for line in synonymf:
count_entry += 1
Expand All @@ -59,6 +76,9 @@ def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
logging.warning(f"Unable to convert synonym entry for curie {curie}, skipping: {entry}")
continue

# Is the preferred name short enough that it should be included in generate_smaller_file?
is_preferred_name_short = len(preferred_name) <= config['demote_labels_longer_than']

# Collect and process the list of names.
names = entry['names']
if LOWERCASE_ALL_NAMES:
Expand All @@ -79,26 +99,47 @@ def convert_synonyms_to_sapbert(synonym_filename, sapbert_filename_gzipped):
# How many names do we have?
if len(names) == 0:
# This shouldn't happen, but let's anticipate this anyway.
sapbertf.write(
f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{preferred_name.lower()}\n"
)
count_training_text += 1
line = f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{preferred_name.lower()}\n"
sapbertf.write(line)
count_training_rows += 1
if generate_smaller_file and is_preferred_name_short:
generate_smaller_file.write(line)
count_smaller_rows += 1
elif len(names) == 1:
# If we have less than two names, we don't have anything to randomize.
line = f"biolink:{biolink_type}||{curie}||{preferred_name}||{preferred_name.lower()}||{names[0]}\n"
sapbertf.write(line)
count_training_text += 1
count_training_rows += 1
if generate_smaller_file and is_preferred_name_short:
generate_smaller_file.write(line)
count_smaller_rows += 1
else:
name_pairs = list(itertools.combinations(set(names), 2))
is_any_name_short = any(map(lambda name: len(name) <= config['demote_labels_longer_than'], names))

if len(name_pairs) > MAX_SYNONYM_PAIRS:
# Randomly select 50 pairs.
name_pairs = random.sample(name_pairs, MAX_SYNONYM_PAIRS)

for name_pair in name_pairs:
sapbertf.write(f"biolink:{biolink_type}||{curie}||{preferred_name}||{name_pair[0]}||{name_pair[1]}\n")
count_training_text += 1
line = f"biolink:{biolink_type}||{curie}||{preferred_name}||{name_pair[0]}||{name_pair[1]}\n"
sapbertf.write(line)
count_training_rows += 1

# As long as any of the names is short enough, we should use this for training.
if generate_smaller_file and is_any_name_short:
generate_smaller_file.write(line)
count_smaller_rows += 1

logger.info(f"Converted {synonym_filename} to SAPBERT training file {sapbert_filename_gzipped}: " +
f"read {count_entry} entries and wrote out {count_training_text} training rows.")
f"read {count_entry} entries and wrote out {count_training_rows} training rows.")

# Close SmallerFile if needed.
if generate_smaller_file:
generate_smaller_file.close()
percentage = (count_smaller_rows / float(count_training_rows) * 100) if count_training_rows else 0.0
logger.info(f"Converted {synonym_filename} to smaller SAPBERT training file {generate_smaller_filename}: " +
f"read {count_entry} entries and wrote out {count_smaller_rows} training rows ({percentage:.2f}%).")