get_node_counts edit #67

Merged: 3 commits, Jun 21, 2024
4 changes: 4 additions & 0 deletions .gitignore
@@ -109,3 +109,7 @@ fix_pubtator_type.py
etc
RDAS_GFKG/src/FineTunned_Bert_2.pt
RDAS_GFKG/src/Finetunned_Bert_2.pt
RDAS_CTKG/eligibility_extraction/
RDAS_CTKG/metamap_cond_out.json
RDAS_CTKG/metamap_cond.txt
RDAS_GFKG/convert_csv_fields.py
44 changes: 27 additions & 17 deletions RDAS_CTKG/methods.py
@@ -851,7 +851,7 @@ def add_metamap_annotation(db, trial_info):
score = v['score']
types = v['types']
nctid = v['nctid']
db.run(f'MATCH (y:ClinicalTrial) WHERE y.NCTId = \'{nctid}\' MERGE (x:Trial_Annotation {{umls_cui:\'{k}\', umls_concept:\'{concept}\', umls_types:{types}}}) MERGE (y)-[:has_metamap_annotation {{umls_score:{score}}}]->(x)')
db.run(f'MATCH (y:ClinicalTrial) WHERE y.NCTId = \'{nctid}\' MERGE (x:TrialAnnotation {{umls_cui:\'{k}\', umls_concept:\'{concept}\', umls_types:{types}}}) MERGE (y)-[:has_metamap_annotation {{umls_score:{score}}}]->(x)')

def metamap_trial_annotation(db, trial_info, update_metamap=True):
INSTANCE = Submission(os.environ['METAMAP_EMAIL'],os.environ['METAMAP_KEY'])
@@ -943,7 +943,7 @@ def condition_map(db, update_metamap=True):
"""

print('RUNNING SETUP')
gard_db = AlertCypher('gard')
gard_db = AlertCypher(f'{sysvars.gard_db}')

# # Initialize MetaMap instance
INSTANCE = Submission(os.environ['METAMAP_EMAIL'],os.environ['METAMAP_KEY'])
@@ -966,9 +966,30 @@ def condition_map(db, update_metamap=True):
else:
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS_Source:\"{usource}\"}})'.format(name=name,gard_id=gard_id,syns=syns,usource=usource))



print('ADDING GARD-CONDITION MAPPINGS BASED ON EXACT STRING MATCH')
# Fetch Condition nodes without existing annotations
res = db.run('MATCH (x:Condition) RETURN ID(x) as cond_id, x.Condition as cond').data()

# Create annotations based on exact string match and connect to GARD nodes
loweredList = db.run(f'MATCH (x:GARD) WITH [x IN x.GardName+x.Synonyms | toLower(x)] AS loweredList,x RETURN loweredList, ID(x) as gard_node_id').data()
for entry in res:
cond_id = entry['cond_id']
cond = normalize(entry['cond'])
cond_lower = cond.lower()

for idx,lowered in enumerate(loweredList):
lst = lowered['loweredList']
lst = [normalize(x).lower() for x in lst]
if cond_lower in lst:
gard_node_id = lowered['gard_node_id']
db.run('MATCH (x:GARD) WHERE ID(x) = {gard_node_id} MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:ConditionAnnotation {{UMLSPreferredName: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id,gard_node_id=gard_node_id))


print('RUNNING METAMAP')
# Fetch conditions from the database that havent already been annotated and are not acronyms
res = db.run('MATCH (c:Condition) WHERE NOT EXISTS((c)--(:Condition_Annotation)) RETURN c.Condition as condition, ID(c) as cond_id')
res = db.run('MATCH (c:Condition) RETURN c.Condition as condition, ID(c) as cond_id')
cond_strs = [f"{i['cond_id']}|{normalize(i['condition'])}\n" for i in res if not is_acronym(i['condition'])]

# Write condition strings to a file for MetaMap processing
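Note: the exact-match pass added above boils down to a case-insensitive comparison of each normalized Condition string against every GARD node's name and synonyms, before MetaMap is involved. A minimal standalone sketch of that matching logic; the `normalize` helper here is a placeholder assumption, not the module's actual implementation:

```python
import re

def normalize(text):
    # Placeholder for the module's normalize(); assumed to strip punctuation and collapse whitespace
    return re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', ' ', text)).strip()

def exact_string_matches(conditions, gard_nodes):
    """conditions: list of (cond_id, condition_text); gard_nodes: list of (gard_node_id, [GardName + Synonyms])."""
    lowered = [(gid, {normalize(t).lower() for t in terms}) for gid, terms in gard_nodes]
    matches = []
    for cond_id, cond in conditions:
        cond_lower = normalize(cond).lower()
        for gid, terms in lowered:
            if cond_lower in terms:
                # In condition_map this pair becomes a ConditionAnnotation node with MATCH_TYPE 'STRING'
                matches.append((cond_id, gid))
    return matches

# e.g. exact_string_matches([(1, 'Cystic fibrosis')], [(7, ['Cystic Fibrosis', 'CF'])]) -> [(1, 7)]
```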
@@ -1034,7 +1055,7 @@ def condition_map(db, update_metamap=True):

print('CREATING AND CONNECTING METAMAP ANNOTATIONS')
# Fetch relevant data from Condition nodes
res = db.run('MATCH (x:Condition) WHERE x.METAMAP_OUTPUT IS NOT NULL RETURN ID(x) AS cond_id, x.METAMAP_OUTPUT AS cumls, x.METAMAP_PREFERRED_TERM AS prefs, x.FUZZY_SCORE as fuzz, x.METAMAP_SCORE as meta').data()
res = db.run('MATCH (x:Condition) WHERE x.METAMAP_OUTPUT IS NOT NULL RETURN ID(x) AS cond_id, x.Condition as condition, x.METAMAP_OUTPUT AS cumls, x.METAMAP_PREFERRED_TERM AS prefs, x.FUZZY_SCORE as fuzz, x.METAMAP_SCORE as meta').data()

exclude_umls = sysvars.umls_blacklist

@@ -1057,26 +1078,15 @@
gard_ids = gard_ids['gard_id']
for gard_id in gard_ids:
# Create Annotation nodes and connect to Condition and GARD nodes
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:ConditionAnnotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))

db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Condition_Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
else:
# Create Annotation nodes and connect to Condition nodes
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Condition_Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:ConditionAnnotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))

print('REMOVING UNNEEDED PROPERTIES')
# Remove unnecessary properties from Condition nodes that were used during processing
db.run('MATCH (x:Condition) SET x.METAMAP_PREFERRED_TERM = NULL SET x.METAMAP_OUTPUT = NULL SET x.FUZZY_SCORE = NULL SET x.METAMAP_SCORE = NULL')

print('ADDING GARD-CONDITION MAPPINGS BASED ON EXACT STRING MATCH')
# Fetch Condition nodes without existing annotations
res = db.run('MATCH (x:Condition) WHERE NOT (x)-[:has_annotation]-() RETURN ID(x) as cond_id, x.Condition as cond').data()

# Create annotations based on exact string match and connect to GARD nodes
for entry in res:
cond_id = entry['cond_id']
cond = entry['cond']
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Condition_Annotation {{UMLSPreferredName: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))



def drug_normalize(drug):
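Note: since earlier runs may have created annotation nodes under the old Trial_Annotation / Condition_Annotation labels, a hypothetical post-migration sanity check against the CTKG database could look like the following; the db handle and the .run(...).data() call follow the AlertCypher usage seen in this file:

```python
# Hypothetical check: count annotation nodes per label to confirm only the new labels remain
res = db.run(
    'MATCH (n) WHERE n:TrialAnnotation OR n:Trial_Annotation '
    'OR n:ConditionAnnotation OR n:Condition_Annotation '
    'RETURN labels(n) AS node_labels, count(*) AS cnt'
).data()
print(res)  # expect counts only for TrialAnnotation / ConditionAnnotation after this change
```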
2 changes: 1 addition & 1 deletion RDAS_GARD/methods.py
@@ -384,7 +384,7 @@ def populate_node_counts(db,data,prop_name):

res1 = db.run('MATCH (x:GARD)--(y:Phenotype) WITH COUNT(DISTINCT y) AS cnt,x SET x.COUNT_PHENOTYPES = cnt').data()
res2 = db.run('MATCH (x:GARD)--(y:Gene) WITH COUNT(DISTINCT y) AS cnt,x SET x.COUNT_GENES = cnt').data()
res3 = ct_db.run('MATCH (x:GARD)--(y:Annotation)--(z:Condition)--(ct:ClinicalTrial) WITH COUNT(DISTINCT ct) AS cnt,x RETURN cnt AS cnt,x.GardId AS gard_id').data()
res3 = ct_db.run('MATCH (x:GARD)--(y:ConditionAnnotation)--(z:Condition)--(ct:ClinicalTrial) WITH COUNT(DISTINCT ct) AS cnt,x RETURN cnt AS cnt,x.GardId AS gard_id').data()
res4 = pm_db.run('MATCH (x:GARD)--(y:Article) WITH COUNT(DISTINCT y) AS cnt,x RETURN cnt AS cnt, x.GardId AS gard_id').data()
res5 = gnt_db.run('MATCH (x:GARD)--(y:Project)--(z:CoreProject) WITH COUNT(DISTINCT z) AS cnt,x RETURN cnt AS cnt, x.GardId as gard_id').data()

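Note: this relabel is what the PR title refers to: the trial count traversal in populate_node_counts only resolves once the annotation nodes created in RDAS_CTKG carry the ConditionAnnotation label. A rough sketch of how the per-GARD counts returned by res3/res4/res5 might be written back onto GARD nodes; the COUNT_TRIALS property name and the string-formatted db.run call are assumptions modeled on the surrounding code:

```python
def write_counts(db, rows, prop_name):
    # rows look like [{'cnt': 12, 'gard_id': '0000001'}, ...] as returned by the count queries above
    for row in rows:
        db.run('MATCH (x:GARD) WHERE x.GardId = "{gard_id}" SET x.{prop} = {cnt}'.format(
            gard_id=row['gard_id'], prop=prop_name, cnt=row['cnt']))

# Hypothetical usage for the clinical-trial counts:
# write_counts(db, res3, 'COUNT_TRIALS')
```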
10 changes: 4 additions & 6 deletions RDAS_GFKG/annotate_text.py
@@ -26,9 +26,7 @@ def load_model(model_name):

nlp.add_pipe('remove_duplicate_entities')
nlp.add_pipe('abbreviation_detector')
nlp.add_pipe('scispacy_linker', config={'linker_name':'umls',
'resolve_abbreviations':True,
'threshold':0.8})
nlp.add_pipe('scispacy_linker', config={'linker_name':'umls','resolve_abbreviations':True,'threshold':0.8})
break
except:
pass
@@ -39,7 +37,7 @@ def get_umls_concepts(nlp, text):
text.sort_values(by=['APPLICATION_ID'], inplace=True)
text.reset_index(drop=True, inplace=True)

docs = list(nlp.pipe(text['ABSTRACT_TEXT']))
docs = list(nlp.pipe(text['ABSTRACT_TEXT'], batch_size=100))
linker = nlp.get_pipe('scispacy_linker')

meta_df_lst = []
@@ -50,8 +48,8 @@ def get_umls_concepts(nlp, text):
all_umls_data = []

for ent in doc.ents:
if len(ent._.umls_ents) > 0:
highest_umls_ent = ent._.umls_ents[0]
if len(ent._.kb_ents) > 0:
highest_umls_ent = ent._.kb_ents[0]
concept_entity.append((highest_umls_ent[0], str(ent)))
umls_data = linker.kb.cui_to_entity[highest_umls_ent[0]]
all_umls_data.append(umls_data)
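Note: recent scispaCy releases expose linker candidates on `ent._.kb_ents` (a list of `(CUI, score)` tuples, best first) instead of the older `umls_ents` attribute, which is what this hunk switches to. A minimal usage sketch, assuming the en_core_sci_lg model and the UMLS linker artifacts are installed locally:

```python
import spacy
from scispacy.abbreviation import AbbreviationDetector  # registers the 'abbreviation_detector' factory
from scispacy.linking import EntityLinker               # registers the 'scispacy_linker' factory

nlp = spacy.load('en_core_sci_lg')
nlp.add_pipe('abbreviation_detector')
nlp.add_pipe('scispacy_linker', config={'linker_name': 'umls', 'resolve_abbreviations': True, 'threshold': 0.8})
linker = nlp.get_pipe('scispacy_linker')

doc = nlp('Patients with cystic fibrosis were enrolled in the study.')
for ent in doc.ents:
    if len(ent._.kb_ents) > 0:
        cui, score = ent._.kb_ents[0]          # highest-scoring UMLS candidate
        entity = linker.kb.cui_to_entity[cui]  # canonical name, aliases, definition, semantic types
        print(ent.text, cui, round(score, 3), entity.canonical_name)
```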
111 changes: 40 additions & 71 deletions RDAS_GFKG/prep_neo4j_data.py
@@ -23,6 +23,7 @@
import pandas as pd
from prepare_annotation_text import prepare_phr_aim
from annotate_text import *
from subprocess import *
from remove_general_umls_concepts import clean_annotation_output
from AlertCypher import AlertCypher
import RDAS_GFKG.methods as rdas
@@ -72,13 +73,23 @@ def years_to_files(subdir: str):
def aggregate_disease_data():
# Rename GARD-Project mapping results columns to match the names listed in the GARD data
normmap_df = pd.read_csv(data_neo4j('normmap_results.csv'),index_col=False,usecols=['ID','GARD_id','CONF_SCORE','SEM_SIM'])
normmap_df = normmap_df.rename(columns={'ID':'APPLICATION_ID', 'GARD_id': 'GARD_ID'})
normmap_df = normmap_df.rename(columns={'ID':'APPLICATION_ID'})

# Split tuple normmap result into 2 seperate columns
normmap_df[['GARD_NAME', 'GARD_ID']] = normmap_df['GARD_id'].str.extract(r'\(\'(.*?)\', \'(.*?)\'\)')
# drop the original column
normmap_df.drop('GARD_id', axis=1, inplace=True)

disease_df = pd.read_json(data_raw('all_gards.json'))
disease_df = disease_df.rename(columns={'GARD id':'GARD_ID', 'Name':'NAME', 'Synonyms':'SYNONYMS'})

normmap_df.reset_index(drop=True)
disease_df.reset_index(drop=True)

# Merge the GARD-Project mapping results with the GARD data
merged_df = pd.merge(normmap_df, disease_df, on=['GARD_ID'])
merged_df = pd.merge(normmap_df, disease_df, on=['GARD_ID'], how='left')
merged_df = merged_df.dropna(subset=['GARD_ID'])
#merged_df = normmap_df.merge(disease_df[['GARD_ID']])
merged_df.to_csv(data_neo4j('disease/disease_to_application.csv'),index=False)

def combine_normmap_results():
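Note: the extraction above assumes each GARD_id cell holds a stringified 2-tuple such as "('Cystic Fibrosis', '0000586')" written by the normmap stage; the regex pulls the two quoted pieces into GARD_NAME and GARD_ID. A self-contained check of that behavior (sample values invented):

```python
import pandas as pd

normmap_df = pd.DataFrame({'GARD_id': ["('Cystic Fibrosis', '0000586')", "('Fabry Disease', '0006400')"]})
normmap_df[['GARD_NAME', 'GARD_ID']] = normmap_df['GARD_id'].str.extract(r"\('(.*?)', '(.*?)'\)")
normmap_df.drop('GARD_id', axis=1, inplace=True)
print(normmap_df)
#          GARD_NAME  GARD_ID
# 0  Cystic Fibrosis  0000586
# 1    Fabry Disease  0006400
```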
@@ -89,7 +100,7 @@ def combine_normmap_results():

for filename in files:
print(f'Combining abstract file {filename} into the last')
tmp = pd.read_csv(('{filename}'.format(filename=filename)),index_col=False, encoding = "ISO-8859-1")
tmp = pd.read_csv(('{filename}'.format(filename=filename)),index_col=False,sep='|')
combine_df = pd.concat([combine_df,tmp], axis=0)

combine_df['APPLICATION_ID'] = combine_df['ID'].astype(int)
@@ -181,7 +192,7 @@ def batch_normmap(df, thr, year):
with lock:
print({'ID': appl_id, 'GARD_id': gard, 'CONF_SCORE': add_data[0], 'SEM_SIM': add_data[1]})
with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "a") as f:
f.writelines([f'{appl_id},{gard},{add_data[0]},{add_data[1]}\n'])
f.writelines([f'{appl_id}|{gard}|{add_data[0]}|{add_data[1]}\n'])

#except Exception as e:
#print(e)
@@ -286,7 +297,7 @@ def run_normmap():

# Create CSV files headers
with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "w") as f:
f.writelines(['ID,GARD_id,CONF_SCORE,SEM_SIM\n'])
f.writelines(['ID|GARD_id|CONF_SCORE|SEM_SIM\n'])

df = pd.read_csv(norm_file, index_col=False, low_memory=False)
chunk_size = int(len(df)/5)
@@ -643,8 +654,8 @@ def cleanup_pub_country():

def fix_escaped_endings():
def tf(val):
if type(val) == str and val[-1] == '\\':
return val[:-1]
if type(val) == str:
return val.encode('unicode_escape').decode('utf-8')
else:
return val
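Note: the new tf escapes every backslash and control character instead of only trimming a single trailing backslash, which presumably keeps stray escapes from terminating quoted CSV fields early during Neo4j import. What the transformation actually does:

```python
def tf(val):
    if type(val) == str:
        return val.encode('unicode_escape').decode('utf-8')
    else:
        return val

print(tf('ends with a backslash\\'))  # one trailing backslash becomes two: ends with a backslash\\
print(tf('line one\nline two'))       # the newline becomes the two characters \n: line one\nline two
print(tf(42))                         # non-strings pass through unchanged: 42
```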

@@ -700,6 +711,10 @@ def find_RD_apps(input_file, rd_ids):


def annotation_preprocess_grant():
# Load scispaCy model and add the abbreviation detector to the pipeline
nlp = spacy.load("en_core_sci_lg", exclude=["parser", "ner"])
nlp.add_pipe("abbreviation_detector")

# Get CSV files lists from projects and abstracts folders
projects_files = sorted(years_to_files("projects"))
abstracts_files = sorted(years_to_files("abstracts"))
@@ -711,7 +726,7 @@ def annotation_preprocess_grant():
continue

# Preprocesses information related to PHR and AIM (May not be needed due to Jaber's code)
annotate_text = prepare_phr_aim(projects_file, abstracts_file)
annotate_text = prepare_phr_aim(projects_file, abstracts_file, nlp)

year = projects_file[-8:-4]

@@ -755,10 +770,14 @@ def annotate_grant_abstracts():
umls.to_csv(output_file, index=False, mode='a', header=False)

print("Added annotations to", output_file)
text = None
umls = None

except Exception as e:
print(e)
continue

nlp = None

print("***** ALL DONE *****")

@@ -879,35 +898,6 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
"publications",
"normmap"
]



# Clear out old files, or initialize new repo for them if the given data_neo4j path does
# not already exist.
repo = None
is_new_repo = False
'''
try:
files = os.listdir(neo4j_path)
files.remove(".git")
print("Found existing git repo at neo4j_path, clearing out old files")
repo = pygit2.Repository(data_neo4j(".git"))
is_new_repo = False
for f in files:
# skip annotation-related files because we may not want to reannotate some files
if f in ["annotation_files", "annotation_source", "annotation_umls", "grants_umls"]:
folders_to_create.remove(f)
continue
fpath = data_neo4j(f)
shutil.rmtree(fpath) if os.path.isdir(fpath) else os.remove(fpath)
except (NotADirectoryError, FileNotFoundError, ValueError):
print("Not existing repo at given data_neo4j path, initializing")
repo = pygit2.init_repository(neo4j_path)
is_new_repo = True

print("Creating empty output directories")
'''


# add empty folders for the generated files
add_folder = lambda folder_name: os.mkdir(data_neo4j(folder_name))
@@ -919,7 +909,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
##############################################
# Run preprocessing stages one after another.#
##############################################

"""
print('Running get_disease_data')
get_disease_data()
print("Running get_RD_project_ids")
@@ -958,6 +948,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:

print("Running annotation_preprocess_grant")
annotation_preprocess_grant()

print("Running annotate_grant_abstracts")
annotate_grant_abstracts()
print("Running clean_umls_concepts")
@@ -966,49 +957,27 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
clean_annotation_source()
print("Running map_semantic_types")
map_semantic_types()

print("Running fix_escaped_endings")
fix_escaped_endings()



################################################

# Finished running stages, now we figure out which files have changed
# (and store them in a FilesToAdd object to be returned) and commit the changes
'''
print("Getting current repo status")
status = repo.status().items()
fta = {}
for subdir in FilesToAdd.__required_keys__:
fta[subdir] = [data_neo4j(k) for k,v in status
if k.startswith(subdir)
and v in [pygit2.GIT_STATUS_WT_MODIFIED, pygit2.GIT_STATUS_WT_NEW]]

print("adding all changes to commit")
if is_new_repo:
ref = "HEAD"
parents = []
else:
ref = repo.head.name
parents = [repo.head.target]
index = repo.index
index.add_all()
index.write()
print("Committing")
author = pygit2.Signature('script authored', 'no@email.given')
committer = pygit2.Signature('script committed', 'no@email.given')
message = "regular update commit"
tree = index.write_tree()
repo.create_commit(ref, author, committer, message, tree, parents)
'''


# Transfers all processed files to neo4j-dev server so that the code can find them and populate the databases
print('Transfering grant database files to the neo4j-dev server')
target_url = sysvars.rdas_urls['neo4j-dev']
p = Popen(['scp', '-r', '-i', f'~/.ssh/id_rsa', f'{sysvars.gnt_files_path}/processed/', f'{sysvars.current_user}@{target_url}:{sysvars.gnt_files_path}'], encoding='utf8')
p.wait()
print('Transfer done...')
"""
# Gets the names of every processed file added for the rest of the code to add to the neo4j
fta = {}
for subdir in FilesToAdd.__dict__['__annotations__'].keys():
fta[subdir] = ['file:///' + data_neo4j(subdir)+'/'+d for d in os.listdir(data_neo4j(subdir))] #'file:/'
fta[subdir] = sorted(['file:///' + data_neo4j(subdir)+'/'+d for d in os.listdir(data_neo4j(subdir)) if not d == 'README.md']) #'file:/'
print(fta)
return fta


# For testing purposes; this file is typically not run directly, but instead called
# by update_grant.py with an appropriate raw and output path
6 changes: 3 additions & 3 deletions RDAS_GFKG/prepare_annotation_text.py
@@ -23,11 +23,11 @@


# Load scispaCy model and add the abbreviation detector to the pipeline
nlp = spacy.load("en_core_sci_lg", exclude=["parser", "ner"])
nlp.add_pipe("abbreviation_detector")
#nlp = spacy.load("en_core_sci_lg", exclude=["parser", "ner"])
#nlp.add_pipe("abbreviation_detector")


def prepare_phr_aim(projects_file, abstracts_file):
def prepare_phr_aim(projects_file, abstracts_file, nlp):
'''
Select the Public Health Relevance (PHR) or
the aim section of the abstract for annotation.
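Note: loading en_core_sci_lg at import time made every import of prepare_annotation_text pay the model-load cost; with this change the caller (annotation_preprocess_grant) loads the pipeline once and threads it through as a parameter. A sketch of the resulting call pattern (file paths are placeholders):

```python
import spacy
from scispacy.abbreviation import AbbreviationDetector  # registers the 'abbreviation_detector' factory
from prepare_annotation_text import prepare_phr_aim

# Load the scispaCy pipeline once and reuse it for every yearly projects/abstracts pair
nlp = spacy.load('en_core_sci_lg', exclude=['parser', 'ner'])
nlp.add_pipe('abbreviation_detector')

for projects_file, abstracts_file in [('projects_2023.csv', 'abstracts_2023.csv')]:  # placeholder paths
    annotate_text = prepare_phr_aim(projects_file, abstracts_file, nlp)
```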