From 6bfe0b4bdcc21dea7b4f79a3698339c3e4000a17 Mon Sep 17 00:00:00 2001 From: Kevin Vizhalil Date: Mon, 7 Oct 2024 09:41:24 -0400 Subject: [PATCH] split patterns and synonyms #2388 --- code/ARAX/ARAXQuery/Filter_KG/remove_nodes.py | 21 +++++++++++-------- .../KnowledgeSources/general_concepts.json | 8 +++---- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/code/ARAX/ARAXQuery/Filter_KG/remove_nodes.py b/code/ARAX/ARAXQuery/Filter_KG/remove_nodes.py index fbe66f886..53569fe7b 100644 --- a/code/ARAX/ARAXQuery/Filter_KG/remove_nodes.py +++ b/code/ARAX/ARAXQuery/Filter_KG/remove_nodes.py @@ -157,23 +157,25 @@ def remove_orphaned_nodes(self): return self.response def _is_general_concept(self, node): - curies = [] - synonyms = [] + curies = set() + synonyms = set() if not node['attributes']: return False for attribute in node['attributes']: if attribute['attribute_type_id'] == 'biolink:xref': - curies += attribute.get('value',[]) + curies.update(map(str.lower, attribute.get('value', []))) if attribute['attribute_type_id'] == 'biolink:synonym': - synonyms += attribute.get('value',[]) + synonyms.update(map(str.lower, attribute.get('value', []))) if node['name']: - synonyms.append(node['name'].lower()) - if self.block_list_curies.intersection([curie.lower() for curie in curies if curie]): + synonyms.add(node['name'].lower()) + if self.block_list_curies.intersection(curies) or self.block_list_synonyms.intersection(synonyms): return True + for synonym in synonyms: - for block_list_synonym in self.block_list_synonyms: - if isinstance(synonym,str) and isinstance(block_list_synonym,str) and re.match(block_list_synonym, synonym,re.IGNORECASE): - return True + if not isinstance(synonym,str): + continue + if any(p.match(synonym) for p in self.block_list_patterns): + return True return False def remove_general_concept_nodes(self): @@ -200,6 +202,7 @@ def remove_general_concept_nodes(self): self.block_list_synonyms = set(block_list_dict["synonyms"]) self.block_list_curies = set(block_list_dict["curies"]) node_to_remove = set() + self.block_list_patterns = [re.compile(pattern,re.IGNORECASE) for pattern in block_list_dict["patterns"]] # iterate over edges find edges connected to the nodes for key, edge in self.message.knowledge_graph.edges.items(): if set({edge.subject, edge.object}).intersection(node_to_remove): diff --git a/code/ARAX/KnowledgeSources/general_concepts.json b/code/ARAX/KnowledgeSources/general_concepts.json index 4e30b2e89..79709da75 100644 --- a/code/ARAX/KnowledgeSources/general_concepts.json +++ b/code/ARAX/KnowledgeSources/general_concepts.json @@ -347,7 +347,6 @@ "umls:c0003209", "umls:c4045974", "umls:c0005522" - ], "synonyms": [ "used in nicotine dependence", @@ -684,7 +683,6 @@ "secondary", "uterotonics", "radiotherapy", - "pharmacolog.*", "medicament", "Anesthetics", "vaccines", @@ -708,6 +706,8 @@ "Introns", "antioxidant", "Oils" - + ], + "patterns": [ + "pharmacolog.*" ] -} +} \ No newline at end of file