Skip to content

Commit

Permalink
Document.merge_documents() (#1)
Browse files Browse the repository at this point in the history
  • Loading branch information
myedibleenso authored Feb 2, 2024
1 parent f48cb6e commit 11970d5
Show file tree
Hide file tree
Showing 53 changed files with 123 additions and 4 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ minversion = "7.0"
addopts = "-v --cov=lum.clu --cov-report html:docs/coverage"
testpaths = [
"python/tests",
"python/lum/clu/processors/tests",
"python/lum/clu/odin/tests"
]

Expand Down
1 change: 0 additions & 1 deletion python/lum/clu/odin/mention.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,6 @@ class EventMention(Mention):
arguments: Mention.Arguments = Field(default={}, description="A mapping of the EventMention's arguments (role -> list[Mention])")
paths: typing.Optional[Mention.Paths] = Field(default={}, description="Graph traversal leading to each argument")


def copy(
self,
maybe_trigger: typing.Optional[TextBoundMention] = None,
Expand Down
33 changes: 33 additions & 0 deletions python/lum/clu/processors/document.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from __future__ import annotations
from pydantic import BaseModel, Field, ConfigDict
from lum.clu.processors.sentence import Sentence
from lum.clu.processors.utils import Labels
Expand All @@ -18,6 +19,38 @@ class Document(BaseModel):

sentences: list[Sentence] = Field(description="The sentences comprising the `Document`.")

@staticmethod
def merge_documents(docs: list[Document]) -> Document:
    """Merge one or more Documents into a single Document.

    Sentences are rebuilt in input order. The start/end offsets of each
    sentence are shifted by the combined length of the text of all
    preceding documents, so that (assuming each input's offsets are
    relative to its own `text`) they index into the concatenated text.

    Args:
        docs: The Documents to merge, in the order their text should be
            concatenated. Must contain at least one Document.

    Returns:
        A new Document whose `id` is taken from the first input, whose
        `text` is the concatenation of the inputs' text (or None when no
        input has any text), and whose `sentences` are offset-adjusted
        copies of the inputs' sentences.

    Raises:
        IndexError: If `docs` is empty.
    """
    if not docs:
        # Same exception type the unguarded `docs[0]` lookup produced,
        # but with a message that names the actual problem.
        raise IndexError("merge_documents() requires at least one Document")

    text_parts: list[str] = []
    sentences: list[Sentence] = []
    offset = 0
    for doc in docs:
        for old in doc.sentences:
            sentences.append(
                Sentence(
                    text=old.text,
                    raw=old.raw,
                    words=old.words,
                    startOffsets=[i + offset for i in old.start_offsets],
                    endOffsets=[i + offset for i in old.end_offsets],
                    tags=old.tags,
                    lemmas=old.lemmas,
                    norms=old.norms,
                    chunks=old.chunks,
                    entities=old.entities,
                    graphs=old.graphs,
                )
            )
        # A document without text contributes nothing to the merged text,
        # so it must not advance the offset applied to later documents.
        if doc.text:
            text_parts.append(doc.text)
            offset += len(doc.text)

    # Join once instead of growing a string inside the loop.
    merged_text = "".join(text_parts)
    return Document(
        id=docs[0].id,
        text=merged_text or None,
        sentences=sentences,
    )

# size : int
# The number of `sentences`.

Expand Down
9 changes: 6 additions & 3 deletions python/lum/clu/processors/sentence.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from __future__ import annotations
from pydantic import BaseModel, Field, model_validator
from pydantic import BaseModel, ConfigDict, Field, model_validator
from lum.clu.processors.directed_graph import DirectedGraph
from lum.clu.processors.utils import Labels
import typing
Expand All @@ -15,7 +15,10 @@ class Sentence(BaseModel):
"""
Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala)
"""
text: typing.Optional[str] = Field(default=None, description=" The text of the `Sentence`.")

model_config = ConfigDict(populate_by_name=True)

text: typing.Optional[str] = Field(default=None, description=" The text of the `Sentence`.", exclude=True)

raw: list[str] = Field(description="Raw tokens in this sentence; these are expected to match the original text")

Expand All @@ -36,7 +39,7 @@ class Sentence(BaseModel):
entities: typing.Optional[list[str]] = Field(default=None, description="A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.")

graphs: dict[str, DirectedGraph] = Field(description="A dictionary (str -> `lum.clu.processors.doc.DirectedGraph`) mapping the graph type/name to a `lum.clu.processors.doc.DirectedGraph`.")

@model_validator(mode="before")
@classmethod
def raw_or_words(cls, data: typing.Any) -> typing.Any:
Expand Down
Empty file.
1 change: 1 addition & 0 deletions python/lum/clu/processors/tests/data/example-1-part-0.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":null,"text":"I like turtles\n\n","sentences":[{"raw":["I","like","turtles"],"words":["I","like","turtles"],"start_offsets":[0,2,7],"end_offsets":[1,6,14],"tags":["PRP","VBP","NNS"],"lemmas":["i","like","turtle"],"norms":["","",""],"chunks":["B-NP","B-PP","B-NP"],"entities":["O","O","O"],"graphs":{"universal-enhanced":{"roots":[1],"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"}]},"universal-basic":{"roots":[1],"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"}]},"hybrid":{"roots":[1],"edges":[{"source":1,"destination":0,"relation":"nsubj"},{"source":1,"destination":2,"relation":"dobj"}]},"semantic-roles":{"roots":[0,1,2],"edges":[]},"enhanced-semantic-roles":{"roots":[0,1,2],"edges":[]}}}]}
1 change: 1 addition & 0 deletions python/lum/clu/processors/tests/data/example-1-part-1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":null,"text":"How about you?\n","sentences":[{"raw":["How","about","you","?"],"words":["How","about","you","?"],"start_offsets":[0,4,10,13],"end_offsets":[3,9,13,14],"tags":["WRB","IN","PRP","."],"lemmas":["how","about","you","?"],"norms":["","","",""],"chunks":["B-ADVP","B-PP","B-NP","O"],"entities":["O","O","O","O"],"graphs":{"universal-enhanced":{"roots":[0],"edges":[{"source":0,"destination":2,"relation":"nmod_about"},{"source":2,"destination":3,"relation":"punct"},{"source":2,"destination":1,"relation":"case"}]},"universal-basic":{"roots":[0],"edges":[{"source":2,"destination":1,"relation":"case"},{"source":0,"destination":2,"relation":"nmod"},{"source":2,"destination":3,"relation":"punct"}]},"hybrid":{"roots":[0],"edges":[{"source":0,"destination":2,"relation":"nmod_about"},{"source":2,"destination":3,"relation":"punct"},{"source":2,"destination":1,"relation":"case"}]},"semantic-roles":{"roots":[0,1,2,3],"edges":[]},"enhanced-semantic-roles":{"roots":[0,1,2,3],"edges":[]}}}]}
1 change: 1 addition & 0 deletions python/lum/clu/processors/tests/data/example-1-part-2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":null,"text":"I'm not sure","sentences":[{"raw":["I","'m","not","sure"],"words":["I","am","not","sure"],"start_offsets":[0,1,4,8],"end_offsets":[1,3,7,12],"tags":["PRP","VBP","RB","JJ"],"lemmas":["i","be","not","sure"],"norms":["","","",""],"chunks":["B-NP","B-VP","O","B-ADJP"],"entities":["O","O","O","O"],"graphs":{"universal-enhanced":{"roots":[3],"edges":[{"source":3,"destination":2,"relation":"neg"},{"source":3,"destination":1,"relation":"cop"},{"source":3,"destination":0,"relation":"nsubj"}]},"universal-basic":{"roots":[3],"edges":[{"source":3,"destination":0,"relation":"nsubj"},{"source":3,"destination":1,"relation":"cop"},{"source":3,"destination":2,"relation":"neg"}]},"hybrid":{"roots":[3],"edges":[{"source":3,"destination":2,"relation":"neg"},{"source":3,"destination":1,"relation":"cop"},{"source":3,"destination":0,"relation":"nsubj"}]},"semantic-roles":{"roots":[0,1,2,3],"edges":[]},"enhanced-semantic-roles":{"roots":[0,1,2,3],"edges":[]}}}]}
1 change: 1 addition & 0 deletions python/lum/clu/processors/tests/data/example-2-part-0.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":null,"text":"Malaga County Water District\n","sentences":[{"raw":["Malaga","County","Water","District"],"words":["Malaga","County","Water","District"],"start_offsets":[0,7,14,20],"end_offsets":[6,13,19,28],"tags":["NNP","NNP","NNP","NNP"],"lemmas":["malaga","county","water","district"],"norms":["","","",""],"chunks":["B-NP","I-NP","I-NP","I-NP"],"entities":["B-ORG","I-ORG","I-ORG","O"],"graphs":{"universal-enhanced":{"roots":[3],"edges":[{"source":3,"destination":2,"relation":"compound"},{"source":3,"destination":1,"relation":"compound"},{"source":3,"destination":0,"relation":"compound"}]},"universal-basic":{"roots":[3],"edges":[{"source":3,"destination":0,"relation":"compound"},{"source":3,"destination":1,"relation":"compound"},{"source":3,"destination":2,"relation":"compound"}]},"hybrid":{"roots":[3],"edges":[{"source":3,"destination":2,"relation":"compound"},{"source":3,"destination":1,"relation":"compound"},{"source":3,"destination":0,"relation":"compound"}]},"semantic-roles":{"roots":[0,1,2,3],"edges":[]},"enhanced-semantic-roles":{"roots":[0,1,2,3],"edges":[]}}}]}
1 change: 1 addition & 0 deletions python/lum/clu/processors/tests/data/example-2-part-1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":null,"text":"WWTP N Reduction\n","sentences":[{"raw":["WWTP","N","Reduction"],"words":["WWTP","N","Reduction"],"start_offsets":[0,5,7],"end_offsets":[4,6,16],"tags":["NNP","NNP","NNP"],"lemmas":["wwtp","n","reduction"],"norms":["","",""],"chunks":["B-NP","I-NP","I-NP"],"entities":["B-ORG","I-ORG","I-ORG"],"graphs":{"universal-enhanced":{"roots":[2],"edges":[{"source":2,"destination":1,"relation":"compound"},{"source":2,"destination":0,"relation":"compound"}]},"universal-basic":{"roots":[2],"edges":[{"source":2,"destination":0,"relation":"compound"},{"source":2,"destination":1,"relation":"compound"}]},"hybrid":{"roots":[2],"edges":[{"source":2,"destination":1,"relation":"compound"},{"source":2,"destination":0,"relation":"compound"},{"source":2,"destination":1,"relation":"A1"}]},"semantic-roles":{"roots":[0,2],"edges":[{"source":2,"destination":1,"relation":"A1"}]},"enhanced-semantic-roles":{"roots":[0,2],"edges":[{"source":2,"destination":1,"relation":"A1"}]}}}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":null,"text":"origin in consideration for an award.\n","sentences":[{"raw":["origin","in","consideration","for","an","award","."],"words":["origin","in","consideration","for","an","award","."],"start_offsets":[0,7,10,24,28,31,36],"end_offsets":[6,9,23,27,30,36,37],"tags":["NN","IN","NN","IN","DT","NN","."],"lemmas":["origin","in","consideration","for","an","award","."],"norms":["","","","","","",""],"chunks":["I-NP","B-PP","B-NP","B-PP","B-NP","I-NP","O"],"entities":["O","O","O","O","O","O","O"],"graphs":{"universal-enhanced":{"roots":[0],"edges":[{"source":0,"destination":2,"relation":"nmod_in"},{"source":0,"destination":6,"relation":"punct"},{"source":2,"destination":1,"relation":"case"},{"source":2,"destination":5,"relation":"nmod_for"},{"source":5,"destination":3,"relation":"case"},{"source":5,"destination":4,"relation":"det"}]},"universal-basic":{"roots":[0],"edges":[{"source":2,"destination":1,"relation":"case"},{"source":0,"destination":2,"relation":"nmod"},{"source":5,"destination":3,"relation":"case"},{"source":5,"destination":4,"relation":"det"},{"source":2,"destination":5,"relation":"nmod"},{"source":0,"destination":6,"relation":"punct"}]},"hybrid":{"roots":[0],"edges":[{"source":0,"destination":2,"relation":"nmod_in"},{"source":0,"destination":6,"relation":"punct"},{"source":2,"destination":1,"relation":"case"},{"source":2,"destination":5,"relation":"nmod_for"},{"source":5,"destination":3,"relation":"case"},{"source":5,"destination":4,"relation":"det"},{"source":2,"destination":5,"relation":"Ax_for"},{"source":2,"destination":0,"relation":"A1"}]},"semantic-roles":{"roots":[5,1,6,2,4],"edges":[{"source":2,"destination":0,"relation":"A1"},{"source":2,"destination":3,"relation":"Ax"}]},"enhanced-semantic-roles":{"roots":[1,6,2,3,4],"edges":[{"source":2,"destination":5,"relation":"Ax_for"},{"source":2,"destination":0,"relation":"A1"}]}}}]}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":null,"text":"In general, the Work consists of Demolish of DAF basin equipment, furnish and installation of\n","sentences":[{"raw":["In","general",",","the","Work","consists","of","Demolish","of","DAF","basin","equipment",",","furnish","and","installation","of"],"words":["In","general",",","the","Work","consists","of","Demolish","of","DAF","basin","equipment",",","furnish","and","installation","of"],"start_offsets":[0,3,10,12,16,21,30,33,42,45,49,55,64,66,74,78,91],"end_offsets":[2,10,11,15,20,29,32,41,44,48,54,64,65,73,77,90,93],"tags":["IN","JJ",",","DT","NN","VBZ","IN","NNP","IN","NNP","NN","NN",",","VBP","CC","NN","IN"],"lemmas":["in","general",",","the","work","consist","of","demolish","of","daf","basin","equipment",",","furnish","and","installation","of"],"norms":["","","","","","","","","","","","","","","","",""],"chunks":["B-PP","B-ADJP","O","B-NP","I-NP","B-VP","B-PP","B-NP","B-PP","B-NP","I-NP","I-NP","O","B-NP","I-NP","I-NP","B-PP"],"entities":["O","O","O","O","O","O","O","O","O","B-ORG","O","O","O","O","O","O","O"],"graphs":{"universal-enhanced":{"roots":[16,5],"edges":[{"source":1,"destination":0,"relation":"case"},{"source":4,"destination":3,"relation":"det"},{"source":5,"destination":1,"relation":"nmod_in"},{"source":5,"destination":4,"relation":"nsubj"},{"source":5,"destination":13,"relation":"nmod_of"},{"source":5,"destination":7,"relation":"nmod_of"},{"source":5,"destination":2,"relation":"punct"},{"source":7,"destination":13,"relation":"conj"},{"source":7,"destination":12,"relation":"punct"},{"source":7,"destination":6,"relation":"case"},{"source":7,"destination":11,"relation":"nmod_of"},{"source":11,"destination":9,"relation":"compound"},{"source":11,"destination":8,"relation":"case"},{"source":11,"destination":10,"relation":"compound"},{"source":13,"destination":15,"relation":"conj_and"},{"source":13,"destination":14,"relation":"cc"}]},"universal-basic":{"roots":[5],"edges":[{"source":1,"destination":0,"relation":"case"},{"source":5,"desti
nation":1,"relation":"nmod"},{"source":5,"destination":2,"relation":"punct"},{"source":4,"destination":3,"relation":"det"},{"source":5,"destination":4,"relation":"nsubj"},{"source":7,"destination":6,"relation":"case"},{"source":5,"destination":7,"relation":"nmod"},{"source":11,"destination":8,"relation":"case"},{"source":11,"destination":9,"relation":"compound"},{"source":11,"destination":10,"relation":"compound"},{"source":7,"destination":11,"relation":"nmod"},{"source":7,"destination":12,"relation":"punct"},{"source":7,"destination":13,"relation":"conj"},{"source":13,"destination":14,"relation":"cc"},{"source":13,"destination":15,"relation":"conj"},{"source":13,"destination":16,"relation":"nmod"}]},"hybrid":{"roots":[5],"edges":[{"source":1,"destination":0,"relation":"case"},{"source":4,"destination":3,"relation":"det"},{"source":5,"destination":1,"relation":"nmod_in"},{"source":5,"destination":4,"relation":"nsubj"},{"source":5,"destination":13,"relation":"nmod_of"},{"source":5,"destination":7,"relation":"nmod_of"},{"source":5,"destination":2,"relation":"punct"},{"source":7,"destination":13,"relation":"conj"},{"source":7,"destination":12,"relation":"punct"},{"source":7,"destination":6,"relation":"case"},{"source":7,"destination":11,"relation":"nmod_of"},{"source":11,"destination":9,"relation":"compound"},{"source":11,"destination":8,"relation":"case"},{"source":11,"destination":10,"relation":"compound"},{"source":13,"destination":15,"relation":"conj_and"},{"source":13,"destination":14,"relation":"cc"},{"source":5,"destination":7,"relation":"Ax_of"},{"source":5,"destination":4,"relation":"A1"},{"source":5,"destination":13,"relation":"Ax_of"},{"source":7,"destination":4,"relation":"A0"},{"source":11,"destination":10,"relation":"A1"},{"source":13,"destination":16,"relation":"A1"},{"source":13,"destination":4,"relation":"A0"},{"source":15,"destination":16,"relation":"A1"},{"source":15,"destination":4,"relation":"A0"}]},"semantic-roles":{"roots":[0,5,14,1,9,13,2,12,7,3
,11,8,15],"edges":[{"source":5,"destination":4,"relation":"A1"},{"source":5,"destination":6,"relation":"Ax"},{"source":11,"destination":10,"relation":"A1"},{"source":13,"destination":4,"relation":"A0"},{"source":15,"destination":16,"relation":"A1"}]},"enhanced-semantic-roles":{"roots":[0,5,14,1,6,9,2,12,3,11,8,15],"edges":[{"source":5,"destination":7,"relation":"Ax_of"},{"source":5,"destination":4,"relation":"A1"},{"source":5,"destination":13,"relation":"Ax_of"},{"source":7,"destination":4,"relation":"A0"},{"source":11,"destination":10,"relation":"A1"},{"source":13,"destination":16,"relation":"A1"},{"source":13,"destination":4,"relation":"A0"},{"source":15,"destination":16,"relation":"A1"},{"source":15,"destination":4,"relation":"A0"}]}}}]}
Loading

0 comments on commit 11970d5

Please sign in to comment.