Fixed some types of recursive derivations.
timarkh committed Dec 19, 2022
1 parent 1c43966 commit adc246e
Showing 8 changed files with 120 additions and 27 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = uniparser-morph
version = 2.7.2
version = 2.7.3
author = Timofey Arkhangelskiy
author_email = timarkh@gmail.com
description = Rule-based, linguist-friendly (and rather slow) morphological analysis
28 changes: 27 additions & 1 deletion tests/derivations.txt
@@ -6,10 +6,36 @@
keep_lex_data: brackets
gloss: PV


-deriv-type: V-nyt
lex: <0>ныт[.]ын
stem: ныт[.].
regex-stem: т.*
gramm: +pv,pv_ny
gloss: PV



-deriv-type: V-se
lex: <0>[.]se
stem: [.]se.
paradigm: der_test_adv
gramm: adv,ptcp
gloss: PTCP
id: septcp

-deriv-type: A-mi
lex: [.]+mï
stem: [.]mï.
paradigm: der_test_noun
gramm: n
gloss: NMLZ
id: minmlz

-deriv-type: V-semi
lex: [.]+semï
stem: [.]se&mï.
paradigm: der_test_noun
gramm: n
gloss: PTCP&NMLZ
id: septcp,minmlz
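
The three new derivation types form a chain: V-se turns a verb into a participle/adverb, A-mi nominalizes an adverb, and V-semi applies both steps at once (the & in its stem and gloss separates the two morphemes). A minimal sketch of loading these test files and exercising the chain, assuming the Analyzer configuration attributes (lexFile, paradigmFile, derivFile) documented for uniparser-morph:

from uniparser_morph import Analyzer

a = Analyzer()
# Assumption: these attribute names follow uniparser-morph's documented
# configuration; the paths point at the test files changed in this commit.
a.lexFile = 'tests/lexemes.txt'
a.paradigmFile = 'tests/paradigms.txt'
a.derivFile = 'tests/derivations.txt'
a.load_grammar()

# yarikasemï exercises the recursion: V-se first, then A-mi.
for w in ['yarika', 'yarikase', 'yarikasemï']:
    print(a.analyze_words(w))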

12 changes: 11 additions & 1 deletion tests/import_test.py
@@ -54,7 +54,7 @@
analyses = a.analyze_words('юртъёсаз', format='conll')
print(analyses)

# Test derivations
# Test simple derivations
analyses = a.analyze_words('тулы')
print(analyses)
analyses = a.analyze_words('ныттулы')
@@ -64,6 +64,16 @@
analyses = a.analyze_words('ныууыныс')
print(analyses)

# Test recursive derivations
for w in [
    "yarika",      # bare verb
    "yarikase",    # adverbialized verb
    "tatune",      # bare adverb
    "tatunemï",    # nominalized adverb
    "yarikasemï"   # adverbialized, then nominalized verb
]:
    print(a.analyze_words(w))

# Test sentences and complex structures
analyses = a.analyze_words(['Морфологиез', [['А'], ['Мон', 'тонэ', 'яратӥсько', '.']]], format='xml')
print(analyses)
17 changes: 17 additions & 0 deletions tests/lexemes.txt
@@ -238812,3 +238812,20 @@
trans_ru: видеть
trans_en: see
gramm: V,tr

-lexeme
lex: yarika
stem: .yarika.
paradigm: der_test_verb
paradigm: der_test_v_nmlz
trans_en: laugh
gloss: laugh
gramm: v

-lexeme
lex: tatune
stem: .tatune.
paradigm: der_test_adv
trans_en: warm
gloss: warm
gramm: adv
21 changes: 20 additions & 1 deletion tests/paradigms.txt
@@ -5881,4 +5881,23 @@
-deriv-link: V-nyw
recurs_class: 1
-deriv-link: V-nyt
recurs_class: 1

-paradigm: der_test_verb
-flex: .
gramm:

-paradigm: der_test_noun
-flex: .
gramm:

-paradigm: der_test_adv
-flex: .
gramm:
deriv-link: A-mi

-paradigm: der_test_v_nmlz
-flex: .
gramm:
deriv-link: V-se
deriv-link: V-semi
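
These test paradigms encode the recursion: der_test_adv links to A-mi, so anything that lands in der_test_adv (including a V-se derivative of a verb) can be nominalized further, while der_test_v_nmlz offers both the two-step route (V-se, then A-mi) and the fused V-semi. A toy walk over the link structure, assuming plain dicts rather than uniparser_morph's Paradigm/Derivation classes:

# Toy model of the deriv-links above; output_paradigm maps each derivation
# to the paradigm of its result, per tests/derivations.txt.
links = {
    'der_test_verb': [],
    'der_test_noun': [],
    'der_test_adv': ['A-mi'],
    'der_test_v_nmlz': ['V-se', 'V-semi'],
}
output_paradigm = {'V-se': 'der_test_adv', 'A-mi': 'der_test_noun',
                   'V-semi': 'der_test_noun'}

def chains(paradigm, prefix=()):
    # Enumerate every derivation chain reachable from a paradigm.
    result = [prefix] if prefix else []
    for d in links[paradigm]:
        result += chains(output_paradigm[d], prefix + (d,))
    return result

print(chains('der_test_v_nmlz'))
# [('V-se',), ('V-se', 'A-mi'), ('V-semi',)]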
14 changes: 9 additions & 5 deletions uniparser_morph/grammar.py
@@ -178,8 +178,10 @@ def load_lex_rules(self, fnames):
return len(self.lexRulesByLemma) + len(self.lexRulesByStem)

def load_clitics(self, fnames):
"""Load clitics from the file or files specified by fnames.
Return the number of lexemes loaded."""
"""
Load clitics from the file or files specified by fnames.
Return the number of lexemes loaded.
"""
clDescrs = self.load_yaml_descrs(fnames)
for dictDescr in clDescrs:
if dictDescr is None or len(dictDescr) <= 0:
@@ -192,14 +194,16 @@ def load_clitics(self, fnames):
return len(self.clitics)

def load_derivations(self, fnames, compileDerivs=False):
"""Load derivations from the file or files specified by fnames.
Return the number of derivations loaded."""
"""
Load derivations from the file or files specified by fnames.
Return the number of derivations loaded.
"""
derivDescrs = self.load_yaml_descrs(fnames)
for dictDescr in derivDescrs:
dictDescr['value'] = '#deriv#' + dictDescr['value']
try:
self.derivations[dictDescr['value']] =\
Derivation(self, dictDescr, self.errorHandler)
Derivation(self, dictDescr, self.errorHandler)
except MemoryError:
self.raise_error('Not enough memory for the derivations.')
return
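
Derivation names are stored under a '#deriv#' prefix, which keeps them in a separate namespace from ordinary paradigm names. A minimal sketch of the keying convention, assuming a plain dict in place of the Grammar.derivations attribute:

derivations = {}
for name in ('V-se', 'A-mi', 'V-semi'):
    # Prefixed keys keep derivations from shadowing paradigm names.
    derivations['#deriv#' + name] = 'Derivation placeholder'

# A 'deriv-link: V-se' line in paradigms.txt resolves to the prefixed key:
assert '#deriv#V-se' in derivations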
36 changes: 24 additions & 12 deletions uniparser_morph/paradigm.py
@@ -105,7 +105,7 @@ def __init__(self, g, dictDescr, errorHandler=None):
self.otherDataBracketR = '' # if keepOtherData == True, append this to the non-essential
# values copied from lexeme (e.g. to translations)
self.otherData = []
self.lemmaChanger = None # an inflexion object which changes the lemma
self.lemmaChangers = []  # inflexion objects which change the lemma
self.startWithSelf = False # if true, start with the inflexion when joining
# itself to a stem or to a previous inflexion
try:
@@ -225,8 +225,9 @@ def add_lemma_changer(self, obj):
self.raise_error('Wrong lemma in ' + self.flex + ': ', newLemma)
return
dictDescr = {'name': 'flex', 'value': newLemma, 'content': []}
self.lemmaChanger = Inflexion(self.g, dictDescr, self.errorHandler)
self.lemmaChanger.startWithSelf = True
lemmaChanger = Inflexion(self.g, dictDescr, self.errorHandler)
lemmaChanger.startWithSelf = True
self.lemmaChangers.append(lemmaChanger)

def remove_stem_number(self):
flex = self.flex
@@ -771,11 +772,12 @@ def init_derivation(self, data):
for stem, gloss, gramm in zip(stems, glosses, gramms):
for stemVar in stem.split('//'):
stemVar = re.sub('\\.(?!\\])', '<.>', stemVar)
stemVar = stemVar.replace('[.]', '.')
# Different conventions for morphemes in stems and inflexions:
stemVar = stemVar.replace('[.]', '.').replace('&', '|')
bReplaceGrammar = True
arrContent = copy.deepcopy(newData)
if len(gloss) > 0:
arrContent.append({'name': 'gloss', 'value': gloss})
arrContent.append({'name': 'gloss', 'value': gloss.replace('&', '|')})
if gramm.startswith('+') or len(gramm) <= 0:
bReplaceGrammar = False
gramm = gramm[1:]
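
The added replace('&', '|') is the substantive change in this hunk: derivation stems write '&' between morphemes, while compiled inflexions expect '|', just as '[.]' stands for a literal dot and a bare dot marks the stem slot (rewritten to '<.>'). A self-contained sketch of the conversion, assuming it can be read in isolation from the surrounding method:

import re

def convert_stem(stemVar):
    # A bare dot is the stem slot and becomes '<.>'; '[.]' is a literal dot;
    # '&' between morphemes becomes '|' in the inflexion convention.
    stemVar = re.sub('\\.(?!\\])', '<.>', stemVar)
    return stemVar.replace('[.]', '.').replace('&', '|')

print(convert_stem('[.]se&mï.'))  # .se|mï<.>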
@@ -991,7 +993,11 @@ def extend_one(self, flexL):
return extensions

@classmethod
def join_inflexions(cls, flexL, flexR, paradigmLink, partialCompile=True):
def join_inflexions(cls,
                    flexL: Inflexion,
                    flexR: Inflexion,
                    paradigmLink: ParadigmLink = None,
                    partialCompile=True):
# print(flexL.flex, flexR.flex)
if not cls.stem_numbers_agree(flexL, flexR):
return None
@@ -1006,13 +1012,13 @@ def join_inflexions(cls, flexL, flexR, paradigmLink, partialCompile=True):
flexL.copy_std()

# Manage links to the subsequent paradigms:
if paradigmLink.position != POS_UNSPECIFIED:
if paradigmLink is not None and paradigmLink.position != POS_UNSPECIFIED:
flexL.position = paradigmLink.position
else:
flexL.position = flexR.position
if paradigmLink.position == POS_FINAL:
if paradigmLink is not None and paradigmLink.position == POS_FINAL:
flexL.make_final()
elif len(paradigmLink.subsequent) > 0:
elif paradigmLink is not None and len(paradigmLink.subsequent) > 0:
flexL.subsequent = paradigmLink.subsequent
else:
flexL.subsequent = flexR.subsequent
@@ -1050,10 +1056,13 @@ def join_inflexions(cls, flexL, flexR, paradigmLink, partialCompile=True):
flexR.flexStdObj,
paradigmLink,
partialCompile)

# Join lemma changers
flexL.lemmaChangers += flexR.lemmaChangers
return flexL

@classmethod
def join_other_data(cls, flexL, flexR):
def join_other_data(cls, flexL: Inflexion, flexR: Inflexion):
"""
Add otherData values from flexR to flexL.
"""
@@ -1202,8 +1211,11 @@ def join_inflexion_parts(cls, flexPartsL, flexPartsR):
fpOldL = flexPartsL[-1][1:]
else:
fpOldL = flexPartsL[-1]
if fpOldL[0].flex != '<.>':
fpOldL.insert(0, InflexionPart('<.>', '<.>', GLOSS_NEXT_FLEX))
# if fpOldL[0].flex != '<.>':
# fpOldL.insert(0, InflexionPart('<.>', '<.>', GLOSS_NEXT_FLEX))
# TODO: Deal with GLOSS_STARTWITHSELF in a definitive way.
# TODO: Probably abandon it in favor of uniform handling of stems
# TODO: in paradigms and derivations.
fpNew = [InflexionPart('', '', GLOSS_STARTWITHSELF)]
else:
fpOldR = flexPartsR[0]
17 changes: 11 additions & 6 deletions uniparser_morph/wordform.py
@@ -2,6 +2,7 @@
import re
import xml.sax.saxutils
from .common_functions import wfPropertyFields, check_compatibility, join_stem_flex
from .paradigm import Paradigm, ParadigmLink


class Wordform:
@@ -58,28 +59,32 @@ def raise_error(self, message, data=None):
self.errorHandler.raise_error(message, data)

def add_lemma(self, lex, flex):
if flex.lemmaChanger is None:
if len(flex.lemmaChangers) <= 0:
if self.lemma == "":
self.lemma = lex.lemma
elif lex.lemma != "":
self.lemma += "+"+lex.lemma
return
unifiedLC = copy.deepcopy(flex.lemmaChangers[0])
for i in range(1, len(flex.lemmaChangers)):
    lc = flex.lemmaChangers[i]
    Paradigm.join_inflexions(unifiedLC, lc)
suitableSubLex = [sl for sl in lex.subLexemes
if flex.lemmaChanger.stemNum is None or
len(sl.numStem & flex.lemmaChanger.stemNum) > 0]
if unifiedLC.stemNum is None
or len(sl.numStem & unifiedLC.stemNum) > 0]
if len(suitableSubLex) <= 0:
if lex.num_stems() == 1:
suitableSubLex = lex.subLexemes
if len(suitableSubLex) <= 0:
self.raise_error('No stems available to create the new lemma ' +
flex.lemmaChanger.flex)
unifiedLC.flex)
self.lemma = ''
return
if len(suitableSubLex) > 1:
if self.verbosity > 0:
self.raise_error('Several stems available to create the new lemma ' +
flex.lemmaChanger.flex)
wfLemma = Wordform(self.g, suitableSubLex[0], flex.lemmaChanger,
unifiedLC.flex)
wfLemma = Wordform(self.g, suitableSubLex[0], unifiedLC,
self.errorHandler)
self.lemma = wfLemma.wf
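
Because recursive derivations can attach one lemma changer per derivation step, add_lemma now folds the whole list into a single changer: deep-copy the first so the shared Inflexion is not mutated, then merge the rest in order with Paradigm.join_inflexions. A toy model of that fold, with strings and a composition lambda standing in for Inflexion objects and the real join (an assumption, for illustration only):

import copy

def unify_lemma_changers(changers, join):
    # Mirrors the loop in add_lemma: copy the first changer, then merge
    # each remaining changer into it, left to right.
    unified = copy.deepcopy(changers[0])
    for lc in changers[1:]:
        unified = join(unified, lc)
    return unified

print(unify_lemma_changers(['[.]se', '[.]mï'], lambda a, b: a + ' ∘ ' + b))
# [.]se ∘ [.]mï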

