Merge pull request oracc#41 from oracc/master
Adding new changes from rillian.
jayanthkmr authored Jun 9, 2019
2 parents 0bae693 + c32960b commit 8d00aa7
Showing 8 changed files with 104 additions and 101 deletions.
14 changes: 4 additions & 10 deletions .travis.yml
@@ -10,16 +10,11 @@ matrix:
   - os: linux
     sudo: false
     python: 2.7
-    env: MYPYTHON=jython - JYTHON_URL="http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.0/jython-installer-2.7.0.jar"
-  - os: linux
-    sudo: false
-    python: 2.7
-    env: MYPYTHON=jython - JYTHON_URL="http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1b3/jython-installer-2.7.1b3.jar"
+    env: MYPYTHON=jython - JYTHON_URL="https://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar"
   - os: linux
     sudo: false
     python: 3.5
   - os: osx
     osx_image: xcode7.3
     language: generic
     env: PIP=pip3 MYPYTHON=python3
 env:
@@ -56,14 +51,13 @@ install:
   # Install dependencies
   $PIP install pip
   if [ "$MYPYTHON" == "jython" ]; then
-    $PIP install pip==8.1.2
-    $PIP install pytest==2.9.2
+    $PIP install pytest
   fi
   $PIP install wheel
   $PIP install setuptools
   $PIP install ply pep8 mako
   if [ "$MYPYTHON" != "jython" ]; then
-    $PIP install pytest pytest-cov codecov
+    $PIP install --upgrade pytest pytest-cov codecov
   fi
 script:
@@ -75,7 +69,7 @@ script:
   if [ "$MYPYTHON" == "jython" ]; then
     py.test
   else
-    py.test --cov=pyoracc --cov-report xml --cov-report html --runslow
+    pytest --cov=pyoracc
   fi
 - pep8 --exclude=parsetab.py .
2 changes: 1 addition & 1 deletion README.md
@@ -128,7 +128,7 @@ Options:

 Before running pytest and coverage, install [py.test](https://docs.pytest.org/en/latest/getting-started.html) and [pytest-cov](https://pypi.org/project/pytest-cov/).

-    $ py.test --cov=pyoracc --cov-report xml --cov-report html --cov-report annotate --runslow
+    $ py.test --cov=pyoracc --cov-report xml --cov-report html --cov-report annotate

 Before running pycodestyle, install [pycodestyle](https://pypi.org/project/pycodestyle/).

26 changes: 0 additions & 26 deletions conftest.py

This file was deleted.

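(This deleted conftest.py is presumably where the custom --runslow option was defined, which would explain why that flag also disappears from .travis.yml and the README above.)
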
28 changes: 28 additions & 0 deletions pyoracc/atf/common/atffile.py
@@ -20,6 +20,8 @@
 import codecs
 import sys
 import logging
+import json
+from numbers import Number

 from pyoracc.atf.cdli.atflex import AtfCDLILexer
 from pyoracc.atf.cdli.atfyacc import AtfCDLIParser
@@ -72,6 +74,32 @@ def __str__(self):
     def serialize(self):
         return AtfFile.template.render_unicode(**vars(self))

+    def to_json(self, skip_empty=True, **kwargs):
+        '''Return a JSON representation of the parsed file.
+        The optional skip_empty argument determines whether keys
+        with empty values are included in the output. Set it to
+        False to see all possible object members.
+        Otherwise it accepts the same optional arguments as
+        json.dumps().'''
+        def _make_serializable(obj):
+            '''Construct a dict representation of an object.
+            This is necessary to handle our custom objects
+            which json.JSONEncoder doesn't know how to
+            serialize.'''
+
+            return {k: v
+                    for k, v in vars(obj).items()
+                    if not str(k).startswith('_') and not (
+                        skip_empty and not v and not isinstance(v, Number)
+                    )}
+
+        kwargs.setdefault('indent', 2)
+        kwargs.setdefault('default', _make_serializable)
+        return json.dumps(self.text, **kwargs)
+
+
 def check_atf(infile, atftype, verbose=False):
     content = codecs.open(infile,
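
For context, a minimal usage sketch of the new method (assuming, as the tests below do, that AtfFile accepts the raw ATF text; the file path here is hypothetical):

    import codecs
    from pyoracc.atf.common.atffile import AtfFile

    # Read an ATF source file (hypothetical path; any parseable ATF works).
    content = codecs.open('belsunu.atf', encoding='utf-8-sig').read()
    atf = AtfFile(content)

    # Default rendering: indent=2, members with empty values skipped.
    print(atf.to_json())

    # Keep empty members; extra keyword arguments pass through to json.dumps().
    print(atf.to_json(skip_empty=False, sort_keys=True))
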
70 changes: 35 additions & 35 deletions pyoracc/atf/common/atflex.py
@@ -69,36 +69,36 @@ def resolve_keyword(self, value, source, fallback=None, extra=None):

     states = AtfLexicon.STATES

-    t_AMPERSAND = "\&"
-    t_HASH = "\#"
-    t_EXCLAIM = "\!"
-    t_QUERY = "\?"
-    t_STAR = "\*"
-    t_DOLLAR = "\$"
-    t_MINUS = "\-"
-    t_FROM = "\<\<"
-    t_TO = "\>\>"
-    t_COMMA = "\,"
-    t_PARBAR = "\|\|"
-
-    t_INITIAL_transctrl_PARENTHETICALID = "\([^\n\r]*\)"
+    t_AMPERSAND = r'\&'
+    t_HASH = r'\#'
+    t_EXCLAIM = r'\!'
+    t_QUERY = r'\?'
+    t_STAR = r'\*'
+    t_DOLLAR = r'\$'
+    t_MINUS = r'\-'
+    t_FROM = r'\<\<'
+    t_TO = r'\>\>'
+    t_COMMA = r'\,'
+    t_PARBAR = r'\|\|'
+
+    t_INITIAL_transctrl_PARENTHETICALID = r'\([^\n\r]*\)'

     def t_INITIAL_transctrl_WHITESPACE(self, t):
         r'[\t ]+'
         # NO TOKEN

     def t_MULTILINGUAL(self, t):
-        "\=\="
+        r'\=\='
         t.lexer.push_state("text")
         return t

     def t_EQUALBRACE(self, t):
-        "^\=\{"
+        r'^\=\{'
         t.lexer.push_state('text')
         return t

     def t_EQUALS(self, t):
-        "\="
+        r'\='
         t.lexer.push_state('flagged')
         return t

@@ -121,7 +121,7 @@ def t_NEWLINE(self, t):
         return t

     def t_INITIAL_parallel_labeled_ATID(self, t):
-        '^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?'
+        r'^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?'
         t.value = t.value[1:]
         t.lexpos += 1
         t.type = self.resolve_keyword(t.value,
@@ -171,13 +171,13 @@ def t_INITIAL_parallel_labeled_ATID(self, t):
         return t

     def t_labeled_OPENR(self, t):
-        "\@\("
+        r'\@\('
         t.lexer.push_state("para")
         t.lexer.push_state("transctrl")
         return t

     def t_INITIAL_parallel_labeled_HASHID(self, t):
-        '\#[a-zA-Z][a-zA-Z0-9\[\]]+\:'
+        r'\#[a-zA-Z][a-zA-Z0-9\[\]]+\:'
         # Note that \:? absorbs a trailing colon in protocol keywords
         t.value = t.value[1:-1]
         t.lexpos += 1
@@ -303,22 +303,22 @@ def t_transctrl_ID(self, t):
             t.type = "REFERENCE"
         return t

-    t_parallel_QUERY = "\?"
+    t_parallel_QUERY = r'\?'

     def t_parallel_LINELABEL(self, t):
         r'^([^\.\ \t]*)\.[\ \t]*'
         t.value = t.value.strip(" \t.")
         return t

     def t_parallel_labeled_DOLLAR(self, t):
-        "^\$"
+        r'^\$'
         t.lexer.push_state("absorb")
         return t

-    t_transctrl_MINUS = "\-\ "
+    t_transctrl_MINUS = r'\-\ '

     def t_transctrl_CLOSER(self, t):
-        "\)"
+        r'\)'
         t.lexer.pop_state()
         return t

@@ -352,8 +352,8 @@ def t_labeled_NEWLINE(self, t):
     # fact that the first character may not be a ?
     # We are looking for a string that does not start with ? it may include
     # newlines if they are followed by a whitespace.
-    translation_regex1 = '([^\?\^\n\r]|([\n\r](?=[ \t])))'
-    translation_regex2 = '([^\^\n\r]|([\n\r](?=[ \t])))*'
+    translation_regex1 = r'([^\?\^\n\r]|([\n\r](?=[ \t])))'
+    translation_regex2 = r'([^\^\n\r]|([\n\r](?=[ \t])))*'
     translation_regex = white + translation_regex1 + translation_regex2 + white

     @lex.TOKEN(translation_regex)
@@ -399,12 +399,12 @@ def t_flagged_ID(self, t):
         t.value = t.value.strip()
         return t

-    t_flagged_HASH = "\#"
-    t_flagged_EXCLAIM = "\!"
-    t_flagged_QUERY = "\?"
-    t_flagged_STAR = "\*"
-    t_flagged_parallel_para_HAT = "[\ \t]*\^[\ \t]*"
-    t_flagged_EQUALS = "\="
+    t_flagged_HASH = r'\#'
+    t_flagged_EXCLAIM = r'\!'
+    t_flagged_QUERY = r'\?'
+    t_flagged_STAR = r'\*'
+    t_flagged_parallel_para_HAT = r'[\ \t]*\^[\ \t]*'
+    t_flagged_EQUALS = r'\='
     # --- Rules for paragaph state----------------------------------
     # Free text, ended by double new line

@@ -441,11 +441,11 @@ def t_para_MAGICNEWLINE(self, t):
     # --- RULES FOR THE nonequals STATE -----
     # Absorb everything except an equals
     def t_nonequals_ID(self, t):
-        "[^\=\n\r]+"
+        r'[^\=\n\r]+'
         t.value = t.value.strip()
         return t

-    t_nonequals_EQUALS = "\="
+    t_nonequals_EQUALS = r'\='

     # --- RULES FOR THE absorb STATE -----
     # Absorb everything
@@ -455,14 +455,14 @@ def t_absorb_ID(self, t):
         t.value = t.value.strip()
         return t
     # --- RULES FOR THE text STATE ----
-    t_text_ID = "[^\ \t \n\r]+"
+    t_text_ID = r'[^\ \t \n\r]+'

     def t_text_SPACE(self, t):
         r'[\ \t]'
         # No token generated

     # --- RULES FOR THE lemmatize STATE
-    t_lemmatize_ID = "[^\;\n\r]+"
+    t_lemmatize_ID = r'[^\;\n\r]+'
     t_lemmatize_SEMICOLON = r'\;[\ \t]*'

     # Error handling rule
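
A note on the sweep above: it converts every PLY token pattern from a plain string to a raw string. Sequences like \& or \= are not valid Python string escapes; older Pythons silently pass them through to the regex engine, but Python 3.6+ emits a DeprecationWarning for them (and later versions make them errors), while raw strings hand the backslash over verbatim. A self-contained toy lexer (not pyoracc's, just an illustration of the PLY convention that token rules take their regex from a string attribute or a function docstring):

    import ply.lex as lex

    tokens = ('EQUALS', 'ID')

    # Raw strings keep the backslash literal instead of relying on
    # Python's lenient handling of unknown escape sequences.
    t_EQUALS = r'\='
    t_ID = r'[A-Za-z]+'
    t_ignore = ' \t'

    def t_error(t):
        t.lexer.skip(1)

    lexer = lex.lex()
    lexer.input('x = y')
    print([(tok.type, tok.value) for tok in lexer])
    # [('ID', 'x'), ('EQUALS', '='), ('ID', 'y')]
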
6 changes: 3 additions & 3 deletions pyoracc/model/corpus.py
@@ -33,10 +33,10 @@ def __init__(self, **kwargs):
         self.source = kwargs['source']
         if 'source' in kwargs:
             for dirpath, _, files in os.walk(self.source):
-                for file in files:
-                    if file.endswith('.atf'):
+                for filename in files:
+                    if filename.endswith('.atf'):
                         try:
-                            path = os.path.join(dirpath, file)
+                            path = os.path.join(dirpath, filename)
                             print("Parsing file", path, "... ", end="")
                             content = codecs.open(path,
                                                   encoding='utf-8-sig').read()
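
A note on the rename: file is the name of a built-in type in Python 2, so using filename for the loop variable avoids shadowing the built-in and reads more clearly.
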
47 changes: 31 additions & 16 deletions pyoracc/test/atf/test_atffile.py
@@ -20,6 +20,8 @@

 from pyoracc.atf.common.atffile import AtfFile
 from ..fixtures import anzu, belsunu, sample_file
+import pytest
+import json


 def test_create():
@@ -88,36 +90,49 @@ def test_composite():
 ]


-def consider_composite(name, code):
+@pytest.mark.parametrize('name, code', [
+    (text[0], text[1]) for text in composites])
+def test_composite_code(name, code):
     """
-    Parses ATF and checks CDLI ID coincides
+    Parses ATF and checks CDLI ID coincides.
     """
     afile = AtfFile(sample_file(name))
     assert afile.text.texts[0].code == code


-def consider_file(name, code, description):
+@pytest.mark.parametrize('name, code, description', [
+    (text[0], text[1], text[2]) for text in texts])
+def test_text_designation(name, code, description):
     """
-    Parses ATF and checks CDLI ID and text description coincide
+    Parses ATF and checks CDLI ID and text description coincide.
     """
     afile = AtfFile(sample_file(name))
     assert afile.text.code == code
     assert afile.text.description == description


-def test_texts():
-    """"
-    Go through list of selected filenames and check parser deals non-composite
-    files.
-    """
-    for text in texts:
-        yield consider_file, text[0], text[1], text[2]
+# ATF filenames which fail the serialization tests.
+_xfail_texts = [
+    # Multilingual objects store the unmarked language
+    # under the `None` key in their `lines` dictionary,
+    # which is incompatible with `sort_keys=True`.
+    'bb_2_6',
+]


-def test_composites():
+@pytest.mark.parametrize('name', [
+    name if name not in _xfail_texts
+    else pytest.param(name, marks=[pytest.mark.xfail()])
+    for name in [text[0] for text in texts]])
+def test_json_serialization(name):
     """
-    Go through list of selected composites and check parser deals with
-    composite files correctly
+    Parses ATF and verifies the to_json() method output.
     """
-    for composite in composites:
-        yield consider_composite, composite[0], composite[1]
+    afile = AtfFile(sample_file(name))
+    js = afile.to_json()
+    result = json.loads(js)
+    assert result
+    noskipjs = afile.to_json(skip_empty=False, sort_keys=True)
+    result = json.loads(noskipjs)
+    assert result
+    assert len(noskipjs) >= len(js)
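
The rewrite above replaces nose-style yield tests, which pytest deprecated and then removed in pytest 4, with @pytest.mark.parametrize, so each ATF sample becomes an independently collected and reported test case. A generic illustration of the same migration (toy example, not pyoracc code):

    import pytest

    # Before: a generator test, driven by a helper function.
    # def test_squares():
    #     for n in (1, 2, 3):
    #         yield check_square, n

    # After: one test per case, each reported separately.
    @pytest.mark.parametrize('n, expected', [(1, 1), (2, 4), (3, 9)])
    def test_square(n, expected):
        assert n * n == expected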