Merge pull request oracc#41 from oracc/master
Adding new changes from rillian.
jayanthkmr authored Jun 9, 2019
2 parents 0bae693 + c32960b commit 8d00aa7
Showing 8 changed files with 104 additions and 101 deletions.
14 changes: 4 additions & 10 deletions .travis.yml
@@ -10,16 +10,11 @@ matrix:
   - os: linux
     sudo: false
     python: 2.7
-    env: MYPYTHON=jython - JYTHON_URL="http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.0/jython-installer-2.7.0.jar"
-  - os: linux
-    sudo: false
-    python: 2.7
-    env: MYPYTHON=jython - JYTHON_URL="http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1b3/jython-installer-2.7.1b3.jar"
+    env: MYPYTHON=jython - JYTHON_URL="https://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar"
   - os: linux
     sudo: false
     python: 3.5
   - os: osx
     osx_image: xcode7.3
     language: generic
     env: PIP=pip3 MYPYTHON=python3
 env:
@@ -56,14 +51,13 @@ install:
   # Install dependencies
   $PIP install pip
   if [ "$MYPYTHON" == "jython" ]; then
-    $PIP install pip==8.1.2
-    $PIP install pytest==2.9.2
+    $PIP install pytest
   fi
   $PIP install wheel
   $PIP install setuptools
   $PIP install ply pep8 mako
   if [ "$MYPYTHON" != "jython" ]; then
-    $PIP install pytest pytest-cov codecov
+    $PIP install --upgrade pytest pytest-cov codecov
   fi
 script:
@@ -75,7 +69,7 @@ script:
   if [ "$MYPYTHON" == "jython" ]; then
     py.test
   else
-    py.test --cov=pyoracc --cov-report xml --cov-report html --runslow
+    pytest --cov=pyoracc
   fi
 - pep8 --exclude=parsetab.py .
2 changes: 1 addition & 1 deletion README.md
@@ -128,7 +128,7 @@ Options:

 Before running pytest and coverage, install [py.test](https://docs.pytest.org/en/latest/getting-started.html) and [pytest-cov](https://pypi.org/project/pytest-cov/).

-    $ py.test --cov=pyoracc --cov-report xml --cov-report html --cov-report annotate --runslow
+    $ py.test --cov=pyoracc --cov-report xml --cov-report html --cov-report annotate

 Before running pycodestyle, install [pycodestyle](https://pypi.org/project/pycodestyle/).

26 changes: 0 additions & 26 deletions conftest.py

This file was deleted.

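(This deleted conftest.py is presumably where the custom --runslow option was defined, which would explain why that flag also disappears from .travis.yml and the README above.)
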
28 changes: 28 additions & 0 deletions pyoracc/atf/common/atffile.py
@@ -20,6 +20,8 @@
 import codecs
 import sys
 import logging
+import json
+from numbers import Number

 from pyoracc.atf.cdli.atflex import AtfCDLILexer
 from pyoracc.atf.cdli.atfyacc import AtfCDLIParser
@@ -72,6 +74,32 @@ def __str__(self):
     def serialize(self):
         return AtfFile.template.render_unicode(**vars(self))

+    def to_json(self, skip_empty=True, **kwargs):
+        '''Return a JSON representation of the parsed file.
+        The optional skip_empty argument determines whether keys
+        with empty values are included in the output. Set it to
+        False to see all possible object members.
+        Otherwise it accepts the same optional arguments as
+        json.dumps().'''
+        def _make_serializable(obj):
+            '''Construct a dict representation of an object.
+            This is necessary to handle our custom objects
+            which json.JSONEncoder doesn't know how to
+            serialize.'''
+
+            return {k: v
+                    for k, v in vars(obj).items()
+                    if not str(k).startswith('_') and not (
+                        skip_empty and not v and not isinstance(v, Number)
+                    )}
+
+        kwargs.setdefault('indent', 2)
+        kwargs.setdefault('default', _make_serializable)
+        return json.dumps(self.text, **kwargs)
+
+
 def check_atf(infile, atftype, verbose=False):
     content = codecs.open(infile,
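
For context, a minimal usage sketch of the new method (assuming, as the tests below do, that AtfFile accepts the raw ATF text; the file path here is hypothetical):

    import codecs
    from pyoracc.atf.common.atffile import AtfFile

    # Read an ATF source file (hypothetical path; any parseable ATF works).
    content = codecs.open('belsunu.atf', encoding='utf-8-sig').read()
    atf = AtfFile(content)

    # Default rendering: indent=2, members with empty values skipped.
    print(atf.to_json())

    # Keep empty members; extra keyword arguments pass through to json.dumps().
    print(atf.to_json(skip_empty=False, sort_keys=True))
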
70 changes: 35 additions & 35 deletions pyoracc/atf/common/atflex.py
@@ -69,36 +69,36 @@ def resolve_keyword(self, value, source, fallback=None, extra=None):

     states = AtfLexicon.STATES

-    t_AMPERSAND = "\&"
-    t_HASH = "\#"
-    t_EXCLAIM = "\!"
-    t_QUERY = "\?"
-    t_STAR = "\*"
-    t_DOLLAR = "\$"
-    t_MINUS = "\-"
-    t_FROM = "\<\<"
-    t_TO = "\>\>"
-    t_COMMA = "\,"
-    t_PARBAR = "\|\|"
-
-    t_INITIAL_transctrl_PARENTHETICALID = "\([^\n\r]*\)"
+    t_AMPERSAND = r'\&'
+    t_HASH = r'\#'
+    t_EXCLAIM = r'\!'
+    t_QUERY = r'\?'
+    t_STAR = r'\*'
+    t_DOLLAR = r'\$'
+    t_MINUS = r'\-'
+    t_FROM = r'\<\<'
+    t_TO = r'\>\>'
+    t_COMMA = r'\,'
+    t_PARBAR = r'\|\|'
+
+    t_INITIAL_transctrl_PARENTHETICALID = r'\([^\n\r]*\)'

     def t_INITIAL_transctrl_WHITESPACE(self, t):
         r'[\t ]+'
         # NO TOKEN

     def t_MULTILINGUAL(self, t):
-        "\=\="
+        r'\=\='
         t.lexer.push_state("text")
         return t

     def t_EQUALBRACE(self, t):
-        "^\=\{"
+        r'^\=\{'
         t.lexer.push_state('text')
         return t

     def t_EQUALS(self, t):
-        "\="
+        r'\='
         t.lexer.push_state('flagged')
         return t

@@ -121,7 +121,7 @@ def t_NEWLINE(self, t):
         return t

     def t_INITIAL_parallel_labeled_ATID(self, t):
-        '^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?'
+        r'^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?'
         t.value = t.value[1:]
         t.lexpos += 1
         t.type = self.resolve_keyword(t.value,
@@ -171,13 +171,13 @@ def t_INITIAL_parallel_labeled_ATID(self, t):
         return t

     def t_labeled_OPENR(self, t):
-        "\@\("
+        r'\@\('
         t.lexer.push_state("para")
         t.lexer.push_state("transctrl")
         return t

     def t_INITIAL_parallel_labeled_HASHID(self, t):
-        '\#[a-zA-Z][a-zA-Z0-9\[\]]+\:'
+        r'\#[a-zA-Z][a-zA-Z0-9\[\]]+\:'
         # Note that \:? absorbs a trailing colon in protocol keywords
         t.value = t.value[1:-1]
         t.lexpos += 1
@@ -303,22 +303,22 @@ def t_transctrl_ID(self, t):
             t.type = "REFERENCE"
         return t

-    t_parallel_QUERY = "\?"
+    t_parallel_QUERY = r'\?'

     def t_parallel_LINELABEL(self, t):
         r'^([^\.\ \t]*)\.[\ \t]*'
         t.value = t.value.strip(" \t.")
         return t

     def t_parallel_labeled_DOLLAR(self, t):
-        "^\$"
+        r'^\$'
         t.lexer.push_state("absorb")
         return t

-    t_transctrl_MINUS = "\-\ "
+    t_transctrl_MINUS = r'\-\ '

     def t_transctrl_CLOSER(self, t):
-        "\)"
+        r'\)'
         t.lexer.pop_state()
         return t

@@ -352,8 +352,8 @@ def t_labeled_NEWLINE(self, t):
     # fact that the first character may not be a ?
     # We are looking for a string that does not start with ? it may include
     # newlines if they are followed by a whitespace.
-    translation_regex1 = '([^\?\^\n\r]|([\n\r](?=[ \t])))'
-    translation_regex2 = '([^\^\n\r]|([\n\r](?=[ \t])))*'
+    translation_regex1 = r'([^\?\^\n\r]|([\n\r](?=[ \t])))'
+    translation_regex2 = r'([^\^\n\r]|([\n\r](?=[ \t])))*'
     translation_regex = white + translation_regex1 + translation_regex2 + white

     @lex.TOKEN(translation_regex)
@@ -399,12 +399,12 @@ def t_flagged_ID(self, t):
         t.value = t.value.strip()
         return t

-    t_flagged_HASH = "\#"
-    t_flagged_EXCLAIM = "\!"
-    t_flagged_QUERY = "\?"
-    t_flagged_STAR = "\*"
-    t_flagged_parallel_para_HAT = "[\ \t]*\^[\ \t]*"
-    t_flagged_EQUALS = "\="
+    t_flagged_HASH = r'\#'
+    t_flagged_EXCLAIM = r'\!'
+    t_flagged_QUERY = r'\?'
+    t_flagged_STAR = r'\*'
+    t_flagged_parallel_para_HAT = r'[\ \t]*\^[\ \t]*'
+    t_flagged_EQUALS = r'\='
     # --- Rules for paragaph state----------------------------------
     # Free text, ended by double new line

@@ -441,11 +441,11 @@ def t_para_MAGICNEWLINE(self, t):
     # --- RULES FOR THE nonequals STATE -----
     # Absorb everything except an equals
     def t_nonequals_ID(self, t):
-        "[^\=\n\r]+"
+        r'[^\=\n\r]+'
         t.value = t.value.strip()
         return t

-    t_nonequals_EQUALS = "\="
+    t_nonequals_EQUALS = r'\='

     # --- RULES FOR THE absorb STATE -----
     # Absorb everything
@@ -455,14 +455,14 @@ def t_absorb_ID(self, t):
         t.value = t.value.strip()
         return t
     # --- RULES FOR THE text STATE ----
-    t_text_ID = "[^\ \t \n\r]+"
+    t_text_ID = r'[^\ \t \n\r]+'

     def t_text_SPACE(self, t):
         r'[\ \t]'
         # No token generated

     # --- RULES FOR THE lemmatize STATE
-    t_lemmatize_ID = "[^\;\n\r]+"
+    t_lemmatize_ID = r'[^\;\n\r]+'
     t_lemmatize_SEMICOLON = r'\;[\ \t]*'

     # Error handling rule
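
A note on the sweep above: it converts every PLY token pattern from a plain string to a raw string. Sequences like \& or \= are not valid Python string escapes; older Pythons silently pass them through to the regex engine, but Python 3.6+ emits a DeprecationWarning for them (and later versions make them errors), while raw strings hand the backslash over verbatim. A self-contained toy lexer (not pyoracc's, just an illustration of the PLY convention that token rules take their regex from a string attribute or a function docstring):

    import ply.lex as lex

    tokens = ('EQUALS', 'ID')

    # Raw strings keep the backslash literal instead of relying on
    # Python's lenient handling of unknown escape sequences.
    t_EQUALS = r'\='
    t_ID = r'[A-Za-z]+'
    t_ignore = ' \t'

    def t_error(t):
        t.lexer.skip(1)

    lexer = lex.lex()
    lexer.input('x = y')
    print([(tok.type, tok.value) for tok in lexer])
    # [('ID', 'x'), ('EQUALS', '='), ('ID', 'y')]
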
6 changes: 3 additions & 3 deletions pyoracc/model/corpus.py
@@ -33,10 +33,10 @@ def __init__(self, **kwargs):
         self.source = kwargs['source']
         if 'source' in kwargs:
             for dirpath, _, files in os.walk(self.source):
-                for file in files:
-                    if file.endswith('.atf'):
+                for filename in files:
+                    if filename.endswith('.atf'):
                         try:
-                            path = os.path.join(dirpath, file)
+                            path = os.path.join(dirpath, filename)
                             print("Parsing file", path, "... ", end="")
                             content = codecs.open(path,
                                                   encoding='utf-8-sig').read()
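
A note on the rename: file is the name of a built-in type in Python 2, so using filename for the loop variable avoids shadowing the built-in and reads more clearly.
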
47 changes: 31 additions & 16 deletions pyoracc/test/atf/test_atffile.py
@@ -20,6 +20,8 @@

 from pyoracc.atf.common.atffile import AtfFile
 from ..fixtures import anzu, belsunu, sample_file
+import pytest
+import json


 def test_create():
@@ -88,36 +90,49 @@ def test_composite():
 ]


-def consider_composite(name, code):
+@pytest.mark.parametrize('name, code', [
+    (text[0], text[1]) for text in composites])
+def test_composite_code(name, code):
     """
-    Parses ATF and checks CDLI ID coincides
+    Parses ATF and checks CDLI ID coincides.
     """
     afile = AtfFile(sample_file(name))
     assert afile.text.texts[0].code == code


-def consider_file(name, code, description):
+@pytest.mark.parametrize('name, code, description', [
+    (text[0], text[1], text[2]) for text in texts])
+def test_text_designation(name, code, description):
     """
-    Parses ATF and checks CDLI ID and text description coincide
+    Parses ATF and checks CDLI ID and text description coincide.
     """
     afile = AtfFile(sample_file(name))
     assert afile.text.code == code
     assert afile.text.description == description


-def test_texts():
-    """"
-    Go through list of selected filenames and check parser deals non-composite
-    files.
-    """
-    for text in texts:
-        yield consider_file, text[0], text[1], text[2]
+# ATF filenames which fail the serialization tests.
+_xfail_texts = [
+    # Multilingual objects store the unmarked language
+    # under the `None` key in their `lines` dictionary,
+    # which is incompatible with `sort_keys=True`.
+    'bb_2_6',
+]


-def test_composites():
+@pytest.mark.parametrize('name', [
+    name if name not in _xfail_texts
+    else pytest.param(name, marks=[pytest.mark.xfail()])
+    for name in [text[0] for text in texts]])
+def test_json_serialization(name):
     """
-    Go through list of selected composites and check parser deals with
-    composite files correctly
+    Parses ATF and verifies the to_json() method output.
     """
-    for composite in composites:
-        yield consider_composite, composite[0], composite[1]
+    afile = AtfFile(sample_file(name))
+    js = afile.to_json()
+    result = json.loads(js)
+    assert result
+    noskipjs = afile.to_json(skip_empty=False, sort_keys=True)
+    result = json.loads(noskipjs)
+    assert result
+    assert len(noskipjs) >= len(js)
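
The rewrite above replaces nose-style yield tests, which pytest deprecated and then removed in pytest 4, with @pytest.mark.parametrize, so each ATF sample becomes an independently collected and reported test case. A generic illustration of the same migration (toy example, not pyoracc code):

    import pytest

    # Before: a generator test, driven by a helper function.
    # def test_squares():
    #     for n in (1, 2, 3):
    #         yield check_square, n

    # After: one test per case, each reported separately.
    @pytest.mark.parametrize('n, expected', [(1, 1), (2, 4), (3, 9)])
    def test_square(n, expected):
        assert n * n == expected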