diff --git a/.travis.yml b/.travis.yml index d3372f6..5020226 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,16 +10,11 @@ matrix: - os: linux sudo: false python: 2.7 - env: MYPYTHON=jython - JYTHON_URL="http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.0/jython-installer-2.7.0.jar" - - os: linux - sudo: false - python: 2.7 - env: MYPYTHON=jython - JYTHON_URL="http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1b3/jython-installer-2.7.1b3.jar" + env: MYPYTHON=jython - JYTHON_URL="https://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar" - os: linux sudo: false python: 3.5 - os: osx - osx_image: xcode7.3 language: generic env: PIP=pip3 MYPYTHON=python3 env: @@ -56,14 +51,13 @@ install: # Install dependencies $PIP install pip if [ "$MYPYTHON" == "jython" ]; then - $PIP install pip==8.1.2 - $PIP install pytest==2.9.2 + $PIP install pytest fi $PIP install wheel $PIP install setuptools $PIP install ply pep8 mako if [ "$MYPYTHON" != "jython" ]; then - $PIP install pytest pytest-cov codecov + $PIP install --upgrade pytest pytest-cov codecov fi script: @@ -75,7 +69,7 @@ script: if [ "$MYPYTHON" == "jython" ]; then py.test else - py.test --cov=pyoracc --cov-report xml --cov-report html --runslow + pytest --cov=pyoracc fi - pep8 --exclude=parsetab.py . diff --git a/README.md b/README.md index 53f4e22..4ab8afb 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ Options: Before running pytest and coverage, install [py.test](https://docs.pytest.org/en/latest/getting-started.html) and [pytest-cov](https://pypi.org/project/pytest-cov/). - $ py.test --cov=pyoracc --cov-report xml --cov-report html --cov-report annotate --runslow + $ py.test --cov=pyoracc --cov-report xml --cov-report html --cov-report annotate Before running pycodestyle, install [pycodestyle](https://pypi.org/project/pycodestyle/). diff --git a/conftest.py b/conftest.py deleted file mode 100644 index a74f104..0000000 --- a/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -''' -Copyright 2015, 2016 University College London. - -This file is part of PyORACC. - -PyORACC is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -PyORACC is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with PyORACC. If not, see . -''' - - -import pytest - - -def pytest_addoption(parser): - parser.addoption("--runslow", action="store_true", - help="run slow tests") diff --git a/pyoracc/atf/common/atffile.py b/pyoracc/atf/common/atffile.py index 48dde7d..4160492 100644 --- a/pyoracc/atf/common/atffile.py +++ b/pyoracc/atf/common/atffile.py @@ -20,6 +20,8 @@ import codecs import sys import logging +import json +from numbers import Number from pyoracc.atf.cdli.atflex import AtfCDLILexer from pyoracc.atf.cdli.atfyacc import AtfCDLIParser @@ -72,6 +74,32 @@ def __str__(self): def serialize(self): return AtfFile.template.render_unicode(**vars(self)) + def to_json(self, skip_empty=True, **kwargs): + '''Return a JSON representation of the parsed file. + + The optional skip_empty argument determines whether keys + with empty values are included in the output. Set it to + False to see all possible object members. + + Otherwise it accepts the same optional arguments as + json.dumps().''' + def _make_serializable(obj): + '''Construct a dict representation of an object. + + This is necessary to handle our custom objects + which json.JSONEncoder doesn't know how to + serialize.''' + + return {k: v + for k, v in vars(obj).items() + if not str(k).startswith('_') and not ( + skip_empty and not v and not isinstance(v, Number) + )} + + kwargs.setdefault('indent', 2) + kwargs.setdefault('default', _make_serializable) + return json.dumps(self.text, **kwargs) + def check_atf(infile, atftype, verbose=False): content = codecs.open(infile, diff --git a/pyoracc/atf/common/atflex.py b/pyoracc/atf/common/atflex.py index e528a92..e5bf3c0 100644 --- a/pyoracc/atf/common/atflex.py +++ b/pyoracc/atf/common/atflex.py @@ -69,36 +69,36 @@ def resolve_keyword(self, value, source, fallback=None, extra=None): states = AtfLexicon.STATES - t_AMPERSAND = "\&" - t_HASH = "\#" - t_EXCLAIM = "\!" - t_QUERY = "\?" - t_STAR = "\*" - t_DOLLAR = "\$" - t_MINUS = "\-" - t_FROM = "\<\<" - t_TO = "\>\>" - t_COMMA = "\," - t_PARBAR = "\|\|" - - t_INITIAL_transctrl_PARENTHETICALID = "\([^\n\r]*\)" + t_AMPERSAND = r'\&' + t_HASH = r'\#' + t_EXCLAIM = r'\!' + t_QUERY = r'\?' + t_STAR = r'\*' + t_DOLLAR = r'\$' + t_MINUS = r'\-' + t_FROM = r'\<\<' + t_TO = r'\>\>' + t_COMMA = r'\,' + t_PARBAR = r'\|\|' + + t_INITIAL_transctrl_PARENTHETICALID = r'\([^\n\r]*\)' def t_INITIAL_transctrl_WHITESPACE(self, t): r'[\t ]+' # NO TOKEN def t_MULTILINGUAL(self, t): - "\=\=" + r'\=\=' t.lexer.push_state("text") return t def t_EQUALBRACE(self, t): - "^\=\{" + r'^\=\{' t.lexer.push_state('text') return t def t_EQUALS(self, t): - "\=" + r'\=' t.lexer.push_state('flagged') return t @@ -121,7 +121,7 @@ def t_NEWLINE(self, t): return t def t_INITIAL_parallel_labeled_ATID(self, t): - '^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?' + r'^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?' t.value = t.value[1:] t.lexpos += 1 t.type = self.resolve_keyword(t.value, @@ -171,13 +171,13 @@ def t_INITIAL_parallel_labeled_ATID(self, t): return t def t_labeled_OPENR(self, t): - "\@\(" + r'\@\(' t.lexer.push_state("para") t.lexer.push_state("transctrl") return t def t_INITIAL_parallel_labeled_HASHID(self, t): - '\#[a-zA-Z][a-zA-Z0-9\[\]]+\:' + r'\#[a-zA-Z][a-zA-Z0-9\[\]]+\:' # Note that \:? absorbs a trailing colon in protocol keywords t.value = t.value[1:-1] t.lexpos += 1 @@ -303,7 +303,7 @@ def t_transctrl_ID(self, t): t.type = "REFERENCE" return t - t_parallel_QUERY = "\?" + t_parallel_QUERY = r'\?' def t_parallel_LINELABEL(self, t): r'^([^\.\ \t]*)\.[\ \t]*' @@ -311,14 +311,14 @@ def t_parallel_LINELABEL(self, t): return t def t_parallel_labeled_DOLLAR(self, t): - "^\$" + r'^\$' t.lexer.push_state("absorb") return t - t_transctrl_MINUS = "\-\ " + t_transctrl_MINUS = r'\-\ ' def t_transctrl_CLOSER(self, t): - "\)" + r'\)' t.lexer.pop_state() return t @@ -352,8 +352,8 @@ def t_labeled_NEWLINE(self, t): # fact that the first character may not be a ? # We are looking for a string that does not start with ? it may include # newlines if they are followed by a whitespace. - translation_regex1 = '([^\?\^\n\r]|([\n\r](?=[ \t])))' - translation_regex2 = '([^\^\n\r]|([\n\r](?=[ \t])))*' + translation_regex1 = r'([^\?\^\n\r]|([\n\r](?=[ \t])))' + translation_regex2 = r'([^\^\n\r]|([\n\r](?=[ \t])))*' translation_regex = white + translation_regex1 + translation_regex2 + white @lex.TOKEN(translation_regex) @@ -399,12 +399,12 @@ def t_flagged_ID(self, t): t.value = t.value.strip() return t - t_flagged_HASH = "\#" - t_flagged_EXCLAIM = "\!" - t_flagged_QUERY = "\?" - t_flagged_STAR = "\*" - t_flagged_parallel_para_HAT = "[\ \t]*\^[\ \t]*" - t_flagged_EQUALS = "\=" + t_flagged_HASH = r'\#' + t_flagged_EXCLAIM = r'\!' + t_flagged_QUERY = r'\?' + t_flagged_STAR = r'\*' + t_flagged_parallel_para_HAT = r'[\ \t]*\^[\ \t]*' + t_flagged_EQUALS = r'\=' # --- Rules for paragaph state---------------------------------- # Free text, ended by double new line @@ -441,11 +441,11 @@ def t_para_MAGICNEWLINE(self, t): # --- RULES FOR THE nonequals STATE ----- # Absorb everything except an equals def t_nonequals_ID(self, t): - "[^\=\n\r]+" + r'[^\=\n\r]+' t.value = t.value.strip() return t - t_nonequals_EQUALS = "\=" + t_nonequals_EQUALS = r'\=' # --- RULES FOR THE absorb STATE ----- # Absorb everything @@ -455,14 +455,14 @@ def t_absorb_ID(self, t): return t # --- RULES FOR THE text STATE ---- - t_text_ID = "[^\ \t \n\r]+" + t_text_ID = r'[^\ \t \n\r]+' def t_text_SPACE(self, t): r'[\ \t]' # No token generated # --- RULES FOR THE lemmatize STATE - t_lemmatize_ID = "[^\;\n\r]+" + t_lemmatize_ID = r'[^\;\n\r]+' t_lemmatize_SEMICOLON = r'\;[\ \t]*' # Error handling rule diff --git a/pyoracc/model/corpus.py b/pyoracc/model/corpus.py index 48001c0..4969fc4 100644 --- a/pyoracc/model/corpus.py +++ b/pyoracc/model/corpus.py @@ -33,10 +33,10 @@ def __init__(self, **kwargs): self.source = kwargs['source'] if 'source' in kwargs: for dirpath, _, files in os.walk(self.source): - for file in files: - if file.endswith('.atf'): + for filename in files: + if filename.endswith('.atf'): try: - path = os.path.join(dirpath, file) + path = os.path.join(dirpath, filename) print("Parsing file", path, "... ", end="") content = codecs.open(path, encoding='utf-8-sig').read() diff --git a/pyoracc/test/atf/test_atffile.py b/pyoracc/test/atf/test_atffile.py index ac72046..fd7fdf2 100644 --- a/pyoracc/test/atf/test_atffile.py +++ b/pyoracc/test/atf/test_atffile.py @@ -20,6 +20,8 @@ from pyoracc.atf.common.atffile import AtfFile from ..fixtures import anzu, belsunu, sample_file +import pytest +import json def test_create(): @@ -88,36 +90,49 @@ def test_composite(): ] -def consider_composite(name, code): +@pytest.mark.parametrize('name, code', [ + (text[0], text[1]) for text in composites]) +def test_composite_code(name, code): """ - Parses ATF and checks CDLI ID coincides + Parses ATF and checks CDLI ID coincides. """ afile = AtfFile(sample_file(name)) assert afile.text.texts[0].code == code -def consider_file(name, code, description): +@pytest.mark.parametrize('name, code, description', [ + (text[0], text[1], text[2]) for text in texts]) +def test_text_designation(name, code, description): """ - Parses ATF and checks CDLI ID and text description coincide + Parses ATF and checks CDLI ID and text description coincide. """ afile = AtfFile(sample_file(name)) assert afile.text.code == code assert afile.text.description == description -def test_texts(): - """" - Go through list of selected filenames and check parser deals non-composite - files. - """ - for text in texts: - yield consider_file, text[0], text[1], text[2] +# ATF filenames which fail the serialization tests. +_xfail_texts = [ + # Multilingual objects store the unmarked language + # under the `None` key in their `lines` dictionary, + # which is incompatible with `sort_keys=True`. + 'bb_2_6', + ] -def test_composites(): +@pytest.mark.parametrize('name', [ + name if name not in _xfail_texts + else pytest.param(name, marks=[pytest.mark.xfail()]) + for name in [text[0] for text in texts]]) +def test_json_serialization(name): """ - Go through list of selected composites and check parser deals with - composite files correctly + Parses ATF and verifies the to_json() method output. """ - for composite in composites: - yield consider_composite, composite[0], composite[1] + afile = AtfFile(sample_file(name)) + js = afile.to_json() + result = json.loads(js) + assert result + noskipjs = afile.to_json(skip_empty=False, sort_keys=True) + result = json.loads(noskipjs) + assert result + assert len(noskipjs) >= len(js) diff --git a/pyoracc/test/model/test_corpus.py b/pyoracc/test/model/test_corpus.py index 9bdc26f..65d7cc3 100644 --- a/pyoracc/test/model/test_corpus.py +++ b/pyoracc/test/model/test_corpus.py @@ -25,19 +25,12 @@ from ..fixtures import tiny_corpus, sample_corpus, whole_corpus -slow = pytest.mark.skipif( - not pytest.config.getoption("--runslow"), - reason="need --runslow option to run" -) - - def test_tiny(): corpus = Corpus(source=tiny_corpus(), atftype='oracc') assert corpus.successes == 1 assert corpus.failures == 1 -@slow def test_sample(): corpus = Corpus(source=sample_corpus(), atftype='oracc') assert corpus.successes == 37 @@ -48,11 +41,10 @@ def test_sample(): reason="Need to set oracc_corpus_path to point " "to the whole corpus, which is not bundled with " "pyoracc") -@slow def test_whole(): corpus = Corpus(source=whole_corpus(), atftype='oracc') - # there is a total of 8229 files in the corpus + # There are a total of 8229 files in the corpus. # We have ommmited lacost/00atf/cdliatf_unblocked.atf - # which is 61 MB and this to large to fit in the git repository + # which is 61 MB and too large to fit in the git repository. assert corpus.successes == 6750 assert corpus.failures == 1479