Skip to content

Commit

Permalink
Merge pull request #47 from goodmami/gh-44-optimize-char-class
Browse files Browse the repository at this point in the history
Release v0.5.2

Fix optimizations regarding character classes and grammar mutation
  • Loading branch information
goodmami authored Mar 29, 2024
2 parents ab684d7 + 4b8e405 commit 350e89d
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 25 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@
## [Unreleased][unreleased]


## [v0.5.2][]

**Release date: 2024-03-28**

### Fixed

* Optimization returns new grammar instead of mutating original ([#44])
* A choice of character classes is now merged into the union of those classes ([#44])
* `Flag.STRICT` now raises parsing errors in the machine parser


## [v0.5.1][]

**Release date: 2023-12-31**
Expand Down Expand Up @@ -171,6 +182,7 @@ descent parser and a work-in-progress state-machine parser.
[v0.4.0]: ../../releases/tag/v0.4.0
[v0.5.0]: ../../releases/tag/v0.5.0
[v0.5.1]: ../../releases/tag/v0.5.1
[v0.5.2]: ../../releases/tag/v0.5.2

[#6]: https://github.com/goodmami/pe/issues/6
[#7]: https://github.com/goodmami/pe/issues/7
Expand All @@ -186,3 +198,4 @@ descent parser and a work-in-progress state-machine parser.
[#31]: https://github.com/goodmami/pe/issues/31
[#36]: https://github.com/goodmami/pe/issues/36
[#38]: https://github.com/goodmami/pe/issues/38
[#44]: https://github.com/goodmami/pe/issues/44
4 changes: 3 additions & 1 deletion pe/_cy_machine.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ from enum import IntEnum
from cpython.mem cimport PyMem_Malloc, PyMem_Free

from pe._constants import Operator, Flag, FAIL as FAILURE
from pe._errors import Error
from pe._errors import Error, ParseError
from pe._match import Match
from pe._types import Memo
from pe._definition import Definition
Expand Down Expand Up @@ -176,6 +176,8 @@ class MachineParser(Parser):
idx = self._index[self.start]
end = self._parser.match(idx, s, pos, args, kwargs, memo)
if end < 0:
if flags & Flag.STRICT:
raise ParseError()
return None
else:
return Match(
Expand Down
2 changes: 1 addition & 1 deletion pe/_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
Meta-information about pe.
"""

__version__ = '0.5.1'
__version__ = '0.5.2'
30 changes: 17 additions & 13 deletions pe/_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,22 +128,18 @@ def _common(defn):
if len(ranges) == 1 and ranges[0][1] is None and not negated:
defn = Literal(ranges[0][0])

if op == SEQ:
_common_sequence(defn.args[0])
elif op == SEQ:
defn = _common_sequence(defn)

if op == CHC:
_common_choice(defn.args[0])

# Sequence(x) -> x OR Choice(x) -> x
if op in (SEQ, CHC) and len(defn.args[0]) == 1:
defn = defn.args[0][0]
op = defn.op
elif op == CHC:
defn = _common_choice(defn)

return defn


def _common_sequence(subdefs):
def _common_sequence(defn):
i = 0
subdefs = list(defn.args[0])
while i < len(subdefs) - 1:
d = subdefs[i]
# ![...] . -> [^...]
Expand All @@ -163,16 +159,18 @@ def _common_sequence(subdefs):
if j - i > 1:
subdefs[i:j] = [Literal(''.join(x.args[0] for x in subdefs[i:j]))]
i += 1
return Sequence(*subdefs)


def _common_choice(subdefs):
def _common_choice(defn):
i = 0
subdefs = list(defn.args[0])
while i < len(subdefs) - 1:
d = subdefs[i]
# [..] / [..] -> [....]
# [..] / "." -> [...]
if (d.op == CLS and not d.args[1]) or (d.op == LIT and len(d.args[0]) == 1):
ranges = d.args[0] if d.op == CLS else [(d.args[0], None)]
ranges = list(d.args[0]) if d.op == CLS else [(d.args[0], None)]
j = i + 1
while j < len(subdefs):
d2 = subdefs[j]
Expand All @@ -184,8 +182,14 @@ def _common_choice(subdefs):
break
j += 1
if j - i > 1:
subdefs[i:j] = [Class(ranges)]
subdefs[i:j] = [Class(sorted(set(ranges), key=_range_sort_key))]
i += 1
return Choice(*subdefs)


def _range_sort_key(range):
"""Ensure single hyphen characters are the first."""
return (range != ("-", None), range)


def _regex_dot(defn, defs, grpid):
Expand Down
4 changes: 3 additions & 1 deletion pe/_py_machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import re

from pe._constants import FAIL as FAILURE, Operator, Flag
from pe._errors import Error
from pe._errors import Error, ParseError
from pe._match import Match
from pe._types import Memo
from pe._definition import Definition
Expand Down Expand Up @@ -132,6 +132,8 @@ def match(self,
idx = self._index[self.start]
end = _match(self.pi, idx, s, pos, args, kwargs, memo)
if end < 0:
if flags & Flag.STRICT:
raise ParseError()
return None
else:
return Match(
Expand Down
33 changes: 24 additions & 9 deletions test/test__optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,16 @@


def gload(s, inline=False, common=False, regex=False):
_, original = loads(s)
start, defmap = loads(s)
return optimize(Grammar(defmap, start=start),
inline=inline,
common=common,
regex=regex)
optimized = optimize(
Grammar(defmap, start=start),
inline=inline,
common=common,
regex=regex
)
assert original == defmap
return optimized


def iload(s):
Expand Down Expand Up @@ -67,12 +72,13 @@ def test_common():
gload(r'A <- "a"'))
assert (cload(r'A <- !"a"') ==
gload(r'A <- !"a"'))
assert (cload(r'A <- !"a"') ==
gload(r'A <- !"a"'))
# single-char classes to literals
assert (cload(r'A <- [a]') ==
gload(r'A <- "a"'))
# but not single-range
# but not multi-char class
assert (cload(r'A <- [ab]') ==
gload(r'A <- [ab]'))
# and not ranges
assert (cload(r'A <- [a-c]') ==
gload(r'A <- [a-c]'))
# add "b" to avoid dropping the sequence
Expand All @@ -86,15 +92,24 @@ def test_common():
# sequence of literals to literal
assert (cload(r'A <- "a" "bc" "d"') ==
gload(r'A <- "abcd"'))
# but not sequence with classes
# or sequence of literals or single-char classes
assert (cload(r'A <- "a" [b] "c"') ==
gload(r'A <- "abc"'))
# but not sequence with multi-char classes
assert (cload(r'A <- "a" [bc] "d"') ==
gload(r'A <- "a" [bc] "d"'))
# choice of classes or single-char literals
# choice of classes
assert (cload(r'A <- [ab] / [bc]') ==
gload(r'A <- [abc]'))
# or choice of classes or single-char literals
assert (cload(r'A <- [ab] / "m" / [yz]') ==
gload(r'A <- [abmyz]'))
# not negated classes though
assert (cload(r'A <- (![ab] .) / "m" / [yz]') ==
grm({'A': Choice(Class('ab', negate=True), Class('myz'))}))
# hyphen characters are moved to start of class
assert (cload(r'A <- [(-,] / [-.]') ==
gload(r'A <- [-(-,.]'))


def test_regex():
Expand Down

0 comments on commit 350e89d

Please sign in to comment.