Commit ea5b21d4 authored by Maximilian Köhl
Browse files

initial commit

parents
build
dist
.venv
.tox
.coverage
MANIFEST
docs/_build
*.egg-info
.vscode/*
!.vscode/settings.windows.template.json
!.vscode/tasks.json
.mypy_cache
.pytest_cache
pip-wheel-metadata
__pycache__
playground
Orex: Object-Oriented Regular Expressions
=========================================
An object-oriented approach to regular expressions.
# -*- coding:utf-8 -*-
#
# Copyright (C) 2020, Maximilian Köhl <mkoehl@cs.uni-saarland.de>
from __future__ import annotations
from .backend import CompileFlags, CompiledPattern, compile_pattern
from .collection import PatternCollection
from .match import Region, Group
from .patterns import (
Pattern,
Atom,
Class,
PrototypePattern,
ensure_pattern,
literal,
repeat,
at_most_once,
at_least_once,
character_range,
character_set,
choice,
group,
named_group,
concat,
word,
WORD,
WHITESPACE,
)
from .stringify import StringifyOptions, Style, Path, StringifyResult, stringify
__all__ = [
"CompileFlags",
"CompiledPattern",
"compile_pattern",
"PatternCollection",
"Region",
"Group",
"Pattern",
"Atom",
"Class",
"PrototypePattern",
"ensure_pattern",
"literal",
"repeat",
"at_most_once",
"at_least_once",
"character_range",
"character_set",
"choice",
"group",
"named_group",
"concat",
"word",
"WORD",
"WHITESPACE",
"StringifyOptions",
"Style",
"Path",
"StringifyResult",
"stringify",
]
# -*- coding:utf-8 -*-
#
# Copyright (C) 2020, Maximilian Köhl <mkoehl@cs.uni-saarland.de>
from __future__ import annotations
import typing as t
import dataclasses
import enum
import re
from .match import Region, Group
from .patterns import Pattern
from .stringify import StringifyResult, stringify
class CompileFlags(enum.Flag):
    """Flags that can be passed to :func:`compile_pattern`.

    Apart from ``NONE``, every member carries the value of the equally named
    flag of the standard :mod:`re` module, so ``.value`` can be handed to
    :func:`re.compile` directly.
    """

    # No flag set.
    NONE = 0

    # Values borrowed from the ``re`` module.
    IGNORECASE = re.IGNORECASE
    MULTILINE = re.MULTILINE
# Flags used as the default value wherever a `flags` argument or field is
# left unspecified in this package.
DEFAULT_FLAGS = CompileFlags.NONE
@dataclasses.dataclass
class CompiledPattern:
    """A pattern bundled with the Python regular expression compiled from it."""

    pattern: Pattern
    flags: CompileFlags
    _python_pattern: t.Pattern[str]
    _stringify_result: StringifyResult

    def _as_group(self, found: t.Optional[t.Match[str]]) -> t.Optional[Group]:
        """Wrap a raw ``re`` match as the group with index 0; pass ``None`` through."""
        if found is None:
            return None
        whole = Region(*found.span(0))
        return Group(whole, self._stringify_result, 0, found)

    def match(
        self, string: str, *, start: int = 0, end: t.Optional[int] = None
    ) -> t.Optional[Group]:
        """Match ``string`` at position ``start``.

        ``start`` is forwarded as ``pos`` and ``end``, when given, as
        ``endpos`` to the underlying compiled pattern's ``match``.

        Returns the group for the whole match, or ``None`` if nothing matched.
        """
        bounds: t.Dict[str, int] = {"pos": start}
        if end is not None:
            # Only forward `endpos` when the caller actually restricted it.
            bounds["endpos"] = end
        return self._as_group(self._python_pattern.match(string, **bounds))

    def search(
        self, string: str, *, start: int = 0, end: t.Optional[int] = None
    ) -> t.Optional[Group]:
        """Search ``string`` for the first match at or after ``start``.

        ``start`` is forwarded as ``pos`` and ``end``, when given, as
        ``endpos`` to the underlying compiled pattern's ``search``.

        Returns the group for the whole match, or ``None`` if nothing matched.
        """
        bounds: t.Dict[str, int] = {"pos": start}
        if end is not None:
            bounds["endpos"] = end
        return self._as_group(self._python_pattern.search(string, **bounds))
def compile_pattern(
    pattern: Pattern, *, flags: CompileFlags = DEFAULT_FLAGS
) -> CompiledPattern:
    """Compile ``pattern`` into a :class:`CompiledPattern`.

    The pattern is first turned into a string via ``stringify`` and that
    string is compiled with :func:`re.compile`, passing ``flags.value``.
    """
    rendered = stringify(pattern)
    regex = re.compile(rendered.string, flags=flags.value)
    return CompiledPattern(
        pattern=pattern,
        flags=flags,
        _python_pattern=regex,
        _stringify_result=rendered,
    )
__all__ = ["CompileFlags", "CompiledPattern", "compile_pattern"]
# -*- coding:utf-8 -*-
#
# Copyright (C) 2020, Maximilian Köhl <mkoehl@cs.uni-saarland.de>
from __future__ import annotations
import typing as t
import dataclasses
from .backend import CompileFlags, DEFAULT_FLAGS, CompiledPattern, compile_pattern
from .match import Group
from .patterns import Pattern, choice, named_group
@dataclasses.dataclass
class PatternCollection:
    """Several patterns that are matched together as a single alternation.

    Each added pattern is wrapped in a named group ``PATTERN_<n>`` (``n`` being
    its insertion index); the groups are combined with ``choice`` and compiled
    lazily on first use. A successful match reports which of the registered
    patterns matched.
    """

    # Flags forwarded to `compile_pattern` when the alternation is compiled.
    flags: CompileFlags = DEFAULT_FLAGS
    # Registered patterns in insertion order.
    _patterns: t.List[Pattern] = dataclasses.field(default_factory=list)
    # Maps the generated group name of every pattern back to the pattern.
    _map: t.Dict[str, Pattern] = dataclasses.field(default_factory=dict)
    # Cache of the compiled alternation; reset whenever a pattern is added.
    _compiled_pattern: t.Optional[CompiledPattern] = None

    def add_pattern(self, pattern: Pattern) -> None:
        """Register ``pattern`` and invalidate the compiled alternation."""
        self._compiled_pattern = None
        self._map[f"PATTERN_{len(self._patterns)}"] = pattern
        self._patterns.append(pattern)

    def _compile(self) -> CompiledPattern:
        """Return the compiled alternation, building it when necessary.

        The alternation is rebuilt when no cached version exists or when the
        cached version was compiled with flags other than ``self.flags``.
        """
        compiled = self._compiled_pattern
        if compiled is None or compiled.flags != self.flags:
            alternatives = [
                named_group(f"PATTERN_{index}", pattern)
                for index, pattern in enumerate(self._patterns)
            ]
            # Forward the collection's flags; without this the `flags` field
            # would have no effect on matching.
            compiled = compile_pattern(choice(alternatives), flags=self.flags)
            self._compiled_pattern = compiled
        return compiled

    def _resolve(self, group: t.Optional[Group]) -> t.Optional[t.Tuple[Pattern, Group]]:
        """Translate a match of the alternation into ``(pattern, group)``.

        Returns ``None`` when ``group`` is ``None``. Otherwise the name of the
        last matched named group is looked up in the name-to-pattern map and
        returned together with the group of that name.
        """
        if group is None:
            return None
        matched_name = group._python_match.lastgroup
        # Internal invariant: every alternative is a named group, so a
        # successful match is expected to report a group name.
        assert matched_name is not None
        return self._map[matched_name], group.get_group(matched_name)

    def match(
        self, string: str, *, start: int = 0, end: t.Optional[int] = None
    ) -> t.Optional[t.Tuple[Pattern, Group]]:
        """Match the collection against ``string`` at position ``start``.

        Returns the matching pattern and its group, or ``None`` if no pattern
        matched.
        """
        return self._resolve(self._compile().match(string, start=start, end=end))

    def search(
        self, string: str, *, start: int = 0, end: t.Optional[int] = None
    ) -> t.Optional[t.Tuple[Pattern, Group]]:
        """Search ``string`` for the first match of any pattern in the collection.

        Returns the matching pattern and its group, or ``None`` if no pattern
        matched.
        """
        return self._resolve(self._compile().search(string, start=start, end=end))
__all__ = ["PatternCollection"]
# -*- coding:utf-8 -*-
#
# Copyright (C) 2020, Maximilian Köhl <mkoehl@cs.uni-saarland.de>
from __future__ import annotations
import typing as t
import dataclasses
from .stringify import StringifyResult, Path
@dataclasses.dataclass(frozen=True)
class Region:
    """Immutable pair of offsets, built in this package from ``Match.span``."""

    # First element of the span.
    start: int
    # Second element of the span.
    end: int
@dataclasses.dataclass(frozen=True)
class Group:
    """One capturing group of a match.

    Besides its region, a group keeps the stringify result and the raw Python
    match so that other groups can be reached from it by relative index or by
    name.
    """

    region: Region
    _stringify_result: StringifyResult
    # Index of this group within the raw Python match.
    _group_index: int
    _python_match: t.Match[str]

    def __repr__(self) -> str:
        return f"<orex.Group; region=({self.start},{self.end}), text={self.text!r}>"

    @property
    def text(self) -> str:
        """Text the raw match reports for this group's index."""
        return self._python_match.group(self._group_index)

    @property
    def start(self) -> int:
        """Shortcut for ``region.start``."""
        return self.region.start

    @property
    def end(self) -> int:
        """Shortcut for ``region.end``."""
        return self.region.end

    def iter_groups(self) -> t.Iterator[Group]:
        """Yield the groups at relative offsets ``1`` up to this group's subgroup count."""
        subgroup_count = self._stringify_result.subgroups[self._group_index]
        for offset in range(1, subgroup_count + 1):
            yield self.get_group(offset)

    def iter_named_groups(self) -> t.Iterator[t.Tuple[Path, str, Group]]:
        """Yield ``(path, name, group)`` for every subgroup that has a name.

        Subgroups without an entry in the stringify result's ``names`` are
        skipped. The yielded path is the subgroup's path with this group's own
        prefix cut off.
        """
        result = self._stringify_result
        base = self._group_index
        # NOTE(review): one element more than the own path is dropped,
        # presumably the entry for this group itself; confirm against stringify.
        own_prefix = len(result.paths[base]) + 1
        for offset in range(1, result.subgroups[base] + 1):
            index = base + offset
            try:
                name = result.names[index]
            except KeyError:
                # Unnamed subgroup.
                continue
            yield result.paths[index][own_prefix:], name, self.get_group(offset)

    def get_group(self, name_or_index: t.Union[str, int]) -> Group:
        """Return another group, addressed relative to this one.

        An ``int`` is an offset added to this group's index and must not
        exceed this group's subgroup count. A ``str`` is a group name that is
        resolved against this group's path through the stringify result.
        """
        result = self._stringify_result
        if isinstance(name_or_index, int):
            assert name_or_index <= result.subgroups[self._group_index]
            index = self._group_index + name_or_index
        else:
            own_path = result.paths[self._group_index]
            full_name = result.get_full_group_name(own_path, name=name_or_index)
            index = result.indices[full_name]
        span = self._python_match.span(index)
        return Group(Region(*span), result, index, self._python_match)
__all__ = ["Region", "Group"]
# -*- coding:utf-8 -*-
#
# Copyright (C) 2020, Maximilian Köhl <mkoehl@cs.uni-saarland.de>
from __future__ import annotations
import typing as t
import dataclasses
import enum
class Pattern:
    """Common base class of every pattern node in this module."""
class Atom(Pattern, enum.Enum):
    """Patterns written as one fixed regular-expression token.

    The value of each member is the token text.
    """

    ANY = "."
    START = "^"
    END = "$"
    STRING_START = r"\A"
    STRING_END = r"\Z"
    WORD_BOUNDARY = r"\b"
    NOT_WORD_BOUNDARY = r"\B"

    def __repr__(self) -> str:
        kind = type(self).__name__
        return f"<{kind}.{self.name}>"
class Class(Pattern, enum.Enum):
    """Predefined character classes and their negations.

    The value of each member is the escape sequence for the class.
    """

    WORD = r"\w"
    NOT_WORD = r"\W"
    WHITESPACE = r"\s"
    NOT_WHITESPACE = r"\S"
    DIGIT = r"\d"
    NOT_DIGIT = r"\D"

    def __repr__(self) -> str:
        kind = type(self).__name__
        return f"<{kind}.{self.name}>"
@dataclasses.dataclass(frozen=True)
class Literal(Pattern):
    """Pattern node holding a literal string."""

    literal: str

    def __repr__(self) -> str:
        kind = type(self).__name__
        return f"<{kind} {self.literal!r}>"
@dataclasses.dataclass(frozen=True)
class Repeat(Pattern):
    """Pattern node describing a bounded or unbounded repetition.

    ``upper_bound`` of ``None`` stands for "no upper bound"; the repr shows it
    as ``∞``. A non-greedy repetition is marked with a trailing ``?`` in the
    repr.
    """

    pattern: Pattern
    lower_bound: int = 0
    upper_bound: t.Optional[int] = None
    is_greedy: bool = True

    def __repr__(self) -> str:
        if self.upper_bound is None:
            upper = "∞"
        else:
            upper = str(self.upper_bound)
        suffix = "" if self.is_greedy else "?"
        bounds = f"{{{self.lower_bound},{upper}}}"
        return f"<{type(self).__name__} {self.pattern!r} {bounds}{suffix}>"
@dataclasses.dataclass(frozen=True)
class Range:
    """An inclusive range of characters for use inside a character set.

    Raises:
        ValueError: if ``start`` or ``end`` is not exactly one character, or
            if ``start`` sorts after ``end``.
    """

    start: str
    end: str

    def __post_init__(self) -> None:
        is_single_chars = len(self.start) == 1 and len(self.end) == 1
        # A reversed range such as `z-a` would only fail later, when the
        # generated regular expression is compiled; reject it right away.
        if not is_single_chars or self.start > self.end:
            raise ValueError(f"invalid character range {self.start!r}-{self.end!r}")

    def __repr__(self) -> str:
        return f"<{type(self).__name__} {self.start!r}-{self.end!r}>"
@dataclasses.dataclass(frozen=True)
class Set(Pattern):
    """A character set made of ranges, character classes and single characters.

    Raises:
        ValueError: if ``elements`` is empty or contains a string that is not
            exactly one character long.
    """

    elements: t.FrozenSet[t.Union[Range, Class, str]]
    is_negated: bool = False

    def __post_init__(self) -> None:
        if not self.elements:
            raise ValueError("pattern set must not be empty")
        for element in self.elements:
            if isinstance(element, str) and len(element) != 1:
                raise ValueError(f"invalid set element {element!r}")

    def __repr__(self) -> str:
        # Sort the element reprs so the output does not depend on the
        # iteration order of the underlying set.
        elements = ", ".join(sorted(map(repr, self.elements)))
        return f"<{type(self).__name__} is_negated={self.is_negated} {elements}>"

    def negate(self) -> Set:
        """Return a set with the same elements and the opposite ``is_negated``."""
        return Set(self.elements, is_negated=not self.is_negated)
@dataclasses.dataclass(frozen=True)
class Choice(Pattern):
    """Pattern node holding a non-empty tuple of alternative patterns.

    Raises:
        ValueError: if ``alternatives`` is empty.
    """

    alternatives: t.Tuple[Pattern, ...]

    def __post_init__(self) -> None:
        if not self.alternatives:
            raise ValueError("alternatives must not be empty")

    def __repr__(self) -> str:
        alternatives = " | ".join(map(repr, self.alternatives))
        return f"<{type(self).__name__} {alternatives}>"
@dataclasses.dataclass(frozen=True)
class Group(Pattern):
    """Pattern node wrapping another pattern in an unnamed group."""

    pattern: Pattern

    def __repr__(self) -> str:
        kind = type(self).__name__
        return f"<{kind} {self.pattern!r}>"
@dataclasses.dataclass(frozen=True)
class NamedGroup(Pattern):
    """Pattern node wrapping another pattern in a group with a name."""

    name: str
    pattern: Pattern

    def __repr__(self) -> str:
        kind = type(self).__name__
        return f"<{kind} name={self.name!r} {self.pattern!r}>"
@dataclasses.dataclass(frozen=True)
class Sequence(Pattern):
    """Pattern node holding a non-empty tuple of patterns in order.

    Raises:
        ValueError: if ``patterns`` is empty.
    """

    patterns: t.Tuple[Pattern, ...]

    def __post_init__(self) -> None:
        if not self.patterns:
            raise ValueError("sequence must not be empty")

    def __repr__(self) -> str:
        patterns = " ".join(map(repr, self.patterns))
        return f"<{type(self).__name__} {patterns}>"
# Everything `ensure_pattern` accepts: a ready-made `Pattern`, a string, or a
# set, mapping or sequence whose members are prototypes themselves. The alias
# refers to itself through string forward references, hence the
# `type: ignore` markers.
PrototypePattern = t.Union[ # type: ignore
    Pattern,
    str,
    t.AbstractSet["PrototypePattern"], # type: ignore
    t.Mapping[str, "PrototypePattern"], # type: ignore
    t.Sequence["PrototypePattern"], # type: ignore
]
def ensure_pattern(prototype: PrototypePattern) -> Pattern:
    """Convert a prototype into a :class:`Pattern`.

    The conversion depends on the type of ``prototype``:

    * a ``Pattern`` is returned unchanged,
    * a ``str`` becomes a literal,
    * a mapping with exactly one entry becomes a named group (key is the
      name, value is converted recursively),
    * a set becomes a choice between its converted members,
    * anything else is iterated and becomes a sequence of its converted
      elements.

    Raises:
        ValueError: if a mapping does not have exactly one entry.
    """
    if isinstance(prototype, Pattern):
        return prototype
    if isinstance(prototype, str):
        # Must be tested before the generic fallback: strings are iterable too.
        return literal(prototype)
    if isinstance(prototype, t.Mapping):
        # Validate explicitly instead of asserting: assertions vanish under
        # `python -O`, which would silently drop all but the first entry.
        if len(prototype) != 1:
            raise ValueError(
                "a named group prototype must have exactly one entry, "
                f"got {len(prototype)}"
            )
        name, child = next(iter(prototype.items()))
        return NamedGroup(name, ensure_pattern(child))
    if isinstance(prototype, t.AbstractSet):
        return Choice(tuple(ensure_pattern(alternative) for alternative in prototype))
    return Sequence(tuple(ensure_pattern(element) for element in prototype))
def literal(literal: str) -> Literal:
    """Create a :class:`Literal` pattern holding the given string."""
    return Literal(literal=literal)
def repeat(
    pattern: PrototypePattern,
    *,
    lower_bound: int = 0,
    upper_bound: t.Optional[int] = None,
    is_greedy: bool = True,
) -> Repeat:
    """Create a :class:`Repeat` of ``pattern``.

    ``pattern`` is converted with ``ensure_pattern`` first; the remaining
    arguments are stored on the resulting node as given.
    """
    inner = ensure_pattern(pattern)
    return Repeat(inner, lower_bound, upper_bound, is_greedy)
def at_most_once(prototype: PrototypePattern, *, is_greedy: bool = True) -> Repeat:
    """Repeat ``prototype`` with a lower bound of 0 and an upper bound of 1."""
    optional = repeat(prototype, is_greedy=is_greedy, upper_bound=1, lower_bound=0)
    return optional
def at_least_once(
    prototype: PrototypePattern,
    *,
    greedy: bool = True,
    is_greedy: t.Optional[bool] = None,
) -> Repeat:
    """Repeat ``prototype`` with a lower bound of 1 and no upper bound.

    Args:
        prototype: What to repeat; converted by ``repeat``.
        greedy: Whether the repetition is greedy. Kept for backward
            compatibility.
        is_greedy: Same meaning as ``greedy``, spelled like the keyword of
            ``repeat`` and ``at_most_once``. When given (not ``None``) it
            takes precedence over ``greedy``.
    """
    effective = greedy if is_greedy is None else is_greedy
    return repeat(prototype, lower_bound=1, is_greedy=effective)
def character_range(start: str, end: str) -> Range:
    """Create a :class:`Range` from ``start`` to ``end``."""
    return Range(start=start, end=end)
def character_set(*elements: t.Union[Range, Class, str]) -> Set:
    """Create a non-negated :class:`Set` from the given elements."""
    members = frozenset(elements)
    return Set(members)
def choice(alternatives: t.Iterable[PrototypePattern]) -> Choice:
    """Create a :class:`Choice` between the converted ``alternatives``."""
    converted = tuple(map(ensure_pattern, alternatives))
    return Choice(converted)
def group(*alternatives: PrototypePattern) -> Group:
    """Create a :class:`Group` around a choice between ``alternatives``."""
    inner = choice(alternatives)
    return Group(pattern=inner)
def named_group(name: str, prototype: PrototypePattern) -> NamedGroup:
    """Create a :class:`NamedGroup` called ``name`` around the converted ``prototype``."""
    inner = ensure_pattern(prototype)
    return NamedGroup(name=name, pattern=inner)
def concat(*prototypes: PrototypePattern) -> Sequence:
    """Create a :class:`Sequence` of the converted ``prototypes`` in order."""
    converted = tuple(map(ensure_pattern, prototypes))
    return Sequence(converted)
def word(prototype: PrototypePattern) -> Pattern:
    """Surround the converted ``prototype`` with word boundaries on both sides."""
    boundary = Atom.WORD_BOUNDARY
    inner = ensure_pattern(prototype)
    return concat(boundary, inner, boundary)
# Ready-made patterns: one or more word characters, and one or more
# whitespace characters (both greedy, as `at_least_once` defaults to greedy).
WORD = at_least_once(Class.WORD)
WHITESPACE = at_least_once(Class.WHITESPACE)
__all__ = [
"Pattern",
"Atom",
"Class",
"PrototypePattern",
"ensure_pattern",
"literal",
"repeat",
"at_most_once",
"at_least_once",
"character_range",
"character_set",
"choice",
"group",
"named_group",
"concat",
"word",
"WORD",
"WHITESPACE",
]
# -*- coding:utf-8 -*-
#
# Copyright (C) 2020, Maximilian Köhl <mkoehl@cs.uni-saarland.de>
from __future__ import annotations
import typing as t
import abc
import dataclasses
import enum
import functools
import re
from mxu.itertools import iter_lookahead
from mxu.sanity import check_singledispatch
from . import patterns
from .patterns import Pattern
@dataclasses.dataclass(frozen=True)
class StringifyOptions:
    """Immutable settings for turning a pattern into a string.

    NOTE(review): the code that reads these fields is outside this excerpt;
    judging by the names they control how group names are generated. Confirm
    against the stringifier.
    """

    hierarchic_groups: bool = True
    strip_group_names: bool = False
    group_name_prefix: str = ""
# Options object with every field left at its declared default.
DEFAULT_OPTIONS = StringifyOptions()
class _Style(abc.ABC):
    """Interface for the syntax differences between regular-expression flavours."""

    @abc.abstractmethod
    def begin_named_group(self, name: str, ctx: _Context) -> None:
        """Emit the opening of a named group called ``name`` into ``ctx``."""
        raise NotImplementedError()
class _StylePython(_Style):
    """Named-group syntax of Python's ``re`` module: ``(?P<name>``."""

    def begin_named_group(self, name: str, ctx: _Context) -> None:
        opening = f"(?P<{name}>"
        ctx.chunks.append(opening)
class _StyleOniguruma(_Style):
    """Named-group syntax without the ``P``: ``(?<name>``."""

    def begin_named_group(self, name: str, ctx: _Context) -> None:
        opening = f"(?<{name}>"
        ctx.chunks.append(opening)
class Style(enum.Enum):
    """Available output styles; each member's value is a style implementation."""

    PYTHON = _StylePython()
    ONIGURUMA = _StyleOniguruma()

    def __repr__(self) -> str:
        kind = type(self).__name__
        return f"<{kind}.{self.name}>"
# NOTE(review): presumably the style used when a caller does not pick one;
# the code that reads it is outside this excerpt.
DEFAULT_STYLE = Style.PYTHON
# A path is a tuple of strings; the match module yields such paths alongside
# group names when iterating named groups.
Path = t.Tuple[str, ...]
@dataclasses.dataclass(frozen=True)
class StringifyResult: