from typing import Union, Callable
import itertools
import math
from .lexer import Lexer
from .tokens import *
from .re_ast import *
[docs]class Pyrser:
""" Regular Expression Parser.
Pyrser instances can parse regular expressions and return the corresponding AST.
"""
def __init__(self) -> None:
self.lxr: Lexer = Lexer()
[docs] def parse(self, re: str) -> RE:
""" Parses a regular expression.
Parses a regex and returns the corresponding AST.
If the regex contains errors raises an Exception.
Args:
re (str): a regular expression
Returns:
RE: the root node of the regular expression's AST
"""
def get_range_str(start: str, end: str) -> str:
result = ''
i = ord(start)
while i <= ord(end):
result += chr(i)
i += 1
return result
def next_tkn_initializer(re: str) -> Callable[[bool], Union[Token, None]]:
tokens = self.lxr.scan(re=re)
i = -1
def next_tkn(without_consuming: bool = False) -> Union[Token, None]:
nonlocal i
nonlocal tokens
nonlocal curr_tkn
if without_consuming:
return tokens[i+1] if len(tokens) > i+1 else None
i += 1
if i < len(tokens):
curr_tkn = tokens[i]
else:
curr_tkn = None
return next_tkn
def parse_re() -> RE:
return RE(parse_re_seq())
def parse_re_seq(capturing: bool = True, group_name: str = None, group_id: int = None) -> Union[OrNode, GroupNode]:
match_start, match_end = False, False
if type(curr_tkn) is Start or type(curr_tkn) is Circumflex:
next_tkn()
match_start = True
node = parse_group(capturing=capturing,
group_name=group_name, group_id=group_id)
if isinstance(curr_tkn, EndToken):
next_tkn()
match_end = True
else:
match_end = False
if match_start:
node.children.appendleft(StartElement())
if match_end:
node.children.append(EndElement())
if isinstance(curr_tkn, OrToken):
next_tkn()
node = OrNode(left=node, right=parse_re_seq(
group_name=node.group_name, group_id=node.group_id))
return node
def parse_group(capturing: bool = True, group_name: str = None, group_id: int = None) -> GroupNode:
nonlocal groups_counter
if group_id is None:
group_id = next(groups_counter)
elements = deque() # holds the children of the GroupNode
while curr_tkn is not None and not isinstance(curr_tkn, OrToken) and \
not isinstance(curr_tkn, RightParenthesis) and \
not isinstance(curr_tkn, EndToken):
new_el = parse_range_el()
next_tkn()
if isinstance(curr_tkn, EndToken):
elements.append(new_el)
break
if isinstance(curr_tkn, Quantifier):
if isinstance(curr_tkn, ZeroOrOne):
new_el.min, new_el.max = 0, 1
elif isinstance(curr_tkn, ZeroOrMore):
new_el.min, new_el.max = 0, math.inf
else:
# suppose it's 1+
new_el.min, new_el.max = 1, math.inf
next_tkn()
elif isinstance(curr_tkn, LeftCurlyBrace):
parse_curly(new_el)
elements.append(new_el)
return GroupNode(children=elements, capturing=capturing, group_name=group_name, group_id=group_id)
def parse_curly(new_el: ASTNode) -> None:
# move past the left brace
next_tkn()
# find val_1, val_2
val_1, val_2 = '', ''
try:
while isinstance(curr_tkn, ElementToken):
val_1 += curr_tkn.char
next_tkn()
if val_1 == '':
val_1 == 0
else:
val_1 = int(val_1)
if isinstance(curr_tkn, RightCurlyBrace):
# case {exact}
if type(val_1) is int:
new_el.min, new_el.max = val_1, val_1
next_tkn() # skip the closing brace
return
else:
raise Exception("Invalid curly brace syntax.")
next_tkn()
while isinstance(curr_tkn, ElementToken):
val_2 += curr_tkn.char
next_tkn()
if val_2 == '':
val_2 == math.inf
else:
val_2 = int(val_2)
next_tkn() # skip the closing brace
new_el.min = val_1 if type(val_1) is int else 0
new_el.max = val_2 if type(val_2) is int else math.inf
except Exception as e:
raise Exception("Invalid curly brace syntax.")
def parse_range_el() -> ASTNode:
if isinstance(curr_tkn, LeftBracket):
next_tkn()
element = parse_inner_el()
if isinstance(curr_tkn, RightBracket):
return element
else:
raise Exception(
"Missing closing ']'.")
else:
return parse_el()
def parse_inner_el() -> RangeElement:
# parse_inner_el creates a single RangeElement with all the matches
nonlocal curr_tkn
match_str = ''
if curr_tkn is None:
raise Exception(
"Missing closing ']'.")
positive_logic = True
if isinstance(curr_tkn, NotToken):
positive_logic = False
next_tkn()
prev_char = None
while curr_tkn is not None:
if isinstance(curr_tkn, RightBracket):
break
if isinstance(curr_tkn, SpaceToken):
match_str += curr_tkn.char
next_tkn()
continue
# every character inside it must be treated as an element
if not isinstance(curr_tkn, ElementToken):
curr_tkn = ElementToken(char=curr_tkn.char)
if next_tkn(without_consuming=True) is None:
raise Exception("Missing closing ']'.")
elif isinstance(next_tkn(without_consuming=True), Dash):
# it may be a range (like a-z, A-M, 0-9, ...)
prev_char = curr_tkn.char
next_tkn() # current token is now the Dash
if isinstance(next_tkn(without_consuming=True), RightBracket) or isinstance(next_tkn(without_consuming=True), SpaceToken):
# we're in one of these scenarios: "<char>-]" "<char>-\s"
# the dash and previous character must be interpreted as single elements
match_str += prev_char + curr_tkn.char
else:
# we're in the case of an actual range (or next_tkn is none)
next_tkn() # curr_tkn is now the one after the dash
if next_tkn is None:
raise Exception("Missing closing ']'.")
elif ord(prev_char) > ord(curr_tkn.char):
raise Exception(
f"Range values reversed. Start '{prev_char}' char code is greater than end '{curr_tkn.char}' char code.")
else:
match_str += get_range_str(prev_char,
curr_tkn.char)
else:
# no range, no missing ']', just a char to add to match_str
match_str += curr_tkn.char
next_tkn()
return RangeElement(match_str="".join(sorted(set(match_str))), is_positive_logic=positive_logic)
def parse_el() -> Union[Element, OrNode, GroupNode]:
group_name: Union[str, None]
group_name = None
if isinstance(curr_tkn, ElementToken):
return Element(match_ch=curr_tkn.char)
elif isinstance(curr_tkn, Wildcard):
return WildcardElement()
elif isinstance(curr_tkn, SpaceToken):
return SpaceElement()
elif isinstance(curr_tkn, LeftParenthesis):
next_tkn()
# (?: for non-capturing group
capturing = True
if type(curr_tkn) is QuestionMark:
next_tkn()
if curr_tkn.char == ':':
capturing = False
next_tkn()
elif curr_tkn.char == '<':
next_tkn()
group_name = parse_group_name()
else:
if curr_tkn is None:
raise Exception("Unterminated group.")
else:
raise Exception(
f"Invalid group: '{{?{curr_tkn.char}'.")
res = parse_re_seq(capturing=capturing, group_name=group_name)
if isinstance(curr_tkn, RightParenthesis):
# next_tkn() not needed (parse_group's while loop will eat the parenthesis)
return res
else:
raise Exception("Missing closing group parenthesis ')'.")
else:
raise Exception(
"Unescaped special character {}.".format(curr_tkn.char))
def parse_group_name() -> str:
if curr_tkn is None:
raise Exception("Unterminated named group name.")
group_name = ''
while curr_tkn.char != '>':
group_name += curr_tkn.char
next_tkn()
if curr_tkn is None:
raise Exception("Unterminated named group name.")
if len(group_name) == 0:
raise Exception("Unexpected empty named group name.")
next_tkn() # consumes '>'
return group_name
groups_counter = itertools.count(start=0)
curr_tkn = None
next_tkn = next_tkn_initializer(re)
next_tkn()
ast = parse_re()
if curr_tkn is not None:
raise Exception(
"Unable to parse the regex.")
return ast