Source code for pyregexp.pyrser

from typing import Union, Callable
import itertools
import math
from .lexer import Lexer
from .tokens import *
from .re_ast import *


[docs]class Pyrser:
    """ Regular Expression Parser.

    Pyrser instances can parse regular expressions and return the corresponding AST.    
    """

    def __init__(self) -> None:
        self.lxr: Lexer = Lexer()

[docs]    def parse(self, re: str) -> RE:
        """ Parses a regular expression.

        Parses a regex and returns the corresponding AST.
        If the regex contains errors raises an Exception.

        Args:
            re (str): a regular expression

        Returns:
            RE: the root node of the regular expression's AST
        """

        def get_range_str(start: str, end: str) -> str:
            result = ''
            i = ord(start)
            while i <= ord(end):
                result += chr(i)
                i += 1
            return result

        def next_tkn_initializer(re: str) -> Callable[[bool], Union[Token, None]]:
            tokens = self.lxr.scan(re=re)

            i = -1

            def next_tkn(without_consuming: bool = False) -> Union[Token, None]:
                nonlocal i
                nonlocal tokens
                nonlocal curr_tkn

                if without_consuming:
                    return tokens[i+1] if len(tokens) > i+1 else None

                i += 1
                if i < len(tokens):
                    curr_tkn = tokens[i]
                else:
                    curr_tkn = None

            return next_tkn

        def parse_re() -> RE:
            return RE(parse_re_seq())

        def parse_re_seq(capturing: bool = True, group_name: str = None, group_id: int = None) -> Union[OrNode, GroupNode]:
            match_start, match_end = False, False
            if type(curr_tkn) is Start or type(curr_tkn) is Circumflex:
                next_tkn()
                match_start = True

            node = parse_group(capturing=capturing,
                               group_name=group_name, group_id=group_id)

            if isinstance(curr_tkn, EndToken):
                next_tkn()
                match_end = True
            else:
                match_end = False

            if match_start:
                node.children.appendleft(StartElement())
            if match_end:
                node.children.append(EndElement())

            if isinstance(curr_tkn, OrToken):
                next_tkn()
                node = OrNode(left=node, right=parse_re_seq(
                    group_name=node.group_name, group_id=node.group_id))

            return node

        def parse_group(capturing: bool = True, group_name: str = None, group_id: int = None) -> GroupNode:
            nonlocal groups_counter
            if group_id is None:
                group_id = next(groups_counter)

            elements = deque()  # holds the children of the GroupNode

            while curr_tkn is not None and not isinstance(curr_tkn, OrToken) and \
                    not isinstance(curr_tkn, RightParenthesis) and \
                    not isinstance(curr_tkn, EndToken):
                new_el = parse_range_el()

                next_tkn()

                if isinstance(curr_tkn, EndToken):
                    elements.append(new_el)
                    break

                if isinstance(curr_tkn, Quantifier):
                    if isinstance(curr_tkn, ZeroOrOne):
                        new_el.min, new_el.max = 0, 1
                    elif isinstance(curr_tkn, ZeroOrMore):
                        new_el.min, new_el.max = 0, math.inf
                    else:
                        # suppose it's 1+
                        new_el.min, new_el.max = 1, math.inf
                    next_tkn()
                elif isinstance(curr_tkn, LeftCurlyBrace):
                    parse_curly(new_el)

                elements.append(new_el)

            return GroupNode(children=elements, capturing=capturing, group_name=group_name, group_id=group_id)

        def parse_curly(new_el: ASTNode) -> None:
            # move past the left brace
            next_tkn()

            # find val_1, val_2
            val_1, val_2 = '', ''
            try:
                while isinstance(curr_tkn, ElementToken):
                    val_1 += curr_tkn.char
                    next_tkn()
                if val_1 == '':
                    val_1 == 0
                else:
                    val_1 = int(val_1)

                if isinstance(curr_tkn, RightCurlyBrace):
                    # case {exact}
                    if type(val_1) is int:
                        new_el.min, new_el.max = val_1, val_1
                        next_tkn()  # skip the closing brace
                        return
                    else:
                        raise Exception("Invalid curly brace syntax.")

                next_tkn()
                while isinstance(curr_tkn, ElementToken):
                    val_2 += curr_tkn.char
                    next_tkn()
                if val_2 == '':
                    val_2 == math.inf
                else:
                    val_2 = int(val_2)

                next_tkn()  # skip the closing brace

                new_el.min = val_1 if type(val_1) is int else 0
                new_el.max = val_2 if type(val_2) is int else math.inf

            except Exception as e:
                raise Exception("Invalid curly brace syntax.")

        def parse_range_el() -> ASTNode:
            if isinstance(curr_tkn, LeftBracket):
                next_tkn()
                element = parse_inner_el()
                if isinstance(curr_tkn, RightBracket):
                    return element
                else:
                    raise Exception(
                        "Missing closing ']'.")
            else:
                return parse_el()

        def parse_inner_el() -> RangeElement:
            # parse_inner_el creates a single RangeElement with all the matches
            nonlocal curr_tkn
            match_str = ''
            if curr_tkn is None:
                raise Exception(
                    "Missing closing ']'.")

            positive_logic = True
            if isinstance(curr_tkn, NotToken):
                positive_logic = False
                next_tkn()

            prev_char = None
            while curr_tkn is not None:
                if isinstance(curr_tkn, RightBracket):
                    break

                if isinstance(curr_tkn, SpaceToken):
                    match_str += curr_tkn.char
                    next_tkn()
                    continue

                # every character inside it must be treated as an element
                if not isinstance(curr_tkn, ElementToken):
                    curr_tkn = ElementToken(char=curr_tkn.char)

                if next_tkn(without_consuming=True) is None:
                    raise Exception("Missing closing ']'.")
                elif isinstance(next_tkn(without_consuming=True), Dash):
                    # it may be a range (like a-z, A-M, 0-9, ...)
                    prev_char = curr_tkn.char
                    next_tkn()  # current token is now the Dash
                    if isinstance(next_tkn(without_consuming=True), RightBracket) or isinstance(next_tkn(without_consuming=True), SpaceToken):
                        # we're in one of these scenarios: "<char>-]" "<char>-\s"
                        # the dash and previous character must be interpreted as single elements
                        match_str += prev_char + curr_tkn.char
                    else:
                        # we're in the case of an actual range (or next_tkn is none)
                        next_tkn()  # curr_tkn is now the one after the dash
                        if next_tkn is None:
                            raise Exception("Missing closing ']'.")
                        elif ord(prev_char) > ord(curr_tkn.char):
                            raise Exception(
                                f"Range values reversed. Start '{prev_char}' char code is greater than end '{curr_tkn.char}' char code.")
                        else:
                            match_str += get_range_str(prev_char,
                                                       curr_tkn.char)
                else:
                    # no range, no missing ']', just a char to add to match_str
                    match_str += curr_tkn.char
                next_tkn()

            return RangeElement(match_str="".join(sorted(set(match_str))), is_positive_logic=positive_logic)

        def parse_el() -> Union[Element, OrNode, GroupNode]:
            group_name: Union[str, None]
            group_name = None
            if isinstance(curr_tkn, ElementToken):
                return Element(match_ch=curr_tkn.char)
            elif isinstance(curr_tkn, Wildcard):
                return WildcardElement()
            elif isinstance(curr_tkn, SpaceToken):
                return SpaceElement()
            elif isinstance(curr_tkn, LeftParenthesis):
                next_tkn()
                # (?: for non-capturing group
                capturing = True
                if type(curr_tkn) is QuestionMark:
                    next_tkn()
                    if curr_tkn.char == ':':
                        capturing = False
                        next_tkn()
                    elif curr_tkn.char == '<':
                        next_tkn()
                        group_name = parse_group_name()
                    else:
                        if curr_tkn is None:
                            raise Exception("Unterminated group.")
                        else:
                            raise Exception(
                                f"Invalid group: '{{?{curr_tkn.char}'.")
                res = parse_re_seq(capturing=capturing, group_name=group_name)
                if isinstance(curr_tkn, RightParenthesis):
                    # next_tkn() not needed (parse_group's while loop will eat the parenthesis)
                    return res
                else:
                    raise Exception("Missing closing group parenthesis ')'.")
            else:
                raise Exception(
                    "Unescaped special character {}.".format(curr_tkn.char))

        def parse_group_name() -> str:
            if curr_tkn is None:
                raise Exception("Unterminated named group name.")
            group_name = ''
            while curr_tkn.char != '>':
                group_name += curr_tkn.char
                next_tkn()
                if curr_tkn is None:
                    raise Exception("Unterminated named group name.")
            if len(group_name) == 0:
                raise Exception("Unexpected empty named group name.")
            next_tkn()  # consumes '>'
            return group_name

        groups_counter = itertools.count(start=0)

        curr_tkn = None
        next_tkn = next_tkn_initializer(re)
        next_tkn()

        ast = parse_re()
        if curr_tkn is not None:
            raise Exception(
                "Unable to parse the regex.")
        return ast