# Source code for pyregexp.lexer

from typing import List
from .tokens import *


class Lexer:
    """ Lexer for the pyregexp library.

    This class contains the method to scan a regular expression string producing the corresponding tokens.
    """

    def __init__(self) -> None:
        # Characters accepted as digits inside a `{...}` quantifier.
        self.__digits__ = '0123456789'

    def __is_digit__(self, ch: str) -> bool:
        """ Tells whether ch occurs in the lexer's digit characters.

        Args:
            ch (str): the character to test

        Returns:
            bool: True if ch is found in the digits string, False otherwise
        """
        return ch in self.__digits__

    def scan(self, re: str) -> List[Token]:
        """ Regular expressions scanner.

        Scans the regular expression in input and produces the list of recognized Tokens in output.
        A backslash escapes the character that follows it: the escaped character yields exactly
        one token (a tab ElementToken for 't', a SpaceToken for 's', otherwise an ElementToken
        holding the character itself).

        Args:
            re (str): the regular expression to scan

        Returns:
            List[Token]: the list of tokens recognized in the passed regex

        Raises:
            Exception: if a character other than a digit, ',' or '}' is found
                after a '{' and before its closing '}'
        """
        tokens: List[Token] = []

        i = 0
        escape_found = False
        while i < len(re):
            ch = re[i]
            if escape_found:
                # The three cases are mutually exclusive so that every escaped
                # character produces one token only (previously `\t` yielded a
                # tab token followed by a spurious literal 't' token).
                if ch == 't':
                    tokens.append(ElementToken(char='\t'))
                elif ch == 's':
                    # \s matches a space character
                    tokens.append(SpaceToken(char=ch))
                else:
                    tokens.append(ElementToken(char=ch))
            elif ch == '\\':
                # Remember the escape and move on to the escaped character,
                # skipping the flag reset at the bottom of the loop.
                # NOTE: a backslash that is the last character of `re` is
                # dropped without emitting a token.
                escape_found = True
                i += 1
                continue
            elif ch == '.':
                tokens.append(Wildcard())
            elif ch == '(':
                tokens.append(LeftParenthesis())
            elif ch == ')':
                tokens.append(RightParenthesis())
            elif ch == '[':
                tokens.append(LeftBracket())
            elif ch == '-':
                tokens.append(Dash())
            elif ch == ']':
                tokens.append(RightBracket())
            elif ch == '{':
                tokens.append(LeftCurlyBrace())
                # Consume the quantifier body: only digits (one ElementToken
                # per digit) and commas are allowed up to the closing brace.
                # NOTE: if `re` ends before a '}' is met the loop simply stops;
                # no exception is raised here for the unterminated brace.
                i += 1
                while i < len(re):
                    ch = re[i]
                    if ch == ',':
                        tokens.append(Comma())
                    elif self.__is_digit__(ch):
                        tokens.append(ElementToken(char=ch))
                    elif ch == '}':
                        tokens.append(RightCurlyBrace())
                        break
                    else:
                        raise Exception("Bad token at index ${}.".format(i))
                    i += 1
            elif ch == '^':
                # Only a '^' in the very first position is a Start token.
                if i == 0:
                    tokens.append(Start())
                else:
                    tokens.append(Circumflex())
            elif ch == '$':
                tokens.append(End())
            elif ch == '?':
                tokens.append(QuestionMark())
            elif ch == '*':
                tokens.append(Asterisk())
            elif ch == '+':
                tokens.append(Plus())
            elif ch == '|':
                tokens.append(VerticalBar())
            elif ch == '}':
                # A closing brace met outside of a '{...}' group.
                tokens.append(RightCurlyBrace())
            else:
                tokens.append(ElementToken(char=ch))

            escape_found = False
            i += 1

        return tokens