Source code for pyregexp.lexer

from typing import List
from .tokens import *


[docs]class Lexer: """ Lexer for the pyregexp library. This class contains the method to scan a regular expression string producing the corresponding tokens. """ def __init__(self) -> None: self.__digits__ = '0123456789' def __is_digit__(self, ch: str) -> bool: return self.__digits__.find(ch) > -1
[docs] def scan(self, re: str) -> List[Token]: """ Regular expressions scanner. Scans the regular expression in input and produces the list of recognized Tokens in output. It raises an Exception if there are errors in the regular expression. Args: re (str): the regular expression to scan Returns: List[Token]: the list of tokens recognized in the passed regex """ tokens = [] def append(elem: Token) -> None: nonlocal tokens tokens.append(elem) i = 0 escape_found = False while i < len(re): ch = re[i] if escape_found: if ch == 't': append(ElementToken(char='\t')) if ch == 's': # \s matches a space character append(SpaceToken(char=ch)) else: append(ElementToken(char=ch)) elif ch == '\\': escape_found = True i += 1 # otherwise i won't be incremented bc of continue continue elif ch == '.': append(Wildcard()) elif ch == '(': append(LeftParenthesis()) elif ch == ')': append(RightParenthesis()) elif ch == '[': append(LeftBracket()) elif ch == '-': append(Dash()) elif ch == ']': append(RightBracket()) elif ch == '{': append(LeftCurlyBrace()) i += 1 while i < len(re): ch = re[i] if ch == ',': append(Comma()) elif self.__is_digit__(ch): append(ElementToken(char=ch)) elif ch == '}': append(RightCurlyBrace()) break else: raise Exception("Bad token at index ${}.".format(i)) i += 1 elif ch == '^': if i == 0: append(Start()) else: append(Circumflex()) elif ch == '$': append(End()) elif ch == '?': append(QuestionMark()) elif ch == '*': append(Asterisk()) elif ch == '+': append(Plus()) elif ch == '|': append(VerticalBar()) elif ch == '}': append(RightCurlyBrace()) else: append(ElementToken(char=ch)) escape_found = False i += 1 return tokens