Source code for pyregexp.engine

"""Module containing the RegexEngine class.

The RegexEngine class implements a regular expressions engine.

Example:
    Matching a regex with some test string::

        reng = RegexEngine()
        result, consumed = reng.match(r"a+bx", "aabx")
"""


from collections import deque
from typing import Callable, Deque, Union, Tuple, List
import unicodedata
from .pyrser import Pyrser
from .match import Match
from .re_ast import RE, GroupNode, LeafNode, OrNode, EndElement, StartElement


[docs]class RegexEngine: """ Regular Expressions Engine. This class contains all the necessary to recognize regular expressions in a test string. """ def __init__(self): self.parser: Pyrser = Pyrser() self.prev_re: str = None self.prev_ast: RE = None
[docs] def match(self, re: str, string: str, return_matches: bool = False, continue_after_match: bool = False, ignore_case: int = 0) -> Union[Tuple[bool, int, List[Deque[Match]]], Tuple[bool, int]]: """ Searches a regex in a test string. Searches the passed regular expression in the passed test string and returns the result. It is possible to customize both the returned value and the search method. The ignore_case flag may cause unexpected results in the returned number of matched characters, and also in the returned matches, e.g. when the character ẞ is present in either the regex or the test string. Args: re (str): the regular expression to search string (str): the test string return_matches (bool): if True a data structure containing the matches - the whole match and the subgroups matched (default is False) continue_after_match (bool): if True the engine continues matching until the whole input is consumed (default is False) ignore_case (int): when 0 the case is not ignored, when 1 a "soft" case ignoring is performed, when 2 casefolding is performed. (default is 0) Returns: A tuple containing whether a match was found or not, the last matched character index, and, if return_matches is True, a list of deques of Match, where each list of matches represents in the first position the whole match, and in the subsequent positions all the group and subgroups matched. """ def return_fnc(res: bool, consumed: int, all_matches: List[Deque[Match]], return_matches: bool) -> Union[Tuple[bool, int, List[Deque[Match]]], Tuple[bool, int]]: """ Create the Tuple to return.""" if return_matches: return res, consumed, all_matches else: return res, consumed if ignore_case == 1: re = unicodedata.normalize("NFKD", re).lower() string = unicodedata.normalize("NFKD", string).casefold() elif ignore_case == 2: re = unicodedata.normalize("NFKD", re).casefold() string = unicodedata.normalize("NFKD", string).casefold() ast = self.parser.parse(re=re) if self.prev_re != re else self.prev_ast self.prev_re = re self.prev_ast = ast # variables holding the matched groups list for each matched substring in the test string all_matches: List[Deque[Match]] = [] highest_matched_idx: int = 0 # holds the highest matched string's index res, consumed, matches = self.__match__(ast, string, 0) if res: highest_matched_idx = consumed all_matches.append(matches) else: return return_fnc(res, highest_matched_idx, all_matches, return_matches) if not continue_after_match or not consumed > 0: return return_fnc(res, highest_matched_idx, all_matches, return_matches) while True: res, consumed, matches = self.__match__(ast, string, consumed) # if consumed is not grater than highest_matched_idx this means the new match # consumed 0 characters, so there is really nothing more to match if res and consumed > highest_matched_idx: highest_matched_idx = consumed all_matches.append(matches) else: return return_fnc(True, highest_matched_idx, all_matches, return_matches)
def __match__(self, ast: RE, string: str, start_str_i: int) -> Tuple[bool, int, Deque[Match]]: """ Same as match, but always returns after the first match.""" matches: Deque[Match] = deque() # used to restore the left match of a ornode if necessary last_match: Match = None # str_i represents the matched characters so far. It is inizialized to # the value of the input parameter start_str_i because the match could # be to be searched starting at an index different from 0, e.g. in the # case this function is called to search a second match in the test # string. str_i = start_str_i # max_matched_idx represents the "upper limit" of the match. # It is necessary when backtracking in the presence of nested # quantifiers, because we need a way to "tell" the group that # is causing the fail by being too greedy to stop earlier if # possible. max_matched_idx = -1 def return_fnc(res: bool, str_i: int) -> Tuple[bool, int, Deque[Match]]: """ Returns the Tuple to be returned by __match__.""" nonlocal matches return res, str_i, matches def save_matches(match_group: Callable, ast: Union[RE, GroupNode], string: str, start_idx: int, max_matched_idx=-1) -> Tuple[bool, int]: """ Save the matches of capturing groups. Args: match_group (Callable): the function to use to match the group ast (Union[RE, GroupNode]): the group to match string (str): the string to match start_idx (int): the starting index Returns: A tuple of the boolean result of the match, and the last matched index. """ nonlocal matches nonlocal last_match res, end_idx = match_group(ast, string, max_matched_idx) if ast.is_capturing() and res == True: for i in range(0, len(matches)): if matches[i].group_id == ast.group_id: last_match = matches[i] matches.remove(matches[i]) break matches.appendleft( Match(ast.group_id, start_idx, end_idx, string, ast.group_name)) return res, end_idx def remove_leftmost_match(): """ Used when matching an OrNode. When matching an OrNode the right children is always saved instead of saving the left one when the chosen path goes left. By calling this function you remove the leftmost match (the one created by the right child). """ nonlocal matches matches.popleft() def appendleft_last_match(): """ Used when matching an OrNode. When matching an OrNode the right children is always saved instead of saving the left one when the chosen path goes left. By calling this function you restore the left match. """ nonlocal matches matches.appendleft(last_match) def match_group(ast: Union[RE, GroupNode, OrNode], string: str, max_matched_idx: int = -1) -> Tuple[bool, int]: """ Match a group, which is always the case.s Returns the match state (True or False) and the new string i, that is the number of matched characters in the string so far. """ nonlocal start_str_i nonlocal str_i backtrack_stack: List[Tuple[int, int, int, List[int]]] = [] def backtrack(str_i: int, curr_child_i: int, recursive: bool = False) -> Tuple[bool, int, int]: """ Returns whether it is possible to backtrack and the state to backtrack to. Takes as input the current state of the engine and returns whether or not it is possible to backtrack. Args: str_i (int): the current considered index of the test string curr_child_i (int): the index of the GroupNode children considered Returns: A Tuple containing a bool, True if it is possible to backtrack, the new string index, and the new node children index to which backtrack to. Note that the last two parameters only have a meaning in the case it is possible to backtrack (the bool is True). """ nonlocal backtrack_stack nonlocal max_matched_idx nonlocal ast if len(backtrack_stack) == 0: return False, str_i, curr_child_i # the fist step is to pop the last tuple from the backtrack_stack popped_child_i, min_, matched_times, consumed_list = backtrack_stack.pop() if matched_times == min_: # if a node is already matched the minimum number of times, the # chance you have to potentially be able to backtrack is to is # to delete the entry from the stack and then search for a new # possibility (recursively calling this function). # But, before the recursion, you have to calculate what the # string index (str_i) value was before the node was matched # even once. Thus, you have to decrease the string index # of each consumption in the consumed_list. # calculate_the new str_i before_str_i = str_i for consumption in consumed_list: str_i -= consumption if max_matched_idx == -1 or isinstance(ast.children[popped_child_i], LeafNode) or before_str_i == str_i: # recursive call return backtrack(str_i, popped_child_i, True) else: # case of backtracking from nested quantifier # returns "not recursive" because if it is the case # of a recursive call, this is outside of the case of # simply nested quantifiers, and in I cannot backtrack # anymore return not recursive, str_i, popped_child_i else: # the node was matched more times than its min, so you just # need to remove the last consumption from the list, # decrease the str_i by that amount, decrease the times the node # was matched - matched_times - by 1, and then append the stack # the tuple with the new matched_times and consumed_list. last_consumed = consumed_list.pop() new_str_i = str_i - last_consumed if max_matched_idx == -1 or isinstance(ast.children[popped_child_i], LeafNode): backtrack_stack.append( (popped_child_i, min_, matched_times - 1, consumed_list)) # lastly, you return that the backtracking is possible, and # the state to which backtrack to. return True, new_str_i, curr_child_i else: # case of backtracking from nested quantifier return not recursive, new_str_i, popped_child_i def remove_this_node_from_stack(curr_child_i: int, str_i: int) -> int: """ Removes node from stack and returns the new str_i. """ nonlocal backtrack_stack popped_child_i, min_, matched_times, consumed_list = backtrack_stack.pop() if popped_child_i == curr_child_i: for consumption in consumed_list: str_i -= consumption else: backtrack_stack.append((popped_child_i, min_, matched_times, consumed_list)) return str_i curr_node = ast.children[0] if len(ast.children) > 0 else None i = 0 # the children i'm iterating, not to confuse with str_i if isinstance(ast, OrNode): # matcha il primo, se matcha return true # se no matcha il secondo # se matcha return true, altrimenti false tmp_str_i = str_i res, new_str_i = save_matches( match_group, curr_node, string, str_i, max_matched_idx) if not isinstance(curr_node, OrNode) else match_group(curr_node, string, max_matched_idx) if not res: str_i = tmp_str_i curr_node = ast.right res, new_str_i = save_matches( match_group, curr_node, string, str_i, max_matched_idx) if not isinstance(curr_node, OrNode) else match_group(curr_node, string, max_matched_idx) str_i = new_str_i return res, str_i # the passed ast can't be a Leaf while i < len(ast.children): curr_node = ast.children[i] # if is OrNode I evaluate the sub-groups with a recursive call if isinstance(curr_node, OrNode): before_str_i = str_i min_, max_ = curr_node.min, curr_node.max j = 0 consumed_list = [] backtracking = False while j < max_: tmp_str_i = str_i save_match_left = isinstance(curr_node.left, GroupNode) res_left, str_i_left = save_matches(match_group, curr_node.left, string, str_i, max_matched_idx) if save_match_left else match_group(curr_node.left, string, max_matched_idx) str_i = tmp_str_i save_match_right = isinstance(curr_node.right, GroupNode) res_right, str_i_right = save_matches(match_group, curr_node.right, string, str_i, max_matched_idx) if save_match_right else match_group(curr_node.right, string, max_matched_idx) if res_left and res_right: # choose the one that consumed the most character # unless it exceeds the max_matched_idx chose_left = (str_i_left >= str_i_right) str_i = str_i_left if chose_left else str_i_right if max_matched_idx != -1 and str_i > max_matched_idx: # tries to stay below the max_matched_idx threshold str_i = str_i_right if chose_left else str_i_left if chose_left: if save_match_right: remove_leftmost_match() if save_match_left: appendleft_last_match() else: # chose right if save_match_left and not save_match_right: # there is a spurious match originated from # the left child remove_leftmost_match() elif res_left and not res_right: str_i = str_i_left elif not res_left and res_right: str_i = str_i_right res = (res_left or res_right) if res == True and (max_matched_idx == -1 or str_i <= max_matched_idx): if (str_i - tmp_str_i == 0) and j >= min_: max_matched_idx = -1 break consumed_list.append(str_i - tmp_str_i) else: if min_ <= j: max_matched_idx = -1 break if i > 0 and not isinstance(ast.children[i-1], LeafNode): str_i = remove_this_node_from_stack(i, str_i) if str_i == start_str_i: return False, str_i max_matched_idx = str_i - 1 if max_matched_idx == -1 else max_matched_idx - 1 can_bt, bt_str_i, bt_i = backtrack(str_i, i) if can_bt: i = bt_i str_i = bt_str_i backtracking = True break # retry to match the current node else: return False, str_i j += 1 if not backtracking: backtrack_stack.append( (i, min_, j, consumed_list)) max_matched_idx = -1 i += 1 continue elif isinstance(curr_node, GroupNode): min_, max_ = curr_node.min, curr_node.max j = 0 consumed_list = [] before_str_i = str_i backtracking = False while j < max_: tmp_str_i = str_i res, new_str_i = save_matches( match_group, curr_node, string, str_i, max_matched_idx) if res == True and (max_matched_idx == -1 or new_str_i <= max_matched_idx): # i must use tmp_str_i because str_i is changed by the match_group # call, so (new_str_i - str_i) would be always 0 if (new_str_i - tmp_str_i == 0) and j >= min_: max_matched_idx = -1 break consumed_list.append(new_str_i - tmp_str_i) #str_i = new_str_i else: if min_ <= j: # i did the bare minimum or more max_matched_idx = -1 break if i > 0 and not isinstance(ast.children[i-1], LeafNode): str_i = remove_this_node_from_stack(i, str_i) if str_i == start_str_i: return False, str_i max_matched_idx = str_i - 1 if max_matched_idx == -1 else max_matched_idx - 1 can_bt, bt_str_i, bt_i = backtrack(str_i, i) if can_bt: i = bt_i str_i = bt_str_i backtracking = True break # retry to match the current node else: return False, str_i j += 1 # if NOT backtracking iterate the next element, and put the # current on the backtrack_stack, otherwise don't increment i, don't put on the # stack so to retry the current one (just continue) if not backtracking: backtrack_stack.append( (i, min_, j, consumed_list)) max_matched_idx = -1 i += 1 continue elif isinstance(curr_node, LeafNode): # it is a LeafNode obviously now min_, max_ = curr_node.min, curr_node.max j = 0 consumed_list = [] before_str_i = str_i # to discard changes made in case i need to bt backtracking = False while j < max_: if str_i < len(string): # i still have input to match if curr_node.is_match(ch=string[str_i], str_i=str_i, str_len=len(string)) and (max_matched_idx == -1 or str_i < max_matched_idx): if not (isinstance(curr_node, StartElement) or isinstance(curr_node, EndElement)): consumed_list.append(1) str_i += 1 else: if min_ <= j: # I already met the minimum requirement for match break if i > 0 and not isinstance(ast.children[i-1], LeafNode): str_i = remove_this_node_from_stack(i, str_i) if str_i == start_str_i: return False, str_i max_matched_idx = str_i - 1 can_bt, bt_str_i, bt_i = backtrack( before_str_i, i) if can_bt: i = bt_i str_i = bt_str_i backtracking = True break else: return False, str_i else: # finished input if isinstance(curr_node, StartElement) or isinstance(curr_node, EndElement) and curr_node.is_match(str_i=str_i, str_len=len(string)): pass # finished input w/o finishing the regex tree elif min_ <= j: break else: # i have more states, but the input is finished can_bt, bt_str_i, bt_i = backtrack( before_str_i, i) if can_bt: i = bt_i str_i = bt_str_i backtracking = True break else: return False, str_i j += 1 if not backtracking: backtrack_stack.append( (i, min_, j, consumed_list)) i += 1 continue else: return False, str_i return True, str_i i = str_i if len(string) == 0: res, consumed = save_matches( match_group=match_group, ast=ast, string=string, start_idx=str_i) return return_fnc(res, consumed) while str_i < len(string): res, _ = save_matches(match_group=match_group, ast=ast, string=string, start_idx=str_i) i += 1 if res: return return_fnc(True, str_i) else: matches = deque() str_i = i return return_fnc(False, str_i)