"Utility functions used by the btm_matcher module" from . import pytree from .pgen2 import grammar, token from .pygram import pattern_symbols, python_symbols syms = pattern_symbols pysyms = python_symbols tokens = grammar.opmap token_labels = token TYPE_ANY = -1 TYPE_ALTERNATIVES = -2 TYPE_GROUP = -3 class MinNode(object): """This class serves as an intermediate representation of the pattern tree during the conversion to sets of leaf-to-root subpatterns""" def __init__(self, type=None, name=None): self.type = type self.name = name self.children = [] self.leaf = False self.parent = None self.alternatives = [] self.group = [] def __repr__(self): return str(self.type) + ' ' + str(self.name) def leaf_to_root(self): """Internal method. Returns a characteristic path of the pattern tree. This method must be run for all leaves until the linear subpatterns are merged into a single""" node = self subp = [] while node: if node.type == TYPE_ALTERNATIVES: node.alternatives.append(subp) if len(node.alternatives) == len(node.children): #last alternative subp = [tuple(node.alternatives)] node.alternatives = [] node = node.parent continue else: node = node.parent subp = None break if node.type == TYPE_GROUP: node.group.append(subp) #probably should check the number of leaves if len(node.group) == len(node.children): subp = get_characteristic_subpattern(node.group) node.group = [] node = node.parent continue else: node = node.parent subp = None break if node.type == token_labels.NAME and node.name: #in case of type=name, use the name instead subp.append(node.name) else: subp.append(node.type) node = node.parent return subp def get_linear_subpattern(self): """Drives the leaf_to_root method. The reason that leaf_to_root must be run multiple times is because we need to reject 'group' matches; for example the alternative form (a | b c) creates a group [b c] that needs to be matched. Since matching multiple linear patterns overcomes the automaton's capabilities, leaf_to_root merges each group into a single choice based on 'characteristic'ity, i.e. (a|b c) -> (a|b) if b more characteristic than c Returns: The most 'characteristic'(as defined by get_characteristic_subpattern) path for the compiled pattern tree. """ for l in self.leaves(): subp = l.leaf_to_root() if subp: return subp def leaves(self): "Generator that returns the leaves of the tree" for child in self.children: for x in child.leaves(): yield x if not self.children: yield self def reduce_tree(node, parent=None): """ Internal function. Reduces a compiled pattern tree to an intermediate representation suitable for feeding the automaton. This also trims off any optional pattern elements(like [a], a*). """ new_node = None #switch on the node type if node.type == syms.Matcher: #skip node = node.children[0] if node.type == syms.Alternatives : #2 cases if len(node.children) <= 2: #just a single 'Alternative', skip this node new_node = reduce_tree(node.children[0], parent) else: #real alternatives new_node = MinNode(type=TYPE_ALTERNATIVES) #skip odd children('|' tokens) for child in node.children: if node.children.index(child)%2: continue reduced = reduce_tree(child, new_node) if reduced is not None: new_node.children.append(reduced) elif node.type == syms.Alternative: if len(node.children) > 1: new_node = MinNode(type=TYPE_GROUP) for child in node.children: reduced = reduce_tree(child, new_node) if reduced: new_node.children.append(reduced) if not new_node.children: # delete the group if all of the children were reduced to None new_node = None else: new_node = reduce_tree(node.children[0], parent) elif node.type == syms.Unit: if (isinstance(node.children[0], pytree.Leaf) and node.children[0].value == '('): #skip parentheses return reduce_tree(node.children[1], parent) if ((isinstance(node.children[0], pytree.Leaf) and node.children[0].value == '[') or (len(node.children)>1 and hasattr(node.children[1], "value") and node.children[1].value == '[')): #skip whole unit if its optional return None leaf = True details_node = None alternatives_node = None has_repeater = False repeater_node = None has_variable_name = False for child in node.children: if child.type == syms.Details: leaf = False details_node = child elif child.type == syms.Repeater: has_repeater = True repeater_node = child elif child.type == syms.Alternatives: alternatives_node = child if hasattr(child, 'value') and child.value == '=': # variable name has_variable_name = True #skip variable name if has_variable_name: #skip variable name, '=' name_leaf = node.children[2] if hasattr(name_leaf, 'value') and name_leaf.value == '(': # skip parenthesis name_leaf = node.children[3] else: name_leaf = node.children[0] #set node type if name_leaf.type == token_labels.NAME: #(python) non-name or wildcard if name_leaf.value == 'any': new_node = MinNode(type=TYPE_ANY) else: if hasattr(token_labels, name_leaf.value): new_node = MinNode(type=getattr(token_labels, name_leaf.value)) else: new_node = MinNode(type=getattr(pysyms, name_leaf.value)) elif name_leaf.type == token_labels.STRING: #(python) name or character; remove the apostrophes from #the string value name = name_leaf.value.strip("'") if name in tokens: new_node = MinNode(type=tokens[name]) else: new_node = MinNode(type=token_labels.NAME, name=name) elif name_leaf.type == syms.Alternatives: new_node = reduce_tree(alternatives_node, parent) #handle repeaters if has_repeater: if repeater_node.children[0].value == '*': #reduce to None new_node = None elif repeater_node.children[0].value == '+': #reduce to a single occurence i.e. do nothing pass else: #TODO: handle {min, max} repeaters raise NotImplementedError pass #add children if details_node and new_node is not None: for child in details_node.children[1:-1]: #skip '<', '>' markers reduced = reduce_tree(child, new_node) if reduced is not None: new_node.children.append(reduced) if new_node: new_node.parent = parent return new_node def get_characteristic_subpattern(subpatterns): """Picks the most characteristic from a list of linear patterns Current order used is: names > common_names > common_chars """ if not isinstance(subpatterns, list): return subpatterns if len(subpatterns)==1: return subpatterns[0] # first pick out the ones containing variable names subpatterns_with_names = [] subpatterns_with_common_names = [] common_names = ['in', 'for', 'if' , 'not', 'None'] subpatterns_with_common_chars = [] common_chars = "[]().,:" for subpattern in subpatterns: if any(rec_test(subpattern, lambda x: type(x) is str)): if any(rec_test(subpattern, lambda x: isinstance(x, str) and x in common_chars)): subpatterns_with_common_chars.append(subpattern) elif any(rec_test(subpattern, lambda x: isinstance(x, str) and x in common_names)): subpatterns_with_common_names.append(subpattern) else: subpatterns_with_names.append(subpattern) if subpatterns_with_names: subpatterns = subpatterns_with_names elif subpatterns_with_common_names: subpatterns = subpatterns_with_common_names elif subpatterns_with_common_chars: subpatterns = subpatterns_with_common_chars # of the remaining subpatterns pick out the longest one return max(subpatterns, key=len) def rec_test(sequence, test_func): """Tests test_func on all items of sequence and items of included sub-iterables""" for x in sequence: if isinstance(x, (list, tuple)): for y in rec_test(x, test_func): yield y else: yield test_func(x)