# Copyright 2006 Google, Inc. All Rights Reserved. # Licensed to PSF under a Contributor Agreement. """Pattern compiler. The grammer is taken from PatternGrammar.txt. The compiler compiles a pattern to a pytree.*Pattern instance. """ __author__ = "Guido van Rossum " # Python imports import os import StringIO # Fairly local imports from .pgen2 import driver, literals, token, tokenize, parse, grammar # Really local imports from . import pytree from . import pygram # The pattern grammar file _PATTERN_GRAMMAR_FILE = os.path.join(os.path.dirname(__file__), "PatternGrammar.txt") class PatternSyntaxError(Exception): pass def tokenize_wrapper(input): """Tokenizes a string suppressing significant whitespace.""" skip = set((token.NEWLINE, token.INDENT, token.DEDENT)) tokens = tokenize.generate_tokens(StringIO.StringIO(input).readline) for quintuple in tokens: type, value, start, end, line_text = quintuple if type not in skip: yield quintuple class PatternCompiler(object): def __init__(self, grammar_file=_PATTERN_GRAMMAR_FILE): """Initializer. Takes an optional alternative filename for the pattern grammar. """ self.grammar = driver.load_grammar(grammar_file) self.syms = pygram.Symbols(self.grammar) self.pygrammar = pygram.python_grammar self.pysyms = pygram.python_symbols self.driver = driver.Driver(self.grammar, convert=pattern_convert) def compile_pattern(self, input, debug=False, with_tree=False): """Compiles a pattern string to a nested pytree.*Pattern object.""" tokens = tokenize_wrapper(input) try: root = self.driver.parse_tokens(tokens, debug=debug) except parse.ParseError as e: raise PatternSyntaxError(str(e)) if with_tree: return self.compile_node(root), root else: return self.compile_node(root) def compile_node(self, node): """Compiles a node, recursively. This is one big switch on the node type. """ # XXX Optimize certain Wildcard-containing-Wildcard patterns # that can be merged if node.type == self.syms.Matcher: node = node.children[0] # Avoid unneeded recursion if node.type == self.syms.Alternatives: # Skip the odd children since they are just '|' tokens alts = [self.compile_node(ch) for ch in node.children[::2]] if len(alts) == 1: return alts[0] p = pytree.WildcardPattern([[a] for a in alts], min=1, max=1) return p.optimize() if node.type == self.syms.Alternative: units = [self.compile_node(ch) for ch in node.children] if len(units) == 1: return units[0] p = pytree.WildcardPattern([units], min=1, max=1) return p.optimize() if node.type == self.syms.NegatedUnit: pattern = self.compile_basic(node.children[1:]) p = pytree.NegatedPattern(pattern) return p.optimize() assert node.type == self.syms.Unit name = None nodes = node.children if len(nodes) >= 3 and nodes[1].type == token.EQUAL: name = nodes[0].value nodes = nodes[2:] repeat = None if len(nodes) >= 2 and nodes[-1].type == self.syms.Repeater: repeat = nodes[-1] nodes = nodes[:-1] # Now we've reduced it to: STRING | NAME [Details] | (...) | [...] pattern = self.compile_basic(nodes, repeat) if repeat is not None: assert repeat.type == self.syms.Repeater children = repeat.children child = children[0] if child.type == token.STAR: min = 0 max = pytree.HUGE elif child.type == token.PLUS: min = 1 max = pytree.HUGE elif child.type == token.LBRACE: assert children[-1].type == token.RBRACE assert len(children) in (3, 5) min = max = self.get_int(children[1]) if len(children) == 5: max = self.get_int(children[3]) else: assert False if min != 1 or max != 1: pattern = pattern.optimize() pattern = pytree.WildcardPattern([[pattern]], min=min, max=max) if name is not None: pattern.name = name return pattern.optimize() def compile_basic(self, nodes, repeat=None): # Compile STRING | NAME [Details] | (...) | [...] assert len(nodes) >= 1 node = nodes[0] if node.type == token.STRING: value = unicode(literals.evalString(node.value)) return pytree.LeafPattern(_type_of_literal(value), value) elif node.type == token.NAME: value = node.value if value.isupper(): if value not in TOKEN_MAP: raise PatternSyntaxError("Invalid token: %r" % value) if nodes[1:]: raise PatternSyntaxError("Can't have details for token") return pytree.LeafPattern(TOKEN_MAP[value]) else: if value == "any": type = None elif not value.startswith("_"): type = getattr(self.pysyms, value, None) if type is None: raise PatternSyntaxError("Invalid symbol: %r" % value) if nodes[1:]: # Details present content = [self.compile_node(nodes[1].children[1])] else: content = None return pytree.NodePattern(type, content) elif node.value == "(": return self.compile_node(nodes[1]) elif node.value == "[": assert repeat is None subpattern = self.compile_node(nodes[1]) return pytree.WildcardPattern([[subpattern]], min=0, max=1) assert False, node def get_int(self, node): assert node.type == token.NUMBER return int(node.value) # Map named tokens to the type value for a LeafPattern TOKEN_MAP = {"NAME": token.NAME, "STRING": token.STRING, "NUMBER": token.NUMBER, "TOKEN": None} def _type_of_literal(value): if value[0].isalpha(): return token.NAME elif value in grammar.opmap: return grammar.opmap[value] else: return None def pattern_convert(grammar, raw_node_info): """Converts raw node information to a Node or Leaf instance.""" type, value, context, children = raw_node_info if children or type in grammar.number2symbol: return pytree.Node(type, children, context=context) else: return pytree.Leaf(type, value, context=context) def compile_pattern(pattern): return PatternCompiler().compile_pattern(pattern)