#!/usr/bin/python2.5

"""PyNarcissus

A lexical scanner and parser. JS implemented in JS, ported to Python.
"""

__author__ = "JT Olds"
__author_email__ = "jtolds@xnet5.com"
__date__ = "2009-03-24"
__all__ = ["ParseError", "parse", "tokens"]

import re, sys, types

class Object: pass
class Error_(Exception): pass
class ParseError(Error_): pass

tokens = dict(enumerate((
        # End of source.
        "END",

        # Operators and punctuators. Some pair-wise order matters, e.g. (+, -)
        # and (UNARY_PLUS, UNARY_MINUS).
        "\n", ";",
        ",",
        "=",
        "?", ":", "CONDITIONAL",
        "||",
        "&&",
        "|",
        "^",
        "&",
        "==", "!=", "===", "!==",
        "<", "<=", ">=", ">",
        "<<", ">>", ">>>",
        "+", "-",
        "*", "/", "%",
        "!", "~", "UNARY_PLUS", "UNARY_MINUS",
        "++", "--",
        ".",
        "[", "]",
        "{", "}",
        "(", ")",

        # Nonterminal tree node type codes.
        "SCRIPT", "BLOCK", "LABEL", "FOR_IN", "CALL", "NEW_WITH_ARGS",
        "INDEX", "ARRAY_INIT", "OBJECT_INIT", "PROPERTY_INIT", "GETTER",
        "SETTER", "GROUP", "LIST",

        # Terminals.
        "IDENTIFIER", "NUMBER", "STRING", "REGEXP",

        # Keywords.
        "break", "case", "catch", "const", "continue", "debugger", "default",
        "delete", "do", "else", "enum", "false", "finally", "for", "function",
        "if", "in", "instanceof", "new", "null", "return", "switch", "this",
        "throw", "true", "try", "typeof", "var", "void", "while", "with")))

# Operator and punctuator mapping from token to tree node type name.
# NB: superstring tokens (e.g., ++) must come before their substring token
# counterparts (+ in the example), so that the opRegExp regular expression
# synthesized from this list makes the longest possible match.
opTypeNames = [
        ('\n',  "NEWLINE"),
        (';',   "SEMICOLON"),
        (',',   "COMMA"),
        ('?',   "HOOK"),
        (':',   "COLON"),
        ('||',  "OR"),
        ('&&',  "AND"),
        ('|',   "BITWISE_OR"),
        ('^',   "BITWISE_XOR"),
        ('&',   "BITWISE_AND"),
        ('===', "STRICT_EQ"),
        ('==',  "EQ"),
        ('=',   "ASSIGN"),
        ('!==', "STRICT_NE"),
        ('!=',  "NE"),
        ('<<',  "LSH"),
        ('<=',  "LE"),
        ('<',   "LT"),
        ('>>>', "URSH"),
        ('>>',  "RSH"),
        ('>=',  "GE"),
        ('>',   "GT"),
        ('++',  "INCREMENT"),
        ('--',  "DECREMENT"),
        ('+',   "PLUS"),
        ('-',   "MINUS"),
        ('*',   "MUL"),
        ('/',   "DIV"),
        ('%',   "MOD"),
        ('!',   "NOT"),
        ('~',   "BITWISE_NOT"),
        ('.',   "DOT"),
        ('[',   "LEFT_BRACKET"),
        (']',   "RIGHT_BRACKET"),
        ('{',   "LEFT_CURLY"),
        ('}',   "RIGHT_CURLY"),
        ('(',   "LEFT_PAREN"),
        (')',   "RIGHT_PAREN"),
]

keywords = {}

# Define const END, etc., based on the token names. Also map name to index.
for i, t in tokens.copy().iteritems():
    if re.match(r'^[a-z]', t):
        const_name = t.upper()
        keywords[t] = i
    elif re.match(r'^\W', t):
        const_name = dict(opTypeNames)[t]
    else:
        const_name = t
    globals()[const_name] = i
    tokens[t] = i

assignOps = {}

# Map assignment operators to their indexes in the tokens array.
for i, t in enumerate(['|', '^', '&', '<<', '>>', '>>>', '+', '-', '*', '/',
        '%']):
    assignOps[t] = tokens[t]
    assignOps[i] = t

# Build a regexp that recognizes operators and punctuators (except newline).
opRegExpSrc = "^"
for i, j in opTypeNames:
    if i == "\n": continue
    if opRegExpSrc != "^": opRegExpSrc += "|^"
    opRegExpSrc += re.sub(r'[?|^&(){}\[\]+\-*\/\.]',
            lambda x: "\\%s" % x.group(0), i)
opRegExp = re.compile(opRegExpSrc)

# Convert opTypeNames to an actual dictionary now that we don't care about
# ordering.
opTypeNames = dict(opTypeNames)

# A regexp to match floating point literals (but not integer literals).
fpRegExp = re.compile(r'^\d+\.\d*(?:[eE][-+]?\d+)?|^\d+(?:\.\d*)?[eE][-+]?'
        r'\d+|^\.\d+(?:[eE][-+]?\d+)?')
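# A quick sanity sketch of the tables built above (illustrative comments
# only, not part of the original module; values assume a fresh import):
#
#   tokens[END]       -> "END"    # index to name...
#   tokens["END"]     -> 0        # ...and name to index, in the same dict
#   keywords["while"] -> WHILE    # keywords map to their token index
#   assignOps["+"]    -> PLUS     # so "+=" can be folded into ASSIGN
#   fpRegExp.match("3.14") matches, as does fpRegExp.match("1e3");
#   fpRegExp.match("42") is None, since plain integer literals are
#   scanned separately by the tokenizer below.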
# A regexp to match regexp literals.
reRegExp = re.compile(r'^\/((?:\\.|\[(?:\\.|[^\]])*\]|[^\/])+)\/([gimy]*)')

class SyntaxError_(ParseError):
    def __init__(self, message, filename, lineno):
        ParseError.__init__(self, "Syntax error: %s\n%s:%s" %
                (message, filename, lineno))

class Tokenizer(object):
    def __init__(self, s, f, l):
        self.cursor = 0
        self.source = str(s)
        self.tokens = {}
        self.tokenIndex = 0
        self.lookahead = 0
        self.scanNewlines = False
        self.scanOperand = True
        self.filename = f
        self.lineno = l

    input_ = property(lambda self: self.source[self.cursor:])

    done = property(lambda self: self.peek() == END)

    token = property(lambda self: self.tokens.get(self.tokenIndex))

    def match(self, tt):
        return self.get() == tt or self.unget()

    def mustMatch(self, tt):
        if not self.match(tt):
            raise self.newSyntaxError("Missing " + tokens.get(tt).lower())
        return self.token

    def peek(self):
        if self.lookahead:
            next = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
            if self.scanNewlines and (getattr(next, "lineno", None) !=
                    getattr(self, "lineno", None)):
                tt = NEWLINE
            else:
                tt = getattr(next, "type_", None)
        else:
            tt = self.get()
            self.unget()
        return tt

    def peekOnSameLine(self):
        self.scanNewlines = True
        tt = self.peek()
        self.scanNewlines = False
        return tt

    def get(self):
        # Replay any tokens pushed back into the four-slot ring buffer.
        while self.lookahead:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens.get(self.tokenIndex)
            if getattr(token, "type_", None) != NEWLINE or self.scanNewlines:
                return getattr(token, "type_", None)

        # Skip whitespace and comments, tracking line numbers as we go.
        while True:
            input__ = self.input_
            if self.scanNewlines:
                match = re.match(r'^[ \t]+', input__)
            else:
                match = re.match(r'^\s+', input__)
            if match:
                spaces = match.group(0)
                self.cursor += len(spaces)
                newlines = re.findall(r'\n', spaces)
                if newlines:
                    self.lineno += len(newlines)
                input__ = self.input_

            match = re.match(r'^\/(?:\*(?:.|\n)*?\*\/|\/.*)', input__)
            if not match:
                break
            comment = match.group(0)
            self.cursor += len(comment)
            newlines = re.findall(r'\n', comment)
            if newlines:
                self.lineno += len(newlines)

        self.tokenIndex = (self.tokenIndex + 1) & 3
        token = self.tokens.get(self.tokenIndex)
        if not token:
            token = Object()
            self.tokens[self.tokenIndex] = token

        if not input__:
            token.type_ = END
            return END

        def matchInput():
            match = fpRegExp.match(input__)
            if match:
                token.type_ = NUMBER
                token.value = float(match.group(0))
                return match.group(0)

            match = re.match(r'^0[xX][\da-fA-F]+|^0[0-7]*|^\d+', input__)
            if match:
                token.type_ = NUMBER
                token.value = eval(match.group(0))
                return match.group(0)

            match = re.match(r'^[$_\w]+', input__) # FIXME no ES3 unicode
            if match:
                id_ = match.group(0)
                token.type_ = keywords.get(id_, IDENTIFIER)
                token.value = id_
                return match.group(0)

            match = re.match(r'^"(?:\\.|[^"])*"|^\'(?:\\.|[^\'])*\'',
                    input__)
            if match:
                token.type_ = STRING
                token.value = eval(match.group(0))
                return match.group(0)

            if self.scanOperand:
                match = reRegExp.match(input__)
                if match:
                    token.type_ = REGEXP
                    token.value = {"regexp": match.group(1),
                            "modifiers": match.group(2)}
                    return match.group(0)

            match = opRegExp.match(input__)
            if match:
                op = match.group(0)
                # Sliced, not indexed, so an operator at the very end of the
                # input falls through to the plain-operator case instead of
                # raising an IndexError.
                if (assignOps.has_key(op) and
                        input__[len(op):len(op) + 1] == '='):
                    token.type_ = ASSIGN
                    token.assignOp = globals()[opTypeNames[op]]
                    token.value = op
                    return match.group(0) + "="
                token.type_ = globals()[opTypeNames[op]]
                if self.scanOperand and (token.type_ in (PLUS, MINUS)):
                    token.type_ += UNARY_PLUS - PLUS
                token.assignOp = None
                token.value = op
                return match.group(0)

            if self.scanNewlines:
                match = re.match(r'^\n', input__)
                if match:
                    token.type_ = NEWLINE
                    return match.group(0)

            raise self.newSyntaxError("Illegal token")

        token.start = self.cursor
        self.cursor += len(matchInput())
        token.end = self.cursor
        token.lineno = self.lineno
        return getattr(token, "type_", None)

    def unget(self):
        self.lookahead += 1
        if self.lookahead == 4:
            # String exceptions are deprecated; raise a real exception.
            raise Exception("PANIC: too much lookahead!")
        self.tokenIndex = (self.tokenIndex - 1) & 3

    def newSyntaxError(self, m):
        return SyntaxError_(m, self.filename, self.lineno)
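# A minimal driving loop for the Tokenizer (an illustrative sketch, not part
# of the original module; Python 2 syntax to match the file):
#
#   t = Tokenizer("var x = 42;", "<stdin>", 1)
#   while not t.done:
#       tt = t.get()
#       print tokenstr(tt), repr(getattr(t.token, "value", None))
#
# which should print VAR, IDENTIFIER, ASSIGN, NUMBER and SEMICOLON lines in
# turn. Note that scanOperand stays True here, so a bare "+" would scan as
# UNARY_PLUS; the parser flips scanOperand as it alternates between
# expecting operands and operators.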
class CompilerContext(object):
    def __init__(self, inFunction):
        self.inFunction = inFunction
        self.stmtStack = []
        self.funDecls = []
        self.varDecls = []
        self.bracketLevel = 0
        self.curlyLevel = 0
        self.parenLevel = 0
        self.hookLevel = 0
        self.ecmaStrictMode = False
        self.inForLoopInit = False

def Script(t, x):
    n = Statements(t, x)
    n.type_ = SCRIPT
    n.funDecls = x.funDecls
    n.varDecls = x.varDecls
    return n

class Node(list):
    def __init__(self, t, type_=None, args=[]):
        list.__init__(self)
        token = t.token
        if token:
            if type_:
                self.type_ = type_
            else:
                self.type_ = getattr(token, "type_", None)
            self.value = token.value
            self.lineno = token.lineno
            self.start = token.start
            self.end = token.end
        else:
            self.type_ = type_
            self.lineno = t.lineno
        self.tokenizer = t
        for arg in args:
            self.append(arg)

    type = property(lambda self: tokenstr(self.type_))

    # Always use push to add operands to an expression, to update start and
    # end.
    def append(self, kid, numbers=[]):
        if kid:
            if hasattr(self, "start") and kid.start < self.start:
                self.start = kid.start
            if hasattr(self, "end") and self.end < kid.end:
                self.end = kid.end
        return list.append(self, kid)

    indentLevel = 0

    def __str__(self):
        a = list((str(i), v) for i, v in enumerate(self))
        for attr in dir(self):
            if attr[0] == "_": continue
            elif attr == "tokenizer":
                a.append((attr, "[object Object]"))
            elif attr in ("append", "count", "extend", "getSource", "index",
                    "insert", "pop", "remove", "reverse", "sort", "type_",
                    "target", "filename", "indentLevel", "type"):
                continue
            else:
                a.append((attr, getattr(self, attr)))
        if len(self): a.append(("length", len(self)))
        a.sort(lambda a, b: cmp(a[0], b[0]))
        INDENTATION = "    "
        Node.indentLevel += 1
        n = Node.indentLevel
        s = "{\n%stype: %s" % ((INDENTATION * n), tokenstr(self.type_))
        for i, value in a:
            s += ",\n%s%s: " % ((INDENTATION * n), i)
            if i == "value" and self.type_ == REGEXP:
                s += "/%s/%s" % (value["regexp"], value["modifiers"])
            elif value is None:
                s += "null"
            elif value is False:
                s += "false"
            elif value is True:
                s += "true"
            elif type(value) == list:
                s += ','.join((str(x) for x in value))
            else:
                s += str(value)
        Node.indentLevel -= 1
        n = Node.indentLevel
        s += "\n%s}" % (INDENTATION * n)
        return s
    __repr__ = __str__

    def getSource(self):
        if getattr(self, "start", None) is not None:
            if getattr(self, "end", None) is not None:
                return self.tokenizer.source[self.start:self.end]
            return self.tokenizer.source[self.start:]
        if getattr(self, "end", None) is not None:
            return self.tokenizer.source[:self.end]
        return self.tokenizer.source[:]

    filename = property(lambda self: self.tokenizer.filename)

    def __nonzero__(self): return True
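# How the start/end bookkeeping pays off (a sketch in comments; parse() is
# defined further down in this module, and the SEMICOLON/expression shape of
# expression statements is assumed from the Narcissus grammar):
#
#   root = parse("x = 1 + 2", "<stdin>")
#   root.type                       -> "SCRIPT"
#   root[0].type                    -> "SEMICOLON" (an expression statement)
#   root[0].expression.getSource()  -> "x = 1 + 2"
#
# append() widens a parent's start/end around each kid, so getSource() can
# slice any subtree's original text straight out of the tokenizer's source.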
# Statement stack and nested statement handler.
def nest(t, x, node, func, end=None):
    x.stmtStack.append(node)
    n = func(t, x)
    x.stmtStack.pop()
    if end: t.mustMatch(end)
    return n

def tokenstr(tt):
    t = tokens[tt]
    if re.match(r'^\W', t):
        return opTypeNames[t]
    return t.upper()

def Statements(t, x):
    n = Node(t, BLOCK)
    x.stmtStack.append(n)
    while not t.done and t.peek() != RIGHT_CURLY:
        n.append(Statement(t, x))
    x.stmtStack.pop()
    return n

def Block(t, x):
    t.mustMatch(LEFT_CURLY)
    n = Statements(t, x)
    t.mustMatch(RIGHT_CURLY)
    return n

DECLARED_FORM = 0
EXPRESSED_FORM = 1
STATEMENT_FORM = 2

def Statement(t, x):
    tt = t.get()

    # Cases for statements ending in a right curly return early, avoiding the
    # common semicolon insertion magic after this switch.
    if tt == FUNCTION:
        if len(x.stmtStack) > 1:
            type_ = STATEMENT_FORM
        else:
            type_ = DECLARED_FORM
        return FunctionDefinition(t, x, True, type_)

    elif tt == LEFT_CURLY:
        n = Statements(t, x)
        t.mustMatch(RIGHT_CURLY)
        return n

    elif tt == IF:
        n = Node(t)
        n.condition = ParenExpression(t, x)
        x.stmtStack.append(n)
        n.thenPart = Statement(t, x)
        if t.match(ELSE):
            n.elsePart = Statement(t, x)
        else:
            n.elsePart = None
        x.stmtStack.pop()
        return n

    elif tt == SWITCH:
        n = Node(t)
        t.mustMatch(LEFT_PAREN)
        n.discriminant = Expression(t, x)
        t.mustMatch(RIGHT_PAREN)
        n.cases = []
        n.defaultIndex = -1
        x.stmtStack.append(n)
        t.mustMatch(LEFT_CURLY)
        while True:
            tt = t.get()
            if tt == RIGHT_CURLY: break
            if tt in (DEFAULT, CASE):
                if tt == DEFAULT and n.defaultIndex >= 0:
                    raise t.newSyntaxError("More than one switch default")
                n2 = Node(t)
                if tt == DEFAULT:
                    n.defaultIndex = len(n.cases)
                else:
                    n2.caseLabel = Expression(t, x, COLON)
            else:
                raise t.newSyntaxError("Invalid switch case")
            t.mustMatch(COLON)
            n2.statements = Node(t, BLOCK)
            while True:
                tt = t.peek()
                if (tt == CASE or tt == DEFAULT or tt == RIGHT_CURLY): break
                n2.statements.append(Statement(t, x))
            n.cases.append(n2)
        x.stmtStack.pop()
        return n

    elif tt == FOR:
        n = Node(t)
        n2 = None
        n.isLoop = True
        t.mustMatch(LEFT_PAREN)
        tt = t.peek()
        if tt != SEMICOLON:
            x.inForLoopInit = True
            if tt == VAR or tt == CONST:
                t.get()
                n2 = Variables(t, x)
            else:
                n2 = Expression(t, x)
            x.inForLoopInit = False
        if n2 and t.match(IN):
            n.type_ = FOR_IN
            if n2.type_ == VAR:
                if len(n2) != 1:
                    raise SyntaxError_("Invalid for..in left-hand side",
                            t.filename, n2.lineno)
                # NB: n2[0].type_ == IDENTIFIER and n2[0].value == n2[0].name
                n.iterator = n2[0]
                n.varDecl = n2
            else:
                n.iterator = n2
                n.varDecl = None
            n.object = Expression(t, x)
        else:
            if n2:
                n.setup = n2
            else:
                n.setup = None
            t.mustMatch(SEMICOLON)
            if t.peek() == SEMICOLON:
                n.condition = None
            else:
                n.condition = Expression(t, x)
            t.mustMatch(SEMICOLON)
            if t.peek() == RIGHT_PAREN:
                n.update = None
            else:
                n.update = Expression(t, x)
        # The closing paren belongs to both the plain for and for..in forms.
        t.mustMatch(RIGHT_PAREN)
        n.body = nest(t, x, n, Statement)
        return n

    elif tt == WHILE:
        n = Node(t)
        n.isLoop = True
        n.condition = ParenExpression(t, x)
        n.body = nest(t, x, n, Statement)
        return n

    elif tt == DO:
        n = Node(t)
        n.isLoop = True
        n.body = nest(t, x, n, Statement, WHILE)
        n.condition = ParenExpression(t, x)
        if not x.ecmaStrictMode:
            #