[email protected] | cffee7f | 2013-04-11 17:03:48 | [diff] [blame] | 1 | #!/usr/bin/env python |
[email protected] | 683c8c5 | 2013-04-06 17:00:46 | [diff] [blame] | 2 | # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
""" Lexer for PPAPI IDL

The lexer uses the PLY library to build a tokenizer which understands both
WebIDL and Pepper tokens.

The WebIDL specification, including the regular expressions used here, can be
found at:
  http://www.w3.org/TR/2012/CR-WebIDL-20120419/
PLY can be found at:
  http://www.dabeaz.com/ply/
"""

import os.path
import sys

#
# Try to load the ply module; if that fails, assume it lives in the
# third_party directory.
#
try:
  # Disable lint check which fails to find the ply module.
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  module_path, _ = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
    'float',
    'integer',
    'string',

    # Symbol and keyword types
    'COMMENT',
    'identifier',

    # Multi-character operators
    'ELLIPSIS',
  ]

  # 'keywords' is a map of string to token type. Every token matching
  # KEYWORD_OR_SYMBOL is checked against this dictionary to determine
  # whether the token is actually a keyword.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum' : 'ENUM',
    'exception' : 'EXCEPTION',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'getter' : 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial' : 'PARTIAL',
    'readonly' : 'READONLY',
    'sequence' : 'SEQUENCE',
    'setter' : 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'true' : 'TRUE',
    'typedef' : 'TYPEDEF',
    'unrestricted' : 'UNRESTRICTED',
    'unsigned' : 'UNSIGNED',
    'void' : 'VOID'
  }

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
  def t_ELLIPSIS(self, t):
    r'\.\.\.'
    return t

[email protected] | ac7b49d | 2013-04-12 18:48:47 | [diff] [blame] | 113 | def t_float(self, t): |
| 114 | r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)' |
| 115 | return t |
[email protected] | 683c8c5 | 2013-04-06 17:00:46 | [diff] [blame] | 116 | |
[email protected] | ac7b49d | 2013-04-12 18:48:47 | [diff] [blame] | 117 | def t_integer(self, t): |
[email protected] | 4c713c4 | 2013-05-24 11:05:48 | [diff] [blame] | 118 | r'-?(0([Xx][0-9A-Fa-f]+|[0-7]*)|[1-9][0-9]*)' |
[email protected] | ac7b49d | 2013-04-12 18:48:47 | [diff] [blame] | 119 | return t |
[email protected] | d4b8667 | 2013-04-11 16:28:31 | [diff] [blame] | 120 | |
[email protected] | 683c8c5 | 2013-04-06 17:00:46 | [diff] [blame] | 121 | |
[email protected] | 683c8c5 | 2013-04-06 17:00:46 | [diff] [blame] | 122 | # A line ending '\n', we use this to increment the line number |
| 123 | def t_LINE_END(self, t): |
| 124 | r'\n+' |
| 125 | self.AddLines(len(t.value)) |
| 126 | |
  # We do not process escapes in IDL strings. Strings are exclusively
  # used for attributes and enums, and are not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment: /* xxx */ or //
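  # Note that a run of '//' lines separated only by whitespace is collected
  # into a single COMMENT token.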
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols.
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip the leading underscore so that a symbol can share a spelling
    # with a keyword (e.g. a dictionary named 'interface' is written as
    # '_interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

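  # 't_ANY_error' is PLY's error hook; the 'ANY' prefix makes the rule apply
  # in every lexer state.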
  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.Lexer().lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue.
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position.
      self.index.append(self.Lexer().lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.Lexer().lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self._lex_errors += 1

  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the values recorded for earlier lines are unused. We still
    # fill the array, however, to keep the line count correct.
    self.Lexer().lineno += count
    for _ in range(count):
      self.index.append(self.Lexer().lexpos)

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.Lexer().filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker pointing at the offending column.
    caret = ' ' * pos + '^'
    # We decrement the line number since the array is 0-based while the
    # line numbers are 1-based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))

[email protected] | d4b8667 | 2013-04-11 16:28:31 | [diff] [blame] | 202 | # |
| 203 | # Tokenizer |
| 204 | # |
| 205 | # The token function returns the next token provided by IDLLexer for matching |
| 206 | # against the leaf paterns. |
| 207 | # |
| 208 | def token(self): |
[email protected] | ac7b49d | 2013-04-12 18:48:47 | [diff] [blame] | 209 | tok = self.Lexer().token() |
[email protected] | d4b8667 | 2013-04-11 16:28:31 | [diff] [blame] | 210 | if tok: |
| 211 | self.last = tok |
| 212 | return tok |
| 213 | |
| 214 | |
[email protected] | 683c8c5 | 2013-04-06 17:00:46 | [diff] [blame] | 215 | def GetTokens(self): |
| 216 | outlist = [] |
| 217 | while True: |
[email protected] | ac7b49d | 2013-04-12 18:48:47 | [diff] [blame] | 218 | t = self.Lexer().token() |
[email protected] | 683c8c5 | 2013-04-06 17:00:46 | [diff] [blame] | 219 | if not t: |
| 220 | break |
| 221 | outlist.append(t) |
| 222 | return outlist |
| 223 | |
[email protected] | d4b8667 | 2013-04-11 16:28:31 | [diff] [blame] | 224 | def Tokenize(self, data, filename='__no_file__'): |
[email protected] | ac7b49d | 2013-04-12 18:48:47 | [diff] [blame] | 225 | lexer = self.Lexer() |
| 226 | lexer.lineno = 1 |
| 227 | lexer.filename = filename |
| 228 | lexer.input(data) |
[email protected] | d4b8667 | 2013-04-11 16:28:31 | [diff] [blame] | 229 | self.lines = data.split('\n') |
[email protected] | 683c8c5 | 2013-04-06 17:00:46 | [diff] [blame] | 230 | |
[email protected] | ac7b49d | 2013-04-12 18:48:47 | [diff] [blame] | 231 | def KnownTokens(self): |
| 232 | return self.tokens |
| 233 | |
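  # The PLY lexer object is built lazily on first use; with optimize=0, PLY
  # rebuilds the lexer each run rather than loading a cached table module.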
  def Lexer(self):
    if not self._lexobj:
      self._lexobj = lex.lex(object=self, lextab=None, optimize=0)
    return self._lexobj

  def _AddConstDefs(self):
    # 'literals' is a value expected by lex which specifies a list of valid
    # literal tokens, meaning the token type and token value are identical.
    self.literals = r'"*.(){}[],;:=+-/~|&^?<>'
    self.t_ignore = ' \t'

  def _AddToken(self, token):
    if token in self.tokens:
      raise RuntimeError('Duplicate token: ' + token)
    self.tokens.append(token)

  def _AddTokens(self, tokens):
    for token in tokens:
      self._AddToken(token)

  def _AddKeywords(self, keywords):
    for key in keywords:
      value = key.upper()
      self._AddToken(value)
      self.keywords[key] = value

[email protected] | a958ace | 2013-06-29 20:51:01 | [diff] [blame^] | 260 | def _DelKeywords(self, keywords): |
| 261 | for key in keywords: |
| 262 | self.tokens.remove(key.upper()) |
| 263 | del self.keywords[key] |
| 264 | |
[email protected] | d4b8667 | 2013-04-11 16:28:31 | [diff] [blame] | 265 | def __init__(self): |
| 266 | self.index = [0] |
| 267 | self._lex_errors = 0 |
| 268 | self.linex = [] |
| 269 | self.filename = None |
[email protected] | ac7b49d | 2013-04-12 18:48:47 | [diff] [blame] | 270 | self.keywords = {} |
| 271 | self.tokens = [] |
| 272 | self._AddConstDefs() |
| 273 | self._AddTokens(IDLLexer.tokens) |
| 274 | self._AddKeywords(IDLLexer.keywords) |
| 275 | self._lexobj = None |
[email protected] | 683c8c5 | 2013-04-06 17:00:46 | [diff] [blame] | 276 | |
[email protected] | ac7b49d | 2013-04-12 18:48:47 | [diff] [blame] | 277 | # If run by itself, attempt to build the lexer |
| 278 | if __name__ == '__main__': |
| 279 | lexer = IDLLexer() |
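  # A minimal smoke test: tokenize a snippet and print each token. The sample
  # IDL string below is an illustrative addition, not part of the original
  # tool.
  sample = 'interface Foo { attribute long bar; };'
  lexer.Tokenize(sample, filename='<sample>')
  for tok in lexer.GetTokens():
    sys.stdout.write('%s: %s\n' % (tok.type, str(tok.value)))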