Blame - tools/idl_parser/idl_lexer.py - chromium/src.git

blob: c1446816008a9c80ebc4781e1f42010fe220e883 [file] [log] [blame]

[email protected]	cffee7f	2013-04-11 17:03:48	[diff] [blame]	1	#!/usr/bin/env python
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	2	# Copyright (c) 2013 The Chromium Authors. All rights reserved.
				3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
Kenichi Ishibashi	23e996b	2017-06-22 07:16:28	[diff] [blame]	6	""" Lexer for Web IDL
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	7
Kenichi Ishibashi	23e996b	2017-06-22 07:16:28	[diff] [blame]	8	The lexer uses the PLY library to build a tokenizer which understands
				9	Web IDL tokens.
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	10
Kenichi Ishibashi	23e996b	2017-06-22 07:16:28	[diff] [blame]	11	Web IDL, and Web IDL regular expressions can be found at:
raphael.kubo.da.costa	4bec0d7	2017-02-22 10:12:44	[diff] [blame]	12	http://heycam.github.io/webidl/
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	13	PLY can be found at:
				14	http://www.dabeaz.com/ply/
				15	"""
				16
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	17	import os.path
				18	import sys
				19
raphael.kubo.da.costa	adf0559	2017-02-21 12:58:23	[diff] [blame]	20	SRC_DIR = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
				21	sys.path.insert(0, os.path.join(SRC_DIR, 'third_party'))
				22	from ply import lex
				23
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	24
				25	#
				26	# IDL Lexer
				27	#
				28	class IDLLexer(object):
[email protected]	a8f9428	2013-08-14 01:42:30	[diff] [blame]	29	# 'literals' is a value expected by lex which specifies a list of valid
				30	# literal tokens, meaning the token type and token value are identical.
				31	literals = r'"*.(){}[],;:=+-/~\|&^?<>'
				32
				33	# 't_ignore' contains ignored characters (spaces and tabs)
				34	t_ignore = ' \t'
				35
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	36	# 'tokens' is a value required by lex which specifies the complete list
				37	# of valid token types.
				38	tokens = [
				39	# Data types
				40	'float',
				41	'integer',
				42	'string',
				43
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	44	# Symbol and keywords types
Kenichi Ishibashi	f2f3c92	2017-07-14 02:02:40	[diff] [blame]	45	'SPECIAL_COMMENT',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	46	'identifier',
				47
[email protected]	d4b8667	2013-04-11 16:28:31	[diff] [blame]	48	# MultiChar operators
				49	'ELLIPSIS',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	50	]
				51
				52	# 'keywords' is a map of string to token type. All tokens matching
				53	# KEYWORD_OR_SYMBOL are matched against keywords dictionary, to determine
				54	# if the token is actually a keyword.
				55	keywords = {
				56	'any' : 'ANY',
				57	'attribute' : 'ATTRIBUTE',
				58	'boolean' : 'BOOLEAN',
				59	'byte' : 'BYTE',
[email protected]	5885b69	2014-06-19 14:43:24	[diff] [blame]	60	'ByteString' : 'BYTESTRING',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	61	'callback' : 'CALLBACK',
				62	'const' : 'CONST',
				63	'creator' : 'CREATOR',
				64	'Date' : 'DATE',
				65	'deleter' : 'DELETER',
				66	'dictionary' : 'DICTIONARY',
				67	'DOMString' : 'DOMSTRING',
				68	'double' : 'DOUBLE',
				69	'enum' : 'ENUM',
				70	'false' : 'FALSE',
				71	'float' : 'FLOAT',
bashi	cb5c1661	2015-08-19 02:11:24	[diff] [blame]	72	'FrozenArray' : 'FROZENARRAY',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	73	'getter': 'GETTER',
				74	'implements' : 'IMPLEMENTS',
Hitoshi Yoshida	14b7c92	2018-03-10 17:57:51	[diff] [blame]	75	'includes' : 'INCLUDES',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	76	'Infinity' : 'INFINITY',
				77	'inherit' : 'INHERIT',
				78	'interface' : 'INTERFACE',
jl	9016ef32	2014-12-16 15:05:09	[diff] [blame]	79	'iterable': 'ITERABLE',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	80	'legacycaller' : 'LEGACYCALLER',
				81	'long' : 'LONG',
jl	9016ef32	2014-12-16 15:05:09	[diff] [blame]	82	'maplike': 'MAPLIKE',
Hitoshi Yoshida	ec53e05	2018-03-08 07:39:31	[diff] [blame]	83	'mixin': 'MIXIN',
Lisa Suzuki	fd4acd52	2017-09-26 02:34:15	[diff] [blame]	84	'namespace' : 'NAMESPACE',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	85	'Nan' : 'NAN',
				86	'null' : 'NULL',
				87	'object' : 'OBJECT',
				88	'octet' : 'OCTET',
				89	'optional' : 'OPTIONAL',
				90	'or' : 'OR',
yhirano	6ce2b8e	2014-10-20 12:49:09	[diff] [blame]	91	'partial' : 'PARTIAL',
				92	'Promise' : 'PROMISE',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	93	'readonly' : 'READONLY',
[email protected]	5885b69	2014-06-19 14:43:24	[diff] [blame]	94	'RegExp' : 'REGEXP',
raphael.kubo.da.costa	4bec0d7	2017-02-22 10:12:44	[diff] [blame]	95	'record' : 'RECORD',
jl	9016ef32	2014-12-16 15:05:09	[diff] [blame]	96	'required' : 'REQUIRED',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	97	'sequence' : 'SEQUENCE',
[email protected]	5885b69	2014-06-19 14:43:24	[diff] [blame]	98	'serializer' : 'SERIALIZER',
jl	9016ef32	2014-12-16 15:05:09	[diff] [blame]	99	'setlike' : 'SETLIKE',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	100	'setter': 'SETTER',
				101	'short' : 'SHORT',
				102	'static' : 'STATIC',
				103	'stringifier' : 'STRINGIFIER',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	104	'typedef' : 'TYPEDEF',
				105	'true' : 'TRUE',
				106	'unsigned' : 'UNSIGNED',
				107	'unrestricted' : 'UNRESTRICTED',
raphael.kubo.da.costa	4bec0d7	2017-02-22 10:12:44	[diff] [blame]	108	'USVString' : 'USVSTRING',
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	109	'void' : 'VOID'
				110	}
				111
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	112	# Token definitions
				113	#
				114	# Lex assumes any value or function in the form of 't_<TYPE>' represents a
				115	# regular expression where a match will emit a token of type <TYPE>. In the
				116	# case of a function, the function is called when a match is made. These
				117	# definitions come from WebIDL.
[email protected]	a8f9428	2013-08-14 01:42:30	[diff] [blame]	118	#
				119	# These need to be methods for lexer construction, despite not using self.
				120	# pylint: disable=R0201
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	121	def t_ELLIPSIS(self, t):
				122	r'\.\.\.'
				123	return t
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	124
[email protected]	a8f9428	2013-08-14 01:42:30	[diff] [blame]	125	# Regex needs to be in the docstring
				126	# pylint: disable=C0301
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	127	def t_float(self, t):
				128	r'-?(([0-9]+\.[0-9]\|[0-9]\.[0-9]+)([Ee][+-]?[0-9]+)?\|[0-9]+[Ee][+-]?[0-9]+)'
				129	return t
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	130
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	131	def t_integer(self, t):
[email protected]	9f1b57f	2013-08-07 05:08:09	[diff] [blame]	132	r'-?([1-9][0-9]\|0[Xx][0-9A-Fa-f]+\|0[0-7])'
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	133	return t
[email protected]	d4b8667	2013-04-11 16:28:31	[diff] [blame]	134
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	135
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	136	# A line ending '\n', we use this to increment the line number
				137	def t_LINE_END(self, t):
				138	r'\n+'
				139	self.AddLines(len(t.value))
				140
				141	# We do not process escapes in the IDL strings. Strings are exclusively
				142	# used for attributes and enums, and not used as typical 'C' constants.
				143	def t_string(self, t):
				144	r'"[^"]*"'
				145	t.value = t.value[1:-1]
				146	self.AddLines(t.value.count('\n'))
				147	return t
				148
Kenichi Ishibashi	f2f3c92	2017-07-14 02:02:40	[diff] [blame]	149	# A Javadoc style comment: /** xxx */
				150	# Unlike t_COMMENT, this is NOT ignored.
				151	# Also note that this should be defined before t_COMMENT.
				152	def t_SPECIAL_COMMENT(self, t):
				153	r'/\\(.\|\n)+?\*/'
				154	self.AddLines(t.value.count('\n'))
				155	return t
				156
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	157	# A C or C++ style comment: /* xxx */ or //
Kenichi Ishibashi	4e46f03	2017-06-23 07:26:05	[diff] [blame]	158	# This token is ignored.
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	159	def t_COMMENT(self, t):
				160	r'(/\(.\|\n)?\/)\|(//.(\n[ \t]//.)*)'
				161	self.AddLines(t.value.count('\n'))
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	162
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	163	# A symbol or keyword.
				164	def t_KEYWORD_OR_SYMBOL(self, t):
Hitoshi Yoshida	3425b29	2018-06-08 07:08:11	[diff] [blame]	165	r'_?[A-Za-z][A-Za-z_0-9-]*'
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	166
				167	# All non-keywords are assumed to be symbols
				168	t.type = self.keywords.get(t.value, 'identifier')
				169
				170	# We strip leading underscores so that you can specify symbols with the same
				171	# value as a keywords (E.g. a dictionary named 'interface').
				172	if t.value[0] == '_':
				173	t.value = t.value[1:]
				174	return t
				175
				176	def t_ANY_error(self, t):
				177	msg = 'Unrecognized input'
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	178	line = self.Lexer().lineno
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	179
				180	# If that line has not been accounted for, then we must have hit
				181	# EoF, so compute the beginning of the line that caused the problem.
				182	if line >= len(self.index):
				183	# Find the offset in the line of the first word causing the issue
				184	word = t.value.split()[0]
				185	offs = self.lines[line - 1].find(word)
				186	# Add the computed line's starting position
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	187	self.index.append(self.Lexer().lexpos - offs)
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	188	msg = 'Unexpected EoF reached after'
				189
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	190	pos = self.Lexer().lexpos - self.index[line]
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	191	out = self.ErrorMessage(line, pos, msg)
				192	sys.stderr.write(out + '\n')
[email protected]	d4b8667	2013-04-11 16:28:31	[diff] [blame]	193	self._lex_errors += 1
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	194
				195
				196	def AddLines(self, count):
				197	# Set the lexer position for the beginning of the next line. In the case
				198	# of multiple lines, tokens can not exist on any of the lines except the
				199	# last one, so the recorded value for previous lines are unused. We still
				200	# fill the array however, to make sure the line count is correct.
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	201	self.Lexer().lineno += count
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	202	for _ in range(count):
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	203	self.index.append(self.Lexer().lexpos)
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	204
				205	def FileLineMsg(self, line, msg):
				206	# Generate a message containing the file and line number of a token.
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	207	filename = self.Lexer().filename
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	208	if filename:
				209	return "%s(%d) : %s" % (filename, line + 1, msg)
				210	return "<BuiltIn> : %s" % msg
				211
				212	def SourceLine(self, line, pos):
				213	# Create a source line marker
[email protected]	d4b8667	2013-04-11 16:28:31	[diff] [blame]	214	caret = ' ' * pos + '^'
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	215	# We decrement the line number since the array is 0 based while the
				216	# line numbers are 1 based.
				217	return "%s\n%s" % (self.lines[line - 1], caret)
				218
				219	def ErrorMessage(self, line, pos, msg):
				220	return "\n%s\n%s" % (
				221	self.FileLineMsg(line, msg),
				222	self.SourceLine(line, pos))
				223
[email protected]	d4b8667	2013-04-11 16:28:31	[diff] [blame]	224	#
				225	# Tokenizer
				226	#
				227	# The token function returns the next token provided by IDLLexer for matching
				228	# against the leaf paterns.
				229	#
				230	def token(self):
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	231	tok = self.Lexer().token()
[email protected]	d4b8667	2013-04-11 16:28:31	[diff] [blame]	232	if tok:
				233	self.last = tok
				234	return tok
				235
				236
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	237	def GetTokens(self):
				238	outlist = []
				239	while True:
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	240	t = self.Lexer().token()
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	241	if not t:
				242	break
				243	outlist.append(t)
				244	return outlist
				245
[email protected]	d4b8667	2013-04-11 16:28:31	[diff] [blame]	246	def Tokenize(self, data, filename='__no_file__'):
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	247	lexer = self.Lexer()
				248	lexer.lineno = 1
				249	lexer.filename = filename
				250	lexer.input(data)
[email protected]	d4b8667	2013-04-11 16:28:31	[diff] [blame]	251	self.lines = data.split('\n')
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	252
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	253	def KnownTokens(self):
				254	return self.tokens
				255
				256	def Lexer(self):
				257	if not self._lexobj:
				258	self._lexobj = lex.lex(object=self, lextab=None, optimize=0)
				259	return self._lexobj
				260
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	261	def _AddToken(self, token):
				262	if token in self.tokens:
				263	raise RuntimeError('Same token: ' + token)
				264	self.tokens.append(token)
				265
				266	def _AddTokens(self, tokens):
				267	for token in tokens:
				268	self._AddToken(token)
				269
				270	def _AddKeywords(self, keywords):
				271	for key in keywords:
				272	value = key.upper()
				273	self._AddToken(value)
				274	self.keywords[key] = value
				275
[email protected]	a958ace	2013-06-29 20:51:01	[diff] [blame]	276	def _DelKeywords(self, keywords):
				277	for key in keywords:
				278	self.tokens.remove(key.upper())
				279	del self.keywords[key]
				280
[email protected]	d4b8667	2013-04-11 16:28:31	[diff] [blame]	281	def __init__(self):
				282	self.index = [0]
				283	self._lex_errors = 0
				284	self.linex = []
				285	self.filename = None
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	286	self.keywords = {}
				287	self.tokens = []
[email protected]	ac7b49d	2013-04-12 18:48:47	[diff] [blame]	288	self._AddTokens(IDLLexer.tokens)
				289	self._AddKeywords(IDLLexer.keywords)
				290	self._lexobj = None
[email protected]	a8f9428	2013-08-14 01:42:30	[diff] [blame]	291	self.last = None
				292	self.lines = None
[email protected]	683c8c5	2013-04-06 17:00:46	[diff] [blame]	293
# If run by itself, attempt to build the lexer. Constructing IDLLexer only
# sets up the token tables; the PLY lexer itself is created by Lexer(), so
# it has to be called for the token definitions to actually be processed.
if __name__ == '__main__':
  lexer_object = IDLLexer()
  lexer_object.Lexer()