#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

| 6 | """ Lexer for PPAPI IDL |
| 7 | |
| 8 | The lexer uses the PLY library to build a tokenizer which understands both |
| 9 | WebIDL and Pepper tokens. |
| 10 | |
| 11 | WebIDL, and WebIDL regular expressions can be found at: |
| 12 | http://www.w3.org/TR/2012/CR-WebIDL-20120419/ |
| 13 | PLY can be found at: |
| 14 | http://www.dabeaz.com/ply/ |
| 15 | """ |

import optparse
import os.path
import sys

#
# Try to load the ply module; if that fails, assume it lives in the
# third_party directory.
#
try:
  # Disable lint check which fails to find the ply module.
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
    'float',
    'integer',
    'string',

    # Operators
    'ELLIPSIS',
    'LSHIFT',
    'RSHIFT',

    # Symbol and keyword types
    'COMMENT',
    'identifier',

    # Pepper Extras
    'INLINE',
  ]

  # 'keywords' is a map of string to token type. All tokens matching
  # KEYWORD_OR_SYMBOL are looked up in this dictionary to determine whether
  # the token is actually a keyword.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum' : 'ENUM',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'exception' : 'EXCEPTION',
    'getter': 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'label' : 'LABEL',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    'namespace' : 'NAMESPACE',
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial' : 'PARTIAL',
    'readonly' : 'READONLY',
    'sequence' : 'SEQUENCE',
    'setter': 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',
    'true' : 'TRUE',
    'unsigned' : 'UNSIGNED',
    'unrestricted' : 'UNRESTRICTED',
    'void' : 'VOID'
  }

  # Add keywords
  for key in keywords:
    tokens.append(keywords[key])

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?<>'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  # Constant values
  t_integer = r'-?(0([0-7]*|[Xx][0-9A-Fa-f]+)|[1-9][0-9]*)'
  t_float = r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)'
  t_float += r'([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
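
  # For illustration (not used by the lexer): the rules above match, e.g.,
  #   t_integer: '42', '017' (octal), '0x1F' (hex)
  #   t_float:   '1.0', '.5', '-2.5e10', '3E-2'
  # A bare '123' is matched by t_integer rather than t_float, since t_float
  # requires a decimal point or an exponent.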

  # Special multi-character operators
  t_ELLIPSIS = r'\.\.\.'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

  # A line ending '\n'; used to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))
  # We do not process escapes in the IDL strings. Strings are exclusively
  # used for attributes and enums, and not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment: /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that a symbol can have the same name
    # as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t
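
  # For illustration: with the rule above, '_interface' lexes as a token of
  # type 'identifier' with the value 'interface', while a bare 'interface'
  # lexes as the keyword token 'INTERFACE'.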

  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.lexobj.lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens cannot exist on any line except the last one,
    # so the recorded values for the earlier lines are unused. We still fill
    # the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for _ in range(count):
      self.index.append(self.lexobj.lexpos)
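
  # For illustration: after lexing 'interface Foo;\nstruct Bar;' the index
  # array holds [0, 15]: 0 is the start of line 1, and 15 (the position just
  # past the '\n') is the start of line 2. t_ANY_error above uses these
  # starting positions to turn a lexpos into a column within the line.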

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.lexobj.filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))
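
  # For illustration: ErrorMessage produces output of roughly the shape:
  #
  #   foo.idl(2) : Unrecognized input
  #   struct Bar;
  #          ^
  #
  # i.e. a file/line header from FileLineMsg followed by the offending
  # source line and a caret at column 'pos' from SourceLine.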

  def GetTokens(self):
    outlist = []
    while True:
      t = self.lexobj.token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def __init__(self, filename, data):
    self.index = [0]
    self.lex_errors = 0
    self.lines = data.split('\n')
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)
    self.lexobj.filename = filename
    self.lexobj.input(data)


#
# FileToTokens
#
# From a source file generate a list of tokens.
#
def FileToTokens(filename):
  with open(filename, 'rb') as srcfile:
    lexer = IDLLexer(filename, srcfile.read())
    return lexer.GetTokens()


#
# TextToTokens
#
# From a string of text, generate a list of tokens.
#
def TextToTokens(text):
  lexer = IDLLexer(None, text)
  return lexer.GetTokens()
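
# A minimal usage sketch (illustrative only): tokenizing a snippet with
# TextToTokens. Running this under Python 2 with PLY available prints the
# token types produced by the rules above.
#
#   tokens = TextToTokens('interface Foo { };')
#   print [t.type for t in tokens]
#   # Prints: ['INTERFACE', 'identifier', '{', '}', ';']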


#
# TestSameText
#
# From a set of tokens, generate a new source text by joining the token
# values with a newline. The new source is then tokenized and compared
# against the old set.
#
def TestSameText(filename):
  tokens1 = FileToTokens(filename)
  to_text = '\n'.join(['%s' % t.value for t in tokens1])
  tokens2 = TextToTokens(to_text)

  count1 = len(tokens1)
  count2 = len(tokens2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2:
      count1 = count2

  failed = 0
  for i in range(count1):
    if tokens1[i].value != tokens2[i].value:
      print "%d >>%s<< >>%s<<" % (i, tokens1[i].value, tokens2[i].value)
      failed = failed + 1

  return failed


#
# TestExpect
#
# From a set of token pairs, verify that the type of the second token in
# each pair matches the value of the first, so that an input of:
#   integer 123 float 1.1
# will generate a passing test, where '123' lexes to a token of type
# 'integer' and '1.1' lexes to a token of type 'float'.
#
def TestExpect(filename):
  tokens = FileToTokens(filename)
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    expect_type = tokens[index].value
    actual_type = tokens[index + 1].type
    index += 2

    if expect_type != actual_type:
      sys.stderr.write('Mismatch: Expected %s, but got %s.\n' %
                       (expect_type, actual_type))
      errors += 1

  return errors
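
# Illustrative only (the real inputs live in test_lexer/, e.g. values.in and
# keywords.in referenced in Main below): a keyword expectations file for
# TestExpect could contain pairs such as:
#
#   ATTRIBUTE attribute
#   INTERFACE interface
#
# Here 'ATTRIBUTE' lexes as an identifier whose value is 'ATTRIBUTE', and
# 'attribute' lexes as a token of type 'ATTRIBUTE', so the pair passes.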


def Main(args):
  parser = optparse.OptionParser()
  parser.add_option('--test', help='Run tests.', action='store_true')

  # If no arguments are provided, run tests.
  if len(args) == 0:
    args = ['--test', 'test_lexer/values.in', 'test_lexer/keywords.in']

  options, filenames = parser.parse_args(args)
  if not filenames:
    parser.error('No files specified.')

  for filename in filenames:
    try:
      if options.test:
        if TestSameText(filename):
          sys.stderr.write('Failed text match on %s.\n' % filename)
          return -1
        if TestExpect(filename):
          sys.stderr.write('Failed expected type match on %s.\n' % filename)
          return -1
        print 'Passed: ' + filename

    except lex.LexError as le:
      sys.stderr.write('%s\n' % str(le))

  return 0


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))