#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL

The lexer uses the PLY library to build a tokenizer which understands both
WebIDL and Pepper tokens.

WebIDL, and the WebIDL regular expressions, can be found at:
  http://www.w3.org/TR/2012/CR-WebIDL-20120419/
PLY can be found at:
  http://www.dabeaz.com/ply/
"""

import optparse
import os.path
import sys

#
# Try to load the ply module; if that fails, assume it is in the third_party
# directory.
#
try:
  # Disable lint check which fails to find the ply module.
  # pylint: disable=F0401
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  # pylint: disable=F0401
  from ply import lex

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
    'float',
    'integer',
    'string',

    # Operators
    'ELLIPSIS',
    'LSHIFT',
    'RSHIFT',

    # Symbol and keyword types
    'COMMENT',
    'identifier',

    # Pepper Extras
    'INLINE',
  ]

  # 'keywords' is a map of string to token type. All tokens matching
  # KEYWORD_OR_SYMBOL are matched against the keywords dictionary to determine
  # if the token is actually a keyword.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum' : 'ENUM',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'exception' : 'EXCEPTION',
    'getter' : 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'label' : 'LABEL',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    'namespace' : 'NAMESPACE',
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial' : 'PARTIAL',
    'readonly' : 'READONLY',
    'sequence' : 'SEQUENCE',
    'setter' : 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',
    'true' : 'TRUE',
    'unsigned' : 'UNSIGNED',
    'unrestricted' : 'UNRESTRICTED',
    'void' : 'VOID'
  }
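
  # For example, the input text 'interface' produces a token of type
  # INTERFACE, while '_interface' produces an 'identifier' token whose value
  # is 'interface' (the leading underscore is stripped in
  # t_KEYWORD_OR_SYMBOL below).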

  # Add keywords
  for key in keywords:
    tokens.append(keywords[key])

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?<>'
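  # For example, the single character ';' in the input is emitted as a token
  # whose type and value are both ';'.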

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.

  # 't_ignore' is a special rule listing characters (spaces and tabs) that
  # are skipped between tokens.
  t_ignore = ' \t'
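  # Newlines are not in t_ignore; they are handled by t_LINE_END below so
  # that line numbers stay accurate.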

  # Constant values
  t_integer = r'-?(0([0-7]*|[Xx][0-9A-Fa-f]+)|[1-9][0-9]*)'
  t_float = r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)'
  t_float += r'([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
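  # For example, '123', '0755' and '0x1F' match t_integer, while '1.5', '.5'
  # and '2e10' match t_float.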

  # Special multi-character operators
  t_ELLIPSIS = r'\.\.\.'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings. Strings are exclusively
  # used for attributes and enums, and not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment: /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t
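
  # For example, an input of the form
  #   #inline c
  #   ... arbitrary text ...
  #   #endinl
  # is matched as a single INLINE token (the 'c' label here is illustrative;
  # the regular expression accepts any text between the markers).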

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.lexobj.lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for the previous lines are unused. We
    # still fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for _ in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.lexobj.filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker
    caret = '\t^'.expandtabs(pos)
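    # expandtabs(pos) turns the leading tab into 'pos' spaces, so the caret
    # ends up under the offending column of the source line printed above it.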
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))

  def GetTokens(self):
    outlist = []
    while True:
      t = self.lexobj.token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def __init__(self, filename, data):
    self.index = [0]
    self.lex_errors = 0
    self.lines = data.split('\n')
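    # Build the lexer from the token rules defined on this object
    # (object=self); with optimize=0, PLY rebuilds its tables on every run
    # rather than loading a cached lextab module.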
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)
    self.lexobj.filename = filename
    self.lexobj.input(data)


#
# FileToTokens
#
# From a source file, generate a list of tokens.
#
def FileToTokens(filename):
  with open(filename, 'rb') as srcfile:
    lexer = IDLLexer(filename, srcfile.read())
    return lexer.GetTokens()


#
# TextToTokens
#
# From a block of source text, generate a list of tokens.
#
def TextToTokens(text):
  lexer = IDLLexer(None, text)
  return lexer.GetTokens()
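
# For example (illustrative):
#   tokens = TextToTokens('interface Foo { attribute long bar; };')
#   [t.type for t in tokens]
#   => ['INTERFACE', 'identifier', '{', 'ATTRIBUTE', 'LONG', 'identifier',
#       ';', '}', ';']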


#
# TestSameText
#
# From a set of tokens, generate a new source text by joining the token
# values with newlines. The new source is then tokenized and compared
# against the old set.
#
def TestSameText(filename):
  tokens1 = FileToTokens(filename)
  to_text = '\n'.join(['%s' % t.value for t in tokens1])
  tokens2 = TextToTokens(to_text)

  count1 = len(tokens1)
  count2 = len(tokens2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2:
      count1 = count2

  failed = 0
  for i in range(count1):
    if tokens1[i].value != tokens2[i].value:
      print "%d >>%s<< >>%s<<" % (i, tokens1[i].value, tokens2[i].value)
      failed = failed + 1

  return failed


#
# TestExpect
#
# From a list of token pairs, verify that the type of the second token in
# each pair matches the value of the first. For example, the input:
#   integer 123 float 1.1
# passes because the first token of each pair names the expected type
# ('integer', 'float') and the following token ('123', '1.1') is lexed with
# that type.
#
def TestExpect(filename):
  tokens = FileToTokens(filename)
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    expect_type = tokens[index].value
    actual_type = tokens[index + 1].type
    index += 2

    if expect_type != actual_type:
      sys.stderr.write('Mismatch: Expected %s, but got %s.\n' %
                       (expect_type, actual_type))
      errors += 1

  return errors


def Main(args):
  parser = optparse.OptionParser()
  parser.add_option('--test', help='Run tests.', action='store_true')

  # If no arguments are provided, run tests.
  if len(args) == 0:
    args = ['--test', 'test_lexer/values.in', 'test_lexer/keywords.in']

  options, filenames = parser.parse_args(args)
  if not filenames:
    parser.error('No files specified.')

  for filename in filenames:
    try:
      if options.test:
        if TestSameText(filename):
          sys.stderr.write('Failed text match on %s.\n' % filename)
          return -1
        if TestExpect(filename):
          sys.stderr.write('Failed expected type match on %s.\n' % filename)
          return -1
        print 'Passed: ' + filename

    except lex.LexError as le:
      sys.stderr.write('%s\n' % str(le))

  return 0


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))