#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# IDL Lexer
#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and the WebIDL regular expressions, can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/

import os.path
import re
import sys

#
# Try to load the ply module; if that fails, assume it lives in the
# third_party directory, relative to ppapi.
#
try:
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions


Option('output', 'Generate output.')

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keyword types
    'COMMENT',
    'DESCRIBE',
    'ENUM',
    'LABEL',
    'SYMBOL',
    'INLINE',
    'INTERFACE',
    'READONLY',
    'STRUCT',
    'TYPEDEF',

    # Extra WebIDL keywords
    'CALLBACK',
    'DICTIONARY',
    'OPTIONAL',
    'STATIC',

    # Invented for apps use
    'NAMESPACE',


    # Data types
    'FLOAT',
    'OCT',
    'INT',
    'HEX',
    'STRING',

    # Operators
    'LSHIFT',
    'RSHIFT'
  ]

  # 'keywords' is a map of string to token type. All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum' : 'ENUM',
    'label' : 'LABEL',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'static' : 'STATIC',
    'namespace' : 'NAMESPACE',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_OCT = r'-?0[0-7]+'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings. Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment: /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols.
    t.type = self.keywords.get(t.value, 'SYMBOL')
    return t

  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue.
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position.
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused. We still
    # fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    if file: return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
      self.FileLineMsg(file, line, msg),
      self.SourceLine(file, line, pos))

  def SetData(self, filename, data):
    # Start with line 1, not zero.
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)
    self.lex_errors = 0

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)


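#
# Example usage (a sketch, not part of the tool): drive the lexer directly on
# an in-memory snippet. The tiny interface below is invented purely for
# illustration; FilesToTokens and TextToTokens below wrap this same loop.
#
def _ExampleTokenize():
  lexer = IDLLexer()
  lexer.SetData('example.idl', 'interface Sample { /* A comment */ float frac; };')
  pairs = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    # Keywords such as 'interface' come back with their keyword token type,
    # while unknown identifiers such as 'Sample' come back as SYMBOL tokens.
    pairs.append((t.type, t.value))
  return pairs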

#
# FilesToTokens
#
# From a set of source file names, generate a list of tokens.
#
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write('  Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist


def TokensFromText(text):
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  outlist = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist

#
# TextToTokens
#
# From a block of text, generate a list of tokens
#
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist

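#
# A minimal worked example (the snippet is invented for illustration):
#   TextToTokens('describe { int32_t; };')
# yields the token values
#   ['describe', '{', 'int32_t', ';', '}', ';']
# since keywords and symbols keep their source text as the value and
# single-character literals stand for themselves.
#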

#
# TestSame
#
# From a set of token values, generate a new source text by joining the
# values (one per line). The new source is then tokenized and compared
# against the old set.
#
def TestSame(values1):
  # Recreate the source from the tokens. We use newline instead of whitespace
  # since the '//' and #inline regex are line sensitive.
  text = '\n'.join(values1)
  values2 = TextToTokens(text)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values1[i], values2[i])

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(text)
    open('tokenized.txt', 'w').write('\n'.join(values2))

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  print "****************\n%s\n%s***************\n" % (text, '\n'.join(values2))
  sys.stdout.write('Same: Failed\n')
  return -1

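#
# A usage sketch (not invoked by the tool): round-trip a small invented
# snippet. Identical token lists print 'Same: Pass' and return 0.
#
def _ExampleTestSame():
  values = TextToTokens('enum Flags { /* two bits */ A = 1 << 2 };')
  return TestSame(values)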

#
# TestExpect
#
# From a set of token pairs, verify that the type field of the second matches
# the value of the first, so that:
#   INT 123 FLOAT 1.1
# will generate a passing test, where the first token is the SYMBOL INT,
# the second token is the INT 123, the third token is the SYMBOL FLOAT, and
# the fourth is the FLOAT 1.1, etc...
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch: Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1

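#
# A usage sketch (not invoked by the tool): tokenize a small invented
# expect-style snippet and check it. Each pair is a SYMBOL naming a type,
# followed by a token that should have that type, so this prints
# 'Expect: Pass' and returns 0.
#
def _ExampleTestExpect():
  lexer = IDLLexer()
  lexer.SetData('expect-sample', 'INT 123 FLOAT 1.1 HEX 0xA5')
  tokens = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    tokens.append(t)
  return TestExpect(tokens)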
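#
# Typical invocation: python idl_lexer.py <options> <idl files...>. The
# 'test', 'verbose', and 'output' options queried in Main() below are
# resolved through idl_option's ParseOptions/GetOption; only 'output' is
# declared in this file, so the other options are assumed to be registered
# by the shared idl_option module or its other users.
#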
def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
    return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))