#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for PPAPI IDL """

#
# IDL Lexer
#
# The lexer uses the PLY lex library to build a tokenizer which understands
# WebIDL tokens.
#
# WebIDL, and WebIDL regular expressions, can be found at:
#   http://dev.w3.org/2006/webapi/WebIDL/
# PLY can be found at:
#   http://www.dabeaz.com/ply/

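#
# As a rough sketch of what this lexer produces, the invented IDL line:
#
#   interface Foo { };
#
# lexes to an INTERFACE keyword token, a SYMBOL token for 'Foo', and the
# literal tokens '{', '}' and ';' (token types are defined in IDLLexer below).
#
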
import os.path
import re
import sys

#
# Try to load the ply module; if that fails, assume it is in the third_party
# directory, relative to ppapi.
#
try:
  from ply import lex
except ImportError:
  module_path, module_name = os.path.split(__file__)
  third_party = os.path.join(module_path, '..', '..', 'third_party')
  sys.path.append(third_party)
  from ply import lex

from idl_option import GetOption, Option, ParseOptions


Option('output', 'Generate output.')

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keywords types
    'COMMENT',
    'DESCRIBE',
    'ENUM',
    'LABEL',
    'SYMBOL',
    'INLINE',
    'INTERFACE',
    # 'READONLY' is declared here since the keywords map below can emit it.
    'READONLY',
    'STRUCT',
    'TYPEDEF',

    # Extra WebIDL keywords
    'CALLBACK',
    'DICTIONARY',
    'OPTIONAL',
    'STATIC',

    # Invented for apps use
    'NAMESPACE',

    # Data types
    'FLOAT',
    'OCT',
    'INT',
    'HEX',
    'STRING',

    # Operators
    'LSHIFT',
    'RSHIFT'
  ]

  # 'keywords' is a map of string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum' : 'ENUM',
    'label' : 'LABEL',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',

    'callback' : 'CALLBACK',
    'dictionary' : 'DICTIONARY',
    'optional' : 'OPTIONAL',
    'static' : 'STATIC',
    'namespace' : 'NAMESPACE',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-/~|&^?'

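  # For example, in a declaration such as 'typedef int32_t size[3];' (an
  # invented snippet), the characters '[', ']' and ';' come through as
  # literal tokens whose type and value are both that single character.
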
  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made.  These
  # definitions come from WebIDL.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_INT = r'-?[0-9]+[uU]?'
  t_OCT = r'-?0[0-7]+'
  t_HEX = r'-?0[Xx][0-9A-Fa-f]+'
  t_LSHIFT = r'<<'
  t_RSHIFT = r'>>'

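  # For illustration, sample matches for the rules above:
  #   '1.5', '2.', '-3e10'  -> FLOAT
  #   '0x1F', '-0Xff'       -> HEX
  #   '42', '-7', '42U'     -> INT
  # Note that PLY tries string rules in order of decreasing regex length,
  # so INT is attempted before the shorter OCT rule; an octal-looking
  # literal such as '017' will therefore actually lex as INT.
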
  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t
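
  # For example, the input '"1.0"' is returned as a STRING token whose value
  # is 1.0 -- the surrounding quotes are stripped and any backslashes are
  # left untouched.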

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # Return a "preprocessor" inline block
  def t_INLINE(self, t):
    r'\#inline (.|\n)*?\#endinl.*'
    self.AddLines(t.value.count('\n'))
    return t

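  # For illustration, an inline block in the IDL source looks like (the body
  # here is invented):
  #   #inline c
  #   /* verbatim text carried along as a single token */
  #   #endinl
  # and is returned as one INLINE token covering the whole block.
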
  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'SYMBOL')
    return t

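  # For example, 'struct' is found in the keywords map and emitted as a
  # STRUCT token, while an identifier like 'PPB_Audio' falls through and is
  # emitted as a SYMBOL token.
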
  def t_ANY_error(self, t):
    msg = "Unrecognized input"
    line = self.lexobj.lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.lexobj.lexpos - offs)
      msg = "Unexpected EoF reached after"

    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, msg)
    sys.stderr.write(out + '\n')
    self.lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused.  We
    # still fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    # 'line' is already 1-based (see SetData), so report it directly.
    if file: return "%s(%d) : %s" % (file, line, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))
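
  # Pieced together, an error report looks roughly like (the file name, line
  # number and source text here are illustrative):
  #
  #   foo.idl(2) : Unrecognized input
  #   <text of line 2>
  #       ^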

  def SetData(self, filename, data):
    # Start with line 1, not zero
    self.lexobj.lineno = 1
    self.lexobj.filename = filename
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)
    self.lex_errors = 0

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)


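#
# A minimal usage sketch of IDLLexer (the IDL snippet is invented; the
# driving loop mirrors FilesToTokens below):
#
#   lexer = IDLLexer()
#   lexer.SetData('example.idl', 'interface Foo { };')
#   while 1:
#     t = lexer.lexobj.token()
#     if t is None: break
#     print t.type, t.value
#
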
#
# FilesToTokens
#
# From a set of source file names, generate a list of tokens.
#
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write('  Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist


#
# TokensFromText
#
# From a block of text, generate a list of token values.
#
def TokensFromText(text):
  lexer = IDLLexer()
  lexer.SetData('unknown', text)
  outlist = []
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist

#
# TextToTokens
#
# From a block of text, generate a list of token values.
#
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist


#
# TestSame
#
# From a set of token values, generate a new source text by joining with
# newlines.  The new source is then tokenized and compared against the
# old set.
#
def TestSame(values1):
  # Recreate the source from the tokens.  We use newline instead of a space
  # since the '//' and #inline regexes are line sensitive.
  src1 = '\n'.join(values1)
  values2 = TextToTokens(src1)
  src2 = '\n'.join(values2)

  count1 = len(values1)
  count2 = len(values2)
  if count1 != count2:
    print "Size mismatch original %d vs %d\n" % (count1, count2)
    if count1 > count2: count1 = count2

  for i in range(count1):
    if values1[i] != values2[i]:
      print "%d >>%s<< >>%s<<" % (i, values1[i], values2[i])

  if GetOption('output'):
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(src1)
    open('tokenized.txt', 'w').write(src2)

  if values1 == values2:
    sys.stdout.write('Same: Pass\n')
    return 0

  print "****************\n%s\n%s***************\n" % (src1, src2)
  sys.stdout.write('Same: Failed\n')
  return -1

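# For example, TestSame(['interface', 'Foo', '{', '}', ';']) rebuilds the
# source from the values, re-lexes it to the same five values, prints
# 'Same: Pass' and returns 0.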

#
# TestExpect
#
# From a set of token pairs, verify that the type field of the second token
# in each pair matches the value of the first, so that:
#   INT 123 FLOAT 1.1
# will generate a passing test, where the first token is the SYMBOL INT,
# the second token is the INT 123, the third token is the SYMBOL FLOAT and
# the fourth is the FLOAT 1.1, etc...
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch: Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1


def Main(args):
  filenames = ParseOptions(args)

  try:
    tokens = FilesToTokens(filenames, GetOption('verbose'))
    values = [tok.value for tok in tokens]
    if GetOption('output'): sys.stdout.write(' <> '.join(values) + '\n')
    if GetOption('test'):
      if TestSame(values):
        return -1
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
    return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))