#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for Web IDL

The lexer uses the PLY library to build a tokenizer which understands
Web IDL tokens.

The Web IDL specification, including its regular expressions, can be found at:
   http://heycam.github.io/webidl/
PLY can be found at:
   http://www.dabeaz.com/ply/
"""
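
# A minimal usage sketch (illustrative only; the IDL snippet is made up):
# a caller constructs the lexer, feeds it Web IDL text with Tokenize(), and
# then pulls tokens one at a time with token():
#
#   lexer = IDLLexer()
#   lexer.Tokenize('enum Color { "red", "green" };')
#   tok = lexer.token()
#   while tok:
#     # tok.type is e.g. 'ENUM', 'identifier' or 'string'
#     tok = lexer.token()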

import os.path
import sys

SRC_DIR = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
sys.path.insert(0, os.path.join(SRC_DIR, 'third_party'))
from ply import lex


#
# IDL Lexer
#
class IDLLexer(object):
  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = r'"*.(){}[],;:=+-/~|&^?<>'

  # 't_ignore' contains ignored characters (spaces and tabs)
  t_ignore = ' \t'

  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
    'float',
    'integer',
    'string',

    # Symbol and keyword types
    'COMMENT',
    'identifier',

    # MultiChar operators
    'ELLIPSIS',
  ]

  # 'keywords' is a map of string to token type. All tokens matching
  # KEYWORD_OR_SYMBOL are matched against the keywords dictionary to determine
  # whether the token is actually a keyword.
  keywords = {
    'any' : 'ANY',
    'attribute' : 'ATTRIBUTE',
    'boolean' : 'BOOLEAN',
    'byte' : 'BYTE',
    'ByteString' : 'BYTESTRING',
    'callback' : 'CALLBACK',
    'const' : 'CONST',
    'creator' : 'CREATOR',
    'Date' : 'DATE',
    'deleter' : 'DELETER',
    'dictionary' : 'DICTIONARY',
    'DOMString' : 'DOMSTRING',
    'double' : 'DOUBLE',
    'enum' : 'ENUM',
    'exception' : 'EXCEPTION',
    'false' : 'FALSE',
    'float' : 'FLOAT',
    'FrozenArray' : 'FROZENARRAY',
    'getter': 'GETTER',
    'implements' : 'IMPLEMENTS',
    'Infinity' : 'INFINITY',
    'inherit' : 'INHERIT',
    'interface' : 'INTERFACE',
    'iterable': 'ITERABLE',
    'legacycaller' : 'LEGACYCALLER',
    'long' : 'LONG',
    'maplike': 'MAPLIKE',
    'NaN' : 'NAN',
    'null' : 'NULL',
    'object' : 'OBJECT',
    'octet' : 'OCTET',
    'optional' : 'OPTIONAL',
    'or' : 'OR',
    'partial' : 'PARTIAL',
    'Promise' : 'PROMISE',
    'readonly' : 'READONLY',
    'RegExp' : 'REGEXP',
    'record' : 'RECORD',
    'required' : 'REQUIRED',
    'sequence' : 'SEQUENCE',
    'serializer' : 'SERIALIZER',
    'setlike' : 'SETLIKE',
    'setter': 'SETTER',
    'short' : 'SHORT',
    'static' : 'STATIC',
    'stringifier' : 'STRINGIFIER',
    'typedef' : 'TYPEDEF',
    'true' : 'TRUE',
    'unsigned' : 'UNSIGNED',
    'unrestricted' : 'UNRESTRICTED',
    'USVString' : 'USVSTRING',
    'void' : 'VOID'
  }

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
  #
  # These need to be methods for lexer construction, despite not using self.
  # pylint: disable=R0201
  def t_ELLIPSIS(self, t):
    r'\.\.\.'
    return t

  # Regex needs to be in the docstring
  # pylint: disable=C0301
  def t_float(self, t):
    r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
    return t

  def t_integer(self, t):
    r'-?([1-9][0-9]*|0[Xx][0-9A-Fa-f]+|0[0-7]*)'
    return t

  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings. Strings are exclusively
  # used for attributes and enums, and not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment: /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))
    return t

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'_?[A-Za-z][A-Za-z_0-9]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.Lexer().lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.Lexer().lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.Lexer().lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self._lex_errors += 1

  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused. We still
    # fill the array, however, to make sure the line count is correct.
    self.Lexer().lineno += count
    for _ in range(count):
      self.index.append(self.Lexer().lexpos)

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.Lexer().filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker
    caret = ' ' * pos + '^'
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))

#
# Tokenizer
#
# The token function returns the next token provided by IDLLexer for matching
# against the leaf patterns.
#
  def token(self):
    tok = self.Lexer().token()
    if tok:
      self.last = tok
    return tok

  def GetTokens(self):
    outlist = []
    while True:
      t = self.Lexer().token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def Tokenize(self, data, filename='__no_file__'):
    lexer = self.Lexer()
    lexer.lineno = 1
    lexer.filename = filename
    lexer.input(data)
    self.lines = data.split('\n')

  def KnownTokens(self):
    return self.tokens

  def Lexer(self):
    if not self._lexobj:
      self._lexobj = lex.lex(object=self, lextab=None, optimize=0)
    return self._lexobj

  def _AddToken(self, token):
    if token in self.tokens:
      raise RuntimeError('Same token: ' + token)
    self.tokens.append(token)

  def _AddTokens(self, tokens):
    for token in tokens:
      self._AddToken(token)

  def _AddKeywords(self, keywords):
    for key in keywords:
      value = key.upper()
      self._AddToken(value)
      self.keywords[key] = value

  def _DelKeywords(self, keywords):
    for key in keywords:
      self.tokens.remove(key.upper())
      del self.keywords[key]

  def __init__(self):
    self.index = [0]
    self._lex_errors = 0
    self.lines = []
    self.filename = None
    self.keywords = {}
    self.tokens = []
    self._AddTokens(IDLLexer.tokens)
    self._AddKeywords(IDLLexer.keywords)
    self._lexobj = None
    self.last = None


# If run by itself, attempt to build the lexer
if __name__ == '__main__':
  lexer_object = IDLLexer()
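  # A minimal usage sketch: feed the lexer a small, made-up Web IDL fragment
  # through Tokenize() and print every token it produces via GetTokens().
  lexer_object.Tokenize('interface Foo { attribute long bar; };')
  for token in lexer_object.GetTokens():
    print('%s: %s' % (token.type, token.value))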