#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for Web IDL

The lexer uses the PLY library to build a tokenizer which understands
Web IDL tokens.

Web IDL, including its regular expressions, can be found at:
  http://heycam.github.io/webidl/
PLY can be found at:
  http://www.dabeaz.com/ply/
"""

import os.path
import sys

SRC_DIR = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
sys.path.insert(0, os.path.join(SRC_DIR, 'third_party'))
from ply import lex


#
# IDL Lexer
#
class IDLLexer(object):
  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = r'"*.(){}[],;:=+-/~|&^?<>'
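
  # For example, lex returns the single character '{' as a token whose type
  # and value are both '{'.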

  # 't_ignore' contains ignored characters (spaces and tabs)
  t_ignore = ' \t'

  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Data types
    'float',
    'integer',
    'string',

    # Symbol and keyword types
    'SPECIAL_COMMENT',
    'identifier',

    # MultiChar operators
    'ELLIPSIS',
  ]

  # 'keywords' is a map of string to token type. All tokens matching
  # KEYWORD_OR_SYMBOL are matched against the keywords dictionary to
  # determine whether the token is actually a keyword.
  keywords = {
    'any': 'ANY',
    'async': 'ASYNC',
    'attribute': 'ATTRIBUTE',
    'boolean': 'BOOLEAN',
    'byte': 'BYTE',
    'ByteString': 'BYTESTRING',
    'callback': 'CALLBACK',
    'const': 'CONST',
    'constructor': 'CONSTRUCTOR',
    'deleter': 'DELETER',
    'dictionary': 'DICTIONARY',
    'DOMString': 'DOMSTRING',
    'double': 'DOUBLE',
    'enum': 'ENUM',
    'false': 'FALSE',
    'float': 'FLOAT',
    'FrozenArray': 'FROZENARRAY',
    'getter': 'GETTER',
    'includes': 'INCLUDES',
    'Infinity': 'INFINITY',
    'inherit': 'INHERIT',
    'interface': 'INTERFACE',
    'iterable': 'ITERABLE',
    'long': 'LONG',
    'maplike': 'MAPLIKE',
    'mixin': 'MIXIN',
    'namespace': 'NAMESPACE',
    'NaN': 'NAN',
    'null': 'NULL',
    'object': 'OBJECT',
    'octet': 'OCTET',
    'optional': 'OPTIONAL',
    'or': 'OR',
    'partial': 'PARTIAL',
    'Promise': 'PROMISE',
    'readonly': 'READONLY',
    'record': 'RECORD',
    'required': 'REQUIRED',
    'sequence': 'SEQUENCE',
    'setlike': 'SETLIKE',
    'setter': 'SETTER',
    'short': 'SHORT',
    'static': 'STATIC',
    'stringifier': 'STRINGIFIER',
    'true': 'TRUE',
    'typedef': 'TYPEDEF',
    'unrestricted': 'UNRESTRICTED',
    'unsigned': 'UNSIGNED',
    'USVString': 'USVSTRING',
    'void': 'VOID'
  }
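
  # Note: keyword lookup is case-sensitive, so 'interface' lexes as an
  # INTERFACE token while 'Interface' lexes as a plain identifier.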

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>. In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
  #
  # These need to be methods for lexer construction, despite not using self.
  # pylint: disable=R0201
  def t_ELLIPSIS(self, t):
    r'\.\.\.'
    return t

  # Regex needs to be in the docstring
  # pylint: disable=C0301
  def t_float(self, t):
    r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
    return t

  def t_integer(self, t):
    r'-?([1-9][0-9]*|0[Xx][0-9A-Fa-f]+|0[0-7]*)'
    return t
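
  # Informally, t_float accepts forms such as '1.5', '.5', '1e10' and '-4E-2',
  # while t_integer accepts decimal ('42'), hexadecimal ('0x1F') and octal
  # ('07') literals; a bare integer never matches t_float.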

  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings. Strings are exclusively
  # used for attributes and enums, and not used as typical 'C' constants.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t
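
  # For example, the four-character input '"Ok"' yields a string token whose
  # value is Ok: the surrounding quotes are stripped and any backslashes are
  # left untouched.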

  # A Javadoc style comment: /** xxx */
  # Unlike t_COMMENT, this is NOT ignored.
  # Also note that this should be defined before t_COMMENT.
  def t_SPECIAL_COMMENT(self, t):
    r'/\*\*(.|\n)+?\*/'
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment: /* xxx */ or //
  # This token is ignored.
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'[_-]?[A-Za-z][A-Za-z_0-9-]*'

    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the
    # same value as a keyword (e.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t
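
  # For example, '_interface' is emitted as an identifier token whose value is
  # 'interface', which lets the keyword spelling be used as a plain name.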

  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.Lexer().lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.Lexer().lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.Lexer().lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self._lex_errors += 1


  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line. In the case
    # of multiple lines, tokens cannot exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused. We
    # still fill the array, however, to make sure the line count is correct.
    self.Lexer().lineno += count
    for _ in range(count):
      self.index.append(self.Lexer().lexpos)

  def FileLineMsg(self, line, msg):
    # Generate a message containing the file and line number of a token.
    filename = self.Lexer().filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    # Create a source line marker
    caret = ' ' * pos + '^'
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))
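
  # For illustration only (the file name and source text below are
  # hypothetical), a report produced by ErrorMessage() looks roughly like:
  #
  #   foo.idl(3) : Unrecognized input
  #   interface @Foo {};
  #             ^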
219
[email protected]d4b86672013-04-11 16:28:31220#
221# Tokenizer
222#
223# The token function returns the next token provided by IDLLexer for matching
224# against the leaf paterns.
225#
226 def token(self):
[email protected]ac7b49d2013-04-12 18:48:47227 tok = self.Lexer().token()
[email protected]d4b86672013-04-11 16:28:31228 if tok:
229 self.last = tok
230 return tok
231

  def GetTokens(self):
    outlist = []
    while True:
      t = self.Lexer().token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def Tokenize(self, data, filename='__no_file__'):
    lexer = self.Lexer()
    lexer.lineno = 1
    lexer.filename = filename
    lexer.input(data)
    self.lines = data.split('\n')
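
  # Note: Tokenize() only feeds 'data' to the underlying PLY lexer; the
  # resulting tokens are pulled afterwards via token() or GetTokens().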

  def KnownTokens(self):
    return self.tokens

  def Lexer(self):
    if not self._lexobj:
      self._lexobj = lex.lex(object=self, lextab=None, optimize=0)
    return self._lexobj

  def _AddToken(self, token):
    if token in self.tokens:
      raise RuntimeError('Same token: ' + token)
    self.tokens.append(token)

  def _AddTokens(self, tokens):
    for token in tokens:
      self._AddToken(token)

  def _AddKeywords(self, keywords):
    for key in keywords:
      value = key.upper()
      self._AddToken(value)
      self.keywords[key] = value

  def _DelKeywords(self, keywords):
    for key in keywords:
      self.tokens.remove(key.upper())
      del self.keywords[key]

  def __init__(self):
    self.index = [0]
    self._lex_errors = 0
    self.filename = None
    self.keywords = {}
    self.tokens = []
    self._AddTokens(IDLLexer.tokens)
    self._AddKeywords(IDLLexer.keywords)
    self._lexobj = None
    self.last = None
    self.lines = None


# If run by itself, attempt to build the lexer
if __name__ == '__main__':
  lexer_object = IDLLexer()
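
# A possible extension of the block above (a sketch, not part of the original
# behaviour): read an IDL file named on the command line and dump its tokens.
#
#   if len(sys.argv) > 1:
#     with open(sys.argv[1]) as f:
#       lexer_object.Tokenize(f.read(), sys.argv[1])
#     for tok in lexer_object.GetTokens():
#       print('%s %s' % (tok.type, tok.value))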