blob: 0568becbc8dcb51f92808b74c14a83de5cc45f32 [file] [log] [blame]
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#include "tools/gn/tokenizer.h"
6
7#include "base/logging.h"
scottmgbe5d4512014-09-24 02:29:128#include "base/strings/string_util.h"
[email protected]96ea63d2013-07-30 10:17:079#include "tools/gn/input_file.h"
10
11namespace {
12
// Returns true if |c| may be the first character of a two-character operator
// (for example "<=", "!=", "+=", "&&" or "||").
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
17
// Returns true if |c| may be the second character of a two-character
// operator: '=' (as in "+=", "==", "<="), '|' (as in "||") or '&' (as in
// "&&").
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
21
// Returns true if |c| on its own may be an operator character. Note that this
// accepts a few characters (such as ':') that are not complete operators by
// themselves; the caller resolves the final operator type from the full
// token text.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
26
27bool CouldBeOperator(char c) {
28 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
29}
30
// Returns true if |c| is an opening or closing paren, bracket or brace.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
34
[email protected]f38ddec2013-08-15 23:59:1135Token::Type GetSpecificOperatorType(base::StringPiece value) {
36 if (value == "=")
37 return Token::EQUAL;
38 if (value == "+")
39 return Token::PLUS;
40 if (value == "-")
41 return Token::MINUS;
42 if (value == "+=")
43 return Token::PLUS_EQUALS;
44 if (value == "-=")
45 return Token::MINUS_EQUALS;
46 if (value == "==")
47 return Token::EQUAL_EQUAL;
48 if (value == "!=")
49 return Token::NOT_EQUAL;
50 if (value == "<=")
51 return Token::LESS_EQUAL;
52 if (value == ">=")
53 return Token::GREATER_EQUAL;
54 if (value == "<")
55 return Token::LESS_THAN;
56 if (value == ">")
57 return Token::GREATER_THAN;
58 if (value == "&&")
59 return Token::BOOLEAN_AND;
60 if (value == "||")
61 return Token::BOOLEAN_OR;
62 if (value == "!")
63 return Token::BANG;
[email protected]51d01722014-03-26 16:57:0764 if (value == ".")
65 return Token::DOT;
[email protected]f38ddec2013-08-15 23:59:1166 return Token::INVALID;
67}
68
[email protected]96ea63d2013-07-30 10:17:0769} // namespace
70
71Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
72 : input_file_(input_file),
73 input_(input_file->contents()),
74 err_(err),
75 cur_(0),
76 line_number_(1),
tfarina21aff0d2016-01-06 19:50:0877 column_number_(1) {
[email protected]96ea63d2013-07-30 10:17:0778}
79
80Tokenizer::~Tokenizer() {
81}
82
83// static
84std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
85 Tokenizer t(input_file, err);
86 return t.Run();
87}
88
89std::vector<Token> Tokenizer::Run() {
[email protected]f38ddec2013-08-15 23:59:1190 DCHECK(tokens_.empty());
[email protected]96ea63d2013-07-30 10:17:0791 while (!done()) {
92 AdvanceToNextToken();
93 if (done())
94 break;
95 Location location = GetCurrentLocation();
96
97 Token::Type type = ClassifyCurrent();
98 if (type == Token::INVALID) {
99 *err_ = GetErrorForInvalidToken(location);
100 break;
101 }
102 size_t token_begin = cur_;
103 AdvanceToEndOfToken(location, type);
104 if (has_error())
105 break;
106 size_t token_end = cur_;
107
[email protected]f38ddec2013-08-15 23:59:11108 base::StringPiece token_value(&input_.data()[token_begin],
109 token_end - token_begin);
110
scottmgbe5d4512014-09-24 02:29:12111 if (type == Token::UNCLASSIFIED_OPERATOR) {
[email protected]f38ddec2013-08-15 23:59:11112 type = GetSpecificOperatorType(token_value);
scottmgbe5d4512014-09-24 02:29:12113 } else if (type == Token::IDENTIFIER) {
[email protected]f38ddec2013-08-15 23:59:11114 if (token_value == "if")
115 type = Token::IF;
[email protected]ed7d2be22013-08-20 17:23:15116 else if (token_value == "else")
[email protected]f38ddec2013-08-15 23:59:11117 type = Token::ELSE;
[email protected]ed7d2be22013-08-20 17:23:15118 else if (token_value == "true")
119 type = Token::TRUE_TOKEN;
120 else if (token_value == "false")
121 type = Token::FALSE_TOKEN;
scottmgbe5d4512014-09-24 02:29:12122 } else if (type == Token::UNCLASSIFIED_COMMENT) {
scottmg7b80f172014-09-24 03:35:13123 if (AtStartOfLine(token_begin) &&
124 // If it's a standalone comment, but is a continuation of a comment on
125 // a previous line, then instead make it a continued suffix comment.
126 (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT ||
127 tokens_.back().location().line_number() + 1 !=
128 location.line_number() ||
tfarina21aff0d2016-01-06 19:50:08129 tokens_.back().location().column_number() !=
130 location.column_number())) {
scottmgbe5d4512014-09-24 02:29:12131 type = Token::LINE_COMMENT;
brettw693c60562015-08-07 19:55:17132 if (!at_end()) // Could be EOF.
133 Advance(); // The current \n.
scottmg2dd93e8b2014-09-26 04:07:05134 // If this comment is separated from the next syntax element, then we
135 // want to tag it as a block comment. This will become a standalone
136 // statement at the parser level to keep this comment separate, rather
137 // than attached to the subsequent statement.
138 while (!at_end() && IsCurrentWhitespace()) {
139 if (IsCurrentNewline()) {
140 type = Token::BLOCK_COMMENT;
141 break;
142 }
143 Advance();
144 }
scottmg7b80f172014-09-24 03:35:13145 } else {
scottmgbe5d4512014-09-24 02:29:12146 type = Token::SUFFIX_COMMENT;
scottmg7b80f172014-09-24 03:35:13147 }
[email protected]f38ddec2013-08-15 23:59:11148 }
149
scottmgbe5d4512014-09-24 02:29:12150 tokens_.push_back(Token(location, type, token_value));
[email protected]96ea63d2013-07-30 10:17:07151 }
152 if (err_->has_error())
[email protected]f38ddec2013-08-15 23:59:11153 tokens_.clear();
154 return tokens_;
[email protected]96ea63d2013-07-30 10:17:07155}
156
157// static
158size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
[email protected]481c3e82014-07-18 01:40:47159 DCHECK_GT(n, 0);
[email protected]96ea63d2013-07-30 10:17:07160
161 if (n == 1)
162 return 0;
163
[email protected]481c3e82014-07-18 01:40:47164 int cur_line = 1;
165 size_t cur_byte = 0;
[email protected]96ea63d2013-07-30 10:17:07166 while (cur_byte < buf.size()) {
167 if (IsNewline(buf, cur_byte)) {
168 cur_line++;
169 if (cur_line == n)
170 return cur_byte + 1;
171 }
172 cur_byte++;
173 }
[email protected]481c3e82014-07-18 01:40:47174 return static_cast<size_t>(-1);
[email protected]96ea63d2013-07-30 10:17:07175}
176
177// static
178bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
179 DCHECK(offset < buffer.size());
180 // We may need more logic here to handle different line ending styles.
181 return buffer[offset] == '\n';
182}
183
tfarina0c60e3a92015-12-01 00:10:50184// static
185bool Tokenizer::IsIdentifierFirstChar(char c) {
186 return base::IsAsciiAlpha(c) || c == '_';
187}
188
189// static
190bool Tokenizer::IsIdentifierContinuingChar(char c) {
191 // Also allow digits after the first char.
192 return IsIdentifierFirstChar(c) || base::IsAsciiDigit(c);
193}
[email protected]96ea63d2013-07-30 10:17:07194
195void Tokenizer::AdvanceToNextToken() {
196 while (!at_end() && IsCurrentWhitespace())
197 Advance();
198}
199
200Token::Type Tokenizer::ClassifyCurrent() const {
201 DCHECK(!at_end());
202 char next_char = cur_char();
brettwb3413062015-06-24 00:39:02203 if (base::IsAsciiDigit(next_char))
[email protected]96ea63d2013-07-30 10:17:07204 return Token::INTEGER;
205 if (next_char == '"')
206 return Token::STRING;
207
208 // Note: '-' handled specially below.
209 if (next_char != '-' && CouldBeOperator(next_char))
[email protected]f38ddec2013-08-15 23:59:11210 return Token::UNCLASSIFIED_OPERATOR;
[email protected]96ea63d2013-07-30 10:17:07211
212 if (IsIdentifierFirstChar(next_char))
213 return Token::IDENTIFIER;
214
[email protected]f38ddec2013-08-15 23:59:11215 if (next_char == '[')
216 return Token::LEFT_BRACKET;
217 if (next_char == ']')
218 return Token::RIGHT_BRACKET;
219 if (next_char == '(')
220 return Token::LEFT_PAREN;
221 if (next_char == ')')
222 return Token::RIGHT_PAREN;
223 if (next_char == '{')
224 return Token::LEFT_BRACE;
225 if (next_char == '}')
226 return Token::RIGHT_BRACE;
[email protected]96ea63d2013-07-30 10:17:07227
[email protected]51d01722014-03-26 16:57:07228 if (next_char == '.')
229 return Token::DOT;
[email protected]f38ddec2013-08-15 23:59:11230 if (next_char == ',')
231 return Token::COMMA;
[email protected]96ea63d2013-07-30 10:17:07232
233 if (next_char == '#')
scottmgbe5d4512014-09-24 02:29:12234 return Token::UNCLASSIFIED_COMMENT;
[email protected]96ea63d2013-07-30 10:17:07235
236 // For the case of '-' differentiate between a negative number and anything
237 // else.
238 if (next_char == '-') {
239 if (!CanIncrement())
[email protected]f38ddec2013-08-15 23:59:11240 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
241 // file.
[email protected]96ea63d2013-07-30 10:17:07242 char following_char = input_[cur_ + 1];
brettwb3413062015-06-24 00:39:02243 if (base::IsAsciiDigit(following_char))
[email protected]96ea63d2013-07-30 10:17:07244 return Token::INTEGER;
[email protected]f38ddec2013-08-15 23:59:11245 return Token::UNCLASSIFIED_OPERATOR;
[email protected]96ea63d2013-07-30 10:17:07246 }
247
248 return Token::INVALID;
249}
250
251void Tokenizer::AdvanceToEndOfToken(const Location& location,
252 Token::Type type) {
253 switch (type) {
254 case Token::INTEGER:
255 do {
256 Advance();
brettwb3413062015-06-24 00:39:02257 } while (!at_end() && base::IsAsciiDigit(cur_char()));
[email protected]96ea63d2013-07-30 10:17:07258 if (!at_end()) {
259 // Require the char after a number to be some kind of space, scope,
260 // or operator.
261 char c = cur_char();
262 if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
[email protected]f38ddec2013-08-15 23:59:11263 !IsScoperChar(c) && c != ',') {
[email protected]96ea63d2013-07-30 10:17:07264 *err_ = Err(GetCurrentLocation(),
[email protected]df15e82d2014-05-15 19:41:58265 "This is not a valid number.",
266 "Learn to count.");
[email protected]96ea63d2013-07-30 10:17:07267 // Highlight the number.
268 err_->AppendRange(LocationRange(location, GetCurrentLocation()));
269 }
270 }
271 break;
272
273 case Token::STRING: {
274 char initial = cur_char();
275 Advance(); // Advance past initial "
276 for (;;) {
277 if (at_end()) {
[email protected]df15e82d2014-05-15 19:41:58278 *err_ = Err(LocationRange(location, GetCurrentLocation()),
279 "Unterminated string literal.",
280 "Don't leave me hanging like this!");
[email protected]96ea63d2013-07-30 10:17:07281 break;
282 }
283 if (IsCurrentStringTerminator(initial)) {
284 Advance(); // Skip past last "
285 break;
tfarina9d6f5442014-12-18 04:42:21286 } else if (IsCurrentNewline()) {
[email protected]df15e82d2014-05-15 19:41:58287 *err_ = Err(LocationRange(location, GetCurrentLocation()),
288 "Newline in string constant.");
[email protected]96ea63d2013-07-30 10:17:07289 }
290 Advance();
291 }
292 break;
293 }
294
[email protected]f38ddec2013-08-15 23:59:11295 case Token::UNCLASSIFIED_OPERATOR:
[email protected]96ea63d2013-07-30 10:17:07296 // Some operators are two characters, some are one.
297 if (CouldBeTwoCharOperatorBegin(cur_char())) {
298 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
299 Advance();
300 }
301 Advance();
302 break;
303
304 case Token::IDENTIFIER:
305 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
306 Advance();
307 break;
308
[email protected]f38ddec2013-08-15 23:59:11309 case Token::LEFT_BRACKET:
310 case Token::RIGHT_BRACKET:
311 case Token::LEFT_BRACE:
312 case Token::RIGHT_BRACE:
313 case Token::LEFT_PAREN:
314 case Token::RIGHT_PAREN:
[email protected]51d01722014-03-26 16:57:07315 case Token::DOT:
[email protected]f38ddec2013-08-15 23:59:11316 case Token::COMMA:
[email protected]96ea63d2013-07-30 10:17:07317 Advance(); // All are one char.
318 break;
319
scottmgbe5d4512014-09-24 02:29:12320 case Token::UNCLASSIFIED_COMMENT:
[email protected]96ea63d2013-07-30 10:17:07321 // Eat to EOL.
322 while (!at_end() && !IsCurrentNewline())
323 Advance();
324 break;
325
326 case Token::INVALID:
[email protected]6ac871ea2013-08-19 21:04:50327 default:
[email protected]96ea63d2013-07-30 10:17:07328 *err_ = Err(location, "Everything is all messed up",
329 "Please insert system disk in drive A: and press any key.");
330 NOTREACHED();
331 return;
332 }
333}
334
scottmgbe5d4512014-09-24 02:29:12335bool Tokenizer::AtStartOfLine(size_t location) const {
336 while (location > 0) {
337 --location;
338 char c = input_[location];
339 if (c == '\n')
340 return true;
341 if (c != ' ')
342 return false;
343 }
344 return true;
345}
346
[email protected]96ea63d2013-07-30 10:17:07347bool Tokenizer::IsCurrentWhitespace() const {
348 DCHECK(!at_end());
349 char c = input_[cur_];
scottmgbe5d4512014-09-24 02:29:12350 // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
351 return c == 0x0A || c == 0x0D || c == 0x20;
[email protected]96ea63d2013-07-30 10:17:07352}
353
354bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
355 DCHECK(!at_end());
356 if (cur_char() != quote_char)
357 return false;
358
359 // Check for escaping. \" is not a string terminator, but \\" is. Count
360 // the number of preceeding backslashes.
361 int num_backslashes = 0;
362 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
363 num_backslashes++;
364
365 // Even backslashes mean that they were escaping each other and don't count
366 // as escaping this quote.
367 return (num_backslashes % 2) == 0;
368}
369
370bool Tokenizer::IsCurrentNewline() const {
371 return IsNewline(input_, cur_);
372}
373
374void Tokenizer::Advance() {
375 DCHECK(cur_ < input_.size());
376 if (IsCurrentNewline()) {
[email protected]96ea63d2013-07-30 10:17:07377 line_number_++;
tfarina21aff0d2016-01-06 19:50:08378 column_number_ = 1;
[email protected]96ea63d2013-07-30 10:17:07379 } else {
tfarina21aff0d2016-01-06 19:50:08380 column_number_++;
[email protected]96ea63d2013-07-30 10:17:07381 }
382 cur_++;
383}
384
385Location Tokenizer::GetCurrentLocation() const {
scottmgbe5d4512014-09-24 02:29:12386 return Location(
tfarina21aff0d2016-01-06 19:50:08387 input_file_, line_number_, column_number_, static_cast<int>(cur_));
[email protected]96ea63d2013-07-30 10:17:07388}
389
390Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
391 std::string help;
392 if (cur_char() == ';') {
393 // Semicolon.
394 help = "Semicolons are not needed, delete this one.";
395 } else if (cur_char() == '\t') {
396 // Tab.
397 help = "You got a tab character in here. Tabs are evil. "
398 "Convert to spaces.";
399 } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
400 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
401 // Different types of comments.
402 help = "Comments should start with # instead";
tfarinadd495002015-03-23 18:34:19403 } else if (cur_char() == '\'') {
404 help = "Strings are delimited by \" characters, not apostrophes.";
[email protected]96ea63d2013-07-30 10:17:07405 } else {
406 help = "I have no idea what this is.";
407 }
408
409 return Err(location, "Invalid token.", help);
410}