blob: c0890060b37e80cf931901abf0b2b3bb68aec344 [file] [log] [blame]
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#include "tools/gn/tokenizer.h"
6
7#include "base/logging.h"
8#include "tools/gn/input_file.h"
9
10namespace {
11
// Returns true if |c| is a character that may start a two-character
// operator: one of < > ! = - + | &.
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
16
// Returns true if |c| is a character that may be the second character of a
// two-character operator: one of = | &.
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
20
// Returns true if |c| is a character that is treated as a candidate
// single-character operator: one of = < > + ! : | & -.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
25
26bool CouldBeOperator(char c) {
27 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
28}
29
// Returns true if |c| is an opening or closing parenthesis, square bracket,
// or curly brace.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
33
[email protected]f38ddec2013-08-15 23:59:1134Token::Type GetSpecificOperatorType(base::StringPiece value) {
35 if (value == "=")
36 return Token::EQUAL;
37 if (value == "+")
38 return Token::PLUS;
39 if (value == "-")
40 return Token::MINUS;
41 if (value == "+=")
42 return Token::PLUS_EQUALS;
43 if (value == "-=")
44 return Token::MINUS_EQUALS;
45 if (value == "==")
46 return Token::EQUAL_EQUAL;
47 if (value == "!=")
48 return Token::NOT_EQUAL;
49 if (value == "<=")
50 return Token::LESS_EQUAL;
51 if (value == ">=")
52 return Token::GREATER_EQUAL;
53 if (value == "<")
54 return Token::LESS_THAN;
55 if (value == ">")
56 return Token::GREATER_THAN;
57 if (value == "&&")
58 return Token::BOOLEAN_AND;
59 if (value == "||")
60 return Token::BOOLEAN_OR;
61 if (value == "!")
62 return Token::BANG;
[email protected]51d01722014-03-26 16:57:0763 if (value == ".")
64 return Token::DOT;
[email protected]f38ddec2013-08-15 23:59:1165 return Token::INVALID;
66}
67
[email protected]96ea63d2013-07-30 10:17:0768} // namespace
69
70Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
71 : input_file_(input_file),
72 input_(input_file->contents()),
73 err_(err),
74 cur_(0),
75 line_number_(1),
76 char_in_line_(1) {
77}
78
79Tokenizer::~Tokenizer() {
80}
81
82// static
83std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
84 Tokenizer t(input_file, err);
85 return t.Run();
86}
87
88std::vector<Token> Tokenizer::Run() {
[email protected]f38ddec2013-08-15 23:59:1189 DCHECK(tokens_.empty());
[email protected]96ea63d2013-07-30 10:17:0790 while (!done()) {
91 AdvanceToNextToken();
92 if (done())
93 break;
94 Location location = GetCurrentLocation();
95
96 Token::Type type = ClassifyCurrent();
97 if (type == Token::INVALID) {
98 *err_ = GetErrorForInvalidToken(location);
99 break;
100 }
101 size_t token_begin = cur_;
102 AdvanceToEndOfToken(location, type);
103 if (has_error())
104 break;
105 size_t token_end = cur_;
106
[email protected]f38ddec2013-08-15 23:59:11107 base::StringPiece token_value(&input_.data()[token_begin],
108 token_end - token_begin);
109
110 if (type == Token::UNCLASSIFIED_OPERATOR)
111 type = GetSpecificOperatorType(token_value);
112 if (type == Token::IDENTIFIER) {
113 if (token_value == "if")
114 type = Token::IF;
[email protected]ed7d2be22013-08-20 17:23:15115 else if (token_value == "else")
[email protected]f38ddec2013-08-15 23:59:11116 type = Token::ELSE;
[email protected]ed7d2be22013-08-20 17:23:15117 else if (token_value == "true")
118 type = Token::TRUE_TOKEN;
119 else if (token_value == "false")
120 type = Token::FALSE_TOKEN;
[email protected]f38ddec2013-08-15 23:59:11121 }
122
[email protected]96ea63d2013-07-30 10:17:07123 // TODO(brettw) This just strips comments from the token stream. This
124 // is probably wrong, they should be removed at a later stage so we can
125 // do things like rewrite the file. But this makes the parser simpler and
126 // is OK for now.
[email protected]f38ddec2013-08-15 23:59:11127 if (type != Token::COMMENT)
128 tokens_.push_back(Token(location, type, token_value));
[email protected]96ea63d2013-07-30 10:17:07129 }
130 if (err_->has_error())
[email protected]f38ddec2013-08-15 23:59:11131 tokens_.clear();
132 return tokens_;
[email protected]96ea63d2013-07-30 10:17:07133}
134
135// static
136size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
137 int cur_line = 1;
138 size_t cur_byte = 0;
139
140 DCHECK(n > 0);
141
142 if (n == 1)
143 return 0;
144
145 while (cur_byte < buf.size()) {
146 if (IsNewline(buf, cur_byte)) {
147 cur_line++;
148 if (cur_line == n)
149 return cur_byte + 1;
150 }
151 cur_byte++;
152 }
153 return -1;
154}
155
156// static
157bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
158 DCHECK(offset < buffer.size());
159 // We may need more logic here to handle different line ending styles.
160 return buffer[offset] == '\n';
161}
162
163
164void Tokenizer::AdvanceToNextToken() {
165 while (!at_end() && IsCurrentWhitespace())
166 Advance();
167}
168
169Token::Type Tokenizer::ClassifyCurrent() const {
170 DCHECK(!at_end());
171 char next_char = cur_char();
[email protected]8ae5d422014-05-13 20:17:42172 if (IsAsciiDigit(next_char))
[email protected]96ea63d2013-07-30 10:17:07173 return Token::INTEGER;
174 if (next_char == '"')
175 return Token::STRING;
176
177 // Note: '-' handled specially below.
178 if (next_char != '-' && CouldBeOperator(next_char))
[email protected]f38ddec2013-08-15 23:59:11179 return Token::UNCLASSIFIED_OPERATOR;
[email protected]96ea63d2013-07-30 10:17:07180
181 if (IsIdentifierFirstChar(next_char))
182 return Token::IDENTIFIER;
183
[email protected]f38ddec2013-08-15 23:59:11184 if (next_char == '[')
185 return Token::LEFT_BRACKET;
186 if (next_char == ']')
187 return Token::RIGHT_BRACKET;
188 if (next_char == '(')
189 return Token::LEFT_PAREN;
190 if (next_char == ')')
191 return Token::RIGHT_PAREN;
192 if (next_char == '{')
193 return Token::LEFT_BRACE;
194 if (next_char == '}')
195 return Token::RIGHT_BRACE;
[email protected]96ea63d2013-07-30 10:17:07196
[email protected]51d01722014-03-26 16:57:07197 if (next_char == '.')
198 return Token::DOT;
[email protected]f38ddec2013-08-15 23:59:11199 if (next_char == ',')
200 return Token::COMMA;
[email protected]96ea63d2013-07-30 10:17:07201
202 if (next_char == '#')
203 return Token::COMMENT;
204
205 // For the case of '-' differentiate between a negative number and anything
206 // else.
207 if (next_char == '-') {
208 if (!CanIncrement())
[email protected]f38ddec2013-08-15 23:59:11209 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
210 // file.
[email protected]96ea63d2013-07-30 10:17:07211 char following_char = input_[cur_ + 1];
[email protected]8ae5d422014-05-13 20:17:42212 if (IsAsciiDigit(following_char))
[email protected]96ea63d2013-07-30 10:17:07213 return Token::INTEGER;
[email protected]f38ddec2013-08-15 23:59:11214 return Token::UNCLASSIFIED_OPERATOR;
[email protected]96ea63d2013-07-30 10:17:07215 }
216
217 return Token::INVALID;
218}
219
220void Tokenizer::AdvanceToEndOfToken(const Location& location,
221 Token::Type type) {
222 switch (type) {
223 case Token::INTEGER:
224 do {
225 Advance();
[email protected]8ae5d422014-05-13 20:17:42226 } while (!at_end() && IsAsciiDigit(cur_char()));
[email protected]96ea63d2013-07-30 10:17:07227 if (!at_end()) {
228 // Require the char after a number to be some kind of space, scope,
229 // or operator.
230 char c = cur_char();
231 if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
[email protected]f38ddec2013-08-15 23:59:11232 !IsScoperChar(c) && c != ',') {
[email protected]96ea63d2013-07-30 10:17:07233 *err_ = Err(GetCurrentLocation(),
[email protected]df15e82d2014-05-15 19:41:58234 "This is not a valid number.",
235 "Learn to count.");
[email protected]96ea63d2013-07-30 10:17:07236 // Highlight the number.
237 err_->AppendRange(LocationRange(location, GetCurrentLocation()));
238 }
239 }
240 break;
241
242 case Token::STRING: {
243 char initial = cur_char();
244 Advance(); // Advance past initial "
245 for (;;) {
246 if (at_end()) {
[email protected]df15e82d2014-05-15 19:41:58247 *err_ = Err(LocationRange(location, GetCurrentLocation()),
248 "Unterminated string literal.",
249 "Don't leave me hanging like this!");
[email protected]96ea63d2013-07-30 10:17:07250 break;
251 }
252 if (IsCurrentStringTerminator(initial)) {
253 Advance(); // Skip past last "
254 break;
255 } else if (cur_char() == '\n') {
[email protected]df15e82d2014-05-15 19:41:58256 *err_ = Err(LocationRange(location, GetCurrentLocation()),
257 "Newline in string constant.");
[email protected]96ea63d2013-07-30 10:17:07258 }
259 Advance();
260 }
261 break;
262 }
263
[email protected]f38ddec2013-08-15 23:59:11264 case Token::UNCLASSIFIED_OPERATOR:
[email protected]96ea63d2013-07-30 10:17:07265 // Some operators are two characters, some are one.
266 if (CouldBeTwoCharOperatorBegin(cur_char())) {
267 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
268 Advance();
269 }
270 Advance();
271 break;
272
273 case Token::IDENTIFIER:
274 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
275 Advance();
276 break;
277
[email protected]f38ddec2013-08-15 23:59:11278 case Token::LEFT_BRACKET:
279 case Token::RIGHT_BRACKET:
280 case Token::LEFT_BRACE:
281 case Token::RIGHT_BRACE:
282 case Token::LEFT_PAREN:
283 case Token::RIGHT_PAREN:
[email protected]51d01722014-03-26 16:57:07284 case Token::DOT:
[email protected]f38ddec2013-08-15 23:59:11285 case Token::COMMA:
[email protected]96ea63d2013-07-30 10:17:07286 Advance(); // All are one char.
287 break;
288
289 case Token::COMMENT:
290 // Eat to EOL.
291 while (!at_end() && !IsCurrentNewline())
292 Advance();
293 break;
294
295 case Token::INVALID:
[email protected]6ac871ea2013-08-19 21:04:50296 default:
[email protected]96ea63d2013-07-30 10:17:07297 *err_ = Err(location, "Everything is all messed up",
298 "Please insert system disk in drive A: and press any key.");
299 NOTREACHED();
300 return;
301 }
302}
303
304bool Tokenizer::IsCurrentWhitespace() const {
305 DCHECK(!at_end());
306 char c = input_[cur_];
307 // Note that tab (0x09) is illegal.
308 return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
309}
310
311bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
312 DCHECK(!at_end());
313 if (cur_char() != quote_char)
314 return false;
315
316 // Check for escaping. \" is not a string terminator, but \\" is. Count
317 // the number of preceeding backslashes.
318 int num_backslashes = 0;
319 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
320 num_backslashes++;
321
322 // Even backslashes mean that they were escaping each other and don't count
323 // as escaping this quote.
324 return (num_backslashes % 2) == 0;
325}
326
327bool Tokenizer::IsCurrentNewline() const {
328 return IsNewline(input_, cur_);
329}
330
331void Tokenizer::Advance() {
332 DCHECK(cur_ < input_.size());
333 if (IsCurrentNewline()) {
[email protected]96ea63d2013-07-30 10:17:07334 line_number_++;
335 char_in_line_ = 1;
336 } else {
337 char_in_line_++;
338 }
339 cur_++;
340}
341
342Location Tokenizer::GetCurrentLocation() const {
343 return Location(input_file_, line_number_, char_in_line_);
344}
345
346Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
347 std::string help;
348 if (cur_char() == ';') {
349 // Semicolon.
350 help = "Semicolons are not needed, delete this one.";
351 } else if (cur_char() == '\t') {
352 // Tab.
353 help = "You got a tab character in here. Tabs are evil. "
354 "Convert to spaces.";
355 } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
356 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
357 // Different types of comments.
358 help = "Comments should start with # instead";
359 } else {
360 help = "I have no idea what this is.";
361 }
362
363 return Err(location, "Invalid token.", help);
364}