blob: 6051e1b5bb9a4e51836c6bce77c1826b7fc8db39 [file] [log] [blame]
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#include "tools/gn/tokenizer.h"
6
7#include "base/logging.h"
8#include "tools/gn/input_file.h"
9
10namespace {
11
// Returns true when |c| is an ASCII decimal digit ('0' through '9').
bool IsNumberChar(char c) {
  if (c < '0')
    return false;
  return c <= '9';
}
15
// Returns true when |c| may be the first character of a two-character
// operator token.
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
20
// Returns true when |c| may be the second character of a two-character
// operator token.
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
24
// Returns true when |c| on its own may form a single-character operator
// token.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
29
30bool CouldBeOperator(char c) {
31 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
32}
33
// Returns true when |c| is one of the bracketing characters: parentheses,
// square brackets, or curly braces.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
37
[email protected]f38ddec2013-08-15 23:59:1138Token::Type GetSpecificOperatorType(base::StringPiece value) {
39 if (value == "=")
40 return Token::EQUAL;
41 if (value == "+")
42 return Token::PLUS;
43 if (value == "-")
44 return Token::MINUS;
45 if (value == "+=")
46 return Token::PLUS_EQUALS;
47 if (value == "-=")
48 return Token::MINUS_EQUALS;
49 if (value == "==")
50 return Token::EQUAL_EQUAL;
51 if (value == "!=")
52 return Token::NOT_EQUAL;
53 if (value == "<=")
54 return Token::LESS_EQUAL;
55 if (value == ">=")
56 return Token::GREATER_EQUAL;
57 if (value == "<")
58 return Token::LESS_THAN;
59 if (value == ">")
60 return Token::GREATER_THAN;
61 if (value == "&&")
62 return Token::BOOLEAN_AND;
63 if (value == "||")
64 return Token::BOOLEAN_OR;
65 if (value == "!")
66 return Token::BANG;
67 NOTREACHED();
68 return Token::INVALID;
69}
70
[email protected]96ea63d2013-07-30 10:17:0771} // namespace
72
73Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
74 : input_file_(input_file),
75 input_(input_file->contents()),
76 err_(err),
77 cur_(0),
78 line_number_(1),
79 char_in_line_(1) {
80}
81
82Tokenizer::~Tokenizer() {
83}
84
85// static
86std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
87 Tokenizer t(input_file, err);
88 return t.Run();
89}
90
91std::vector<Token> Tokenizer::Run() {
[email protected]f38ddec2013-08-15 23:59:1192 DCHECK(tokens_.empty());
[email protected]96ea63d2013-07-30 10:17:0793 while (!done()) {
94 AdvanceToNextToken();
95 if (done())
96 break;
97 Location location = GetCurrentLocation();
98
99 Token::Type type = ClassifyCurrent();
100 if (type == Token::INVALID) {
101 *err_ = GetErrorForInvalidToken(location);
102 break;
103 }
104 size_t token_begin = cur_;
105 AdvanceToEndOfToken(location, type);
106 if (has_error())
107 break;
108 size_t token_end = cur_;
109
[email protected]f38ddec2013-08-15 23:59:11110 base::StringPiece token_value(&input_.data()[token_begin],
111 token_end - token_begin);
112
113 if (type == Token::UNCLASSIFIED_OPERATOR)
114 type = GetSpecificOperatorType(token_value);
115 if (type == Token::IDENTIFIER) {
116 if (token_value == "if")
117 type = Token::IF;
118 if (token_value == "else")
119 type = Token::ELSE;
120 }
121
[email protected]96ea63d2013-07-30 10:17:07122 // TODO(brettw) This just strips comments from the token stream. This
123 // is probably wrong, they should be removed at a later stage so we can
124 // do things like rewrite the file. But this makes the parser simpler and
125 // is OK for now.
[email protected]f38ddec2013-08-15 23:59:11126 if (type != Token::COMMENT)
127 tokens_.push_back(Token(location, type, token_value));
[email protected]96ea63d2013-07-30 10:17:07128 }
129 if (err_->has_error())
[email protected]f38ddec2013-08-15 23:59:11130 tokens_.clear();
131 return tokens_;
[email protected]96ea63d2013-07-30 10:17:07132}
133
134// static
135size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
136 int cur_line = 1;
137 size_t cur_byte = 0;
138
139 DCHECK(n > 0);
140
141 if (n == 1)
142 return 0;
143
144 while (cur_byte < buf.size()) {
145 if (IsNewline(buf, cur_byte)) {
146 cur_line++;
147 if (cur_line == n)
148 return cur_byte + 1;
149 }
150 cur_byte++;
151 }
152 return -1;
153}
154
155// static
156bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
157 DCHECK(offset < buffer.size());
158 // We may need more logic here to handle different line ending styles.
159 return buffer[offset] == '\n';
160}
161
162
163void Tokenizer::AdvanceToNextToken() {
164 while (!at_end() && IsCurrentWhitespace())
165 Advance();
166}
167
168Token::Type Tokenizer::ClassifyCurrent() const {
169 DCHECK(!at_end());
170 char next_char = cur_char();
171 if (next_char >= '0' && next_char <= '9')
172 return Token::INTEGER;
173 if (next_char == '"')
174 return Token::STRING;
175
176 // Note: '-' handled specially below.
177 if (next_char != '-' && CouldBeOperator(next_char))
[email protected]f38ddec2013-08-15 23:59:11178 return Token::UNCLASSIFIED_OPERATOR;
[email protected]96ea63d2013-07-30 10:17:07179
180 if (IsIdentifierFirstChar(next_char))
181 return Token::IDENTIFIER;
182
[email protected]f38ddec2013-08-15 23:59:11183 if (next_char == '[')
184 return Token::LEFT_BRACKET;
185 if (next_char == ']')
186 return Token::RIGHT_BRACKET;
187 if (next_char == '(')
188 return Token::LEFT_PAREN;
189 if (next_char == ')')
190 return Token::RIGHT_PAREN;
191 if (next_char == '{')
192 return Token::LEFT_BRACE;
193 if (next_char == '}')
194 return Token::RIGHT_BRACE;
[email protected]96ea63d2013-07-30 10:17:07195
[email protected]f38ddec2013-08-15 23:59:11196 if (next_char == ',')
197 return Token::COMMA;
[email protected]96ea63d2013-07-30 10:17:07198
199 if (next_char == '#')
200 return Token::COMMENT;
201
202 // For the case of '-' differentiate between a negative number and anything
203 // else.
204 if (next_char == '-') {
205 if (!CanIncrement())
[email protected]f38ddec2013-08-15 23:59:11206 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
207 // file.
[email protected]96ea63d2013-07-30 10:17:07208 char following_char = input_[cur_ + 1];
209 if (following_char >= '0' && following_char <= '9')
210 return Token::INTEGER;
[email protected]f38ddec2013-08-15 23:59:11211 return Token::UNCLASSIFIED_OPERATOR;
[email protected]96ea63d2013-07-30 10:17:07212 }
213
214 return Token::INVALID;
215}
216
217void Tokenizer::AdvanceToEndOfToken(const Location& location,
218 Token::Type type) {
219 switch (type) {
220 case Token::INTEGER:
221 do {
222 Advance();
223 } while (!at_end() && IsNumberChar(cur_char()));
224 if (!at_end()) {
225 // Require the char after a number to be some kind of space, scope,
226 // or operator.
227 char c = cur_char();
228 if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
[email protected]f38ddec2013-08-15 23:59:11229 !IsScoperChar(c) && c != ',') {
[email protected]96ea63d2013-07-30 10:17:07230 *err_ = Err(GetCurrentLocation(),
231 "This is not a valid number.",
232 "Learn to count.");
233 // Highlight the number.
234 err_->AppendRange(LocationRange(location, GetCurrentLocation()));
235 }
236 }
237 break;
238
239 case Token::STRING: {
240 char initial = cur_char();
241 Advance(); // Advance past initial "
242 for (;;) {
243 if (at_end()) {
244 *err_ = Err(LocationRange(location,
245 Location(input_file_, line_number_, char_in_line_)),
246 "Unterminated string literal.",
247 "Don't leave me hanging like this!");
248 break;
249 }
250 if (IsCurrentStringTerminator(initial)) {
251 Advance(); // Skip past last "
252 break;
253 } else if (cur_char() == '\n') {
254 *err_ = Err(LocationRange(location,
255 GetCurrentLocation()),
256 "Newline in string constant.");
257 }
258 Advance();
259 }
260 break;
261 }
262
[email protected]f38ddec2013-08-15 23:59:11263 case Token::UNCLASSIFIED_OPERATOR:
[email protected]96ea63d2013-07-30 10:17:07264 // Some operators are two characters, some are one.
265 if (CouldBeTwoCharOperatorBegin(cur_char())) {
266 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
267 Advance();
268 }
269 Advance();
270 break;
271
272 case Token::IDENTIFIER:
273 while (!at_end() && IsIdentifierContinuingChar(cur_char()))
274 Advance();
275 break;
276
[email protected]f38ddec2013-08-15 23:59:11277 case Token::LEFT_BRACKET:
278 case Token::RIGHT_BRACKET:
279 case Token::LEFT_BRACE:
280 case Token::RIGHT_BRACE:
281 case Token::LEFT_PAREN:
282 case Token::RIGHT_PAREN:
283 case Token::COMMA:
[email protected]96ea63d2013-07-30 10:17:07284 Advance(); // All are one char.
285 break;
286
287 case Token::COMMENT:
288 // Eat to EOL.
289 while (!at_end() && !IsCurrentNewline())
290 Advance();
291 break;
292
293 case Token::INVALID:
[email protected]6ac871ea2013-08-19 21:04:50294 default:
[email protected]96ea63d2013-07-30 10:17:07295 *err_ = Err(location, "Everything is all messed up",
296 "Please insert system disk in drive A: and press any key.");
297 NOTREACHED();
298 return;
299 }
300}
301
302bool Tokenizer::IsCurrentWhitespace() const {
303 DCHECK(!at_end());
304 char c = input_[cur_];
305 // Note that tab (0x09) is illegal.
306 return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
307}
308
309bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
310 DCHECK(!at_end());
311 if (cur_char() != quote_char)
312 return false;
313
314 // Check for escaping. \" is not a string terminator, but \\" is. Count
315 // the number of preceeding backslashes.
316 int num_backslashes = 0;
317 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
318 num_backslashes++;
319
320 // Even backslashes mean that they were escaping each other and don't count
321 // as escaping this quote.
322 return (num_backslashes % 2) == 0;
323}
324
325bool Tokenizer::IsCurrentNewline() const {
326 return IsNewline(input_, cur_);
327}
328
329void Tokenizer::Advance() {
330 DCHECK(cur_ < input_.size());
331 if (IsCurrentNewline()) {
[email protected]96ea63d2013-07-30 10:17:07332 line_number_++;
333 char_in_line_ = 1;
334 } else {
335 char_in_line_++;
336 }
337 cur_++;
338}
339
340Location Tokenizer::GetCurrentLocation() const {
341 return Location(input_file_, line_number_, char_in_line_);
342}
343
344Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
345 std::string help;
346 if (cur_char() == ';') {
347 // Semicolon.
348 help = "Semicolons are not needed, delete this one.";
349 } else if (cur_char() == '\t') {
350 // Tab.
351 help = "You got a tab character in here. Tabs are evil. "
352 "Convert to spaces.";
353 } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
354 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
355 // Different types of comments.
356 help = "Comments should start with # instead";
357 } else {
358 help = "I have no idea what this is.";
359 }
360
361 return Err(location, "Invalid token.", help);
362}