[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 1 | // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "tools/gn/tokenizer.h" |
| 6 | |
| 7 | #include "base/logging.h" |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 8 | #include "base/strings/string_util.h" |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 9 | #include "tools/gn/input_file.h" |
| 10 | |
| 11 | namespace { |
| 12 | |
// Returns true if |c| may be the first character of a two-character operator
// (for example the '<' of "<=" or the '&' of "&&").
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
| 17 | |
// Returns true if |c| may be the second character of a two-character
// operator (the '=' of "+=", the '|' of "||", the '&' of "&&").
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
| 21 | |
// Returns true if |c| is a character that is scanned as an operator when it
// appears on its own.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
| 26 | |
| 27 | bool CouldBeOperator(char c) { |
| 28 | return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c); |
| 29 | } |
| 30 | |
// Returns true if |c| is an opening or closing parenthesis, square bracket,
// or curly brace.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
| 34 | |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 35 | Token::Type GetSpecificOperatorType(base::StringPiece value) { |
| 36 | if (value == "=") |
| 37 | return Token::EQUAL; |
| 38 | if (value == "+") |
| 39 | return Token::PLUS; |
| 40 | if (value == "-") |
| 41 | return Token::MINUS; |
| 42 | if (value == "+=") |
| 43 | return Token::PLUS_EQUALS; |
| 44 | if (value == "-=") |
| 45 | return Token::MINUS_EQUALS; |
| 46 | if (value == "==") |
| 47 | return Token::EQUAL_EQUAL; |
| 48 | if (value == "!=") |
| 49 | return Token::NOT_EQUAL; |
| 50 | if (value == "<=") |
| 51 | return Token::LESS_EQUAL; |
| 52 | if (value == ">=") |
| 53 | return Token::GREATER_EQUAL; |
| 54 | if (value == "<") |
| 55 | return Token::LESS_THAN; |
| 56 | if (value == ">") |
| 57 | return Token::GREATER_THAN; |
| 58 | if (value == "&&") |
| 59 | return Token::BOOLEAN_AND; |
| 60 | if (value == "||") |
| 61 | return Token::BOOLEAN_OR; |
| 62 | if (value == "!") |
| 63 | return Token::BANG; |
[email protected] | 51d0172 | 2014-03-26 16:57:07 | [diff] [blame] | 64 | if (value == ".") |
| 65 | return Token::DOT; |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 66 | return Token::INVALID; |
| 67 | } |
| 68 | |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 69 | } // namespace |
| 70 | |
| 71 | Tokenizer::Tokenizer(const InputFile* input_file, Err* err) |
| 72 | : input_file_(input_file), |
| 73 | input_(input_file->contents()), |
| 74 | err_(err), |
| 75 | cur_(0), |
| 76 | line_number_(1), |
| 77 | char_in_line_(1) { |
| 78 | } |
| 79 | |
| 80 | Tokenizer::~Tokenizer() { |
| 81 | } |
| 82 | |
| 83 | // static |
| 84 | std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) { |
| 85 | Tokenizer t(input_file, err); |
| 86 | return t.Run(); |
| 87 | } |
| 88 | |
| 89 | std::vector<Token> Tokenizer::Run() { |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 90 | DCHECK(tokens_.empty()); |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 91 | while (!done()) { |
| 92 | AdvanceToNextToken(); |
| 93 | if (done()) |
| 94 | break; |
| 95 | Location location = GetCurrentLocation(); |
| 96 | |
| 97 | Token::Type type = ClassifyCurrent(); |
| 98 | if (type == Token::INVALID) { |
| 99 | *err_ = GetErrorForInvalidToken(location); |
| 100 | break; |
| 101 | } |
| 102 | size_t token_begin = cur_; |
| 103 | AdvanceToEndOfToken(location, type); |
| 104 | if (has_error()) |
| 105 | break; |
| 106 | size_t token_end = cur_; |
| 107 | |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 108 | base::StringPiece token_value(&input_.data()[token_begin], |
| 109 | token_end - token_begin); |
| 110 | |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 111 | if (type == Token::UNCLASSIFIED_OPERATOR) { |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 112 | type = GetSpecificOperatorType(token_value); |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 113 | } else if (type == Token::IDENTIFIER) { |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 114 | if (token_value == "if") |
| 115 | type = Token::IF; |
[email protected] | ed7d2be2 | 2013-08-20 17:23:15 | [diff] [blame] | 116 | else if (token_value == "else") |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 117 | type = Token::ELSE; |
[email protected] | ed7d2be2 | 2013-08-20 17:23:15 | [diff] [blame] | 118 | else if (token_value == "true") |
| 119 | type = Token::TRUE_TOKEN; |
| 120 | else if (token_value == "false") |
| 121 | type = Token::FALSE_TOKEN; |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 122 | } else if (type == Token::UNCLASSIFIED_COMMENT) { |
scottmg | 7b80f17 | 2014-09-24 03:35:13 | [diff] [blame^] | 123 | if (AtStartOfLine(token_begin) && |
| 124 | // If it's a standalone comment, but is a continuation of a comment on |
| 125 | // a previous line, then instead make it a continued suffix comment. |
| 126 | (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT || |
| 127 | tokens_.back().location().line_number() + 1 != |
| 128 | location.line_number() || |
| 129 | tokens_.back().location().char_offset() != location.char_offset())) { |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 130 | type = Token::LINE_COMMENT; |
scottmg | 7b80f17 | 2014-09-24 03:35:13 | [diff] [blame^] | 131 | } else { |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 132 | type = Token::SUFFIX_COMMENT; |
scottmg | 7b80f17 | 2014-09-24 03:35:13 | [diff] [blame^] | 133 | } |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 134 | } |
| 135 | |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 136 | tokens_.push_back(Token(location, type, token_value)); |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 137 | } |
| 138 | if (err_->has_error()) |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 139 | tokens_.clear(); |
| 140 | return tokens_; |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 141 | } |
| 142 | |
| 143 | // static |
| 144 | size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { |
[email protected] | 481c3e8 | 2014-07-18 01:40:47 | [diff] [blame] | 145 | DCHECK_GT(n, 0); |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 146 | |
| 147 | if (n == 1) |
| 148 | return 0; |
| 149 | |
[email protected] | 481c3e8 | 2014-07-18 01:40:47 | [diff] [blame] | 150 | int cur_line = 1; |
| 151 | size_t cur_byte = 0; |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 152 | while (cur_byte < buf.size()) { |
| 153 | if (IsNewline(buf, cur_byte)) { |
| 154 | cur_line++; |
| 155 | if (cur_line == n) |
| 156 | return cur_byte + 1; |
| 157 | } |
| 158 | cur_byte++; |
| 159 | } |
[email protected] | 481c3e8 | 2014-07-18 01:40:47 | [diff] [blame] | 160 | return static_cast<size_t>(-1); |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 161 | } |
| 162 | |
| 163 | // static |
| 164 | bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) { |
| 165 | DCHECK(offset < buffer.size()); |
| 166 | // We may need more logic here to handle different line ending styles. |
| 167 | return buffer[offset] == '\n'; |
| 168 | } |
| 169 | |
| 170 | |
| 171 | void Tokenizer::AdvanceToNextToken() { |
| 172 | while (!at_end() && IsCurrentWhitespace()) |
| 173 | Advance(); |
| 174 | } |
| 175 | |
| 176 | Token::Type Tokenizer::ClassifyCurrent() const { |
| 177 | DCHECK(!at_end()); |
| 178 | char next_char = cur_char(); |
[email protected] | 8ae5d42 | 2014-05-13 20:17:42 | [diff] [blame] | 179 | if (IsAsciiDigit(next_char)) |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 180 | return Token::INTEGER; |
| 181 | if (next_char == '"') |
| 182 | return Token::STRING; |
| 183 | |
| 184 | // Note: '-' handled specially below. |
| 185 | if (next_char != '-' && CouldBeOperator(next_char)) |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 186 | return Token::UNCLASSIFIED_OPERATOR; |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 187 | |
| 188 | if (IsIdentifierFirstChar(next_char)) |
| 189 | return Token::IDENTIFIER; |
| 190 | |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 191 | if (next_char == '[') |
| 192 | return Token::LEFT_BRACKET; |
| 193 | if (next_char == ']') |
| 194 | return Token::RIGHT_BRACKET; |
| 195 | if (next_char == '(') |
| 196 | return Token::LEFT_PAREN; |
| 197 | if (next_char == ')') |
| 198 | return Token::RIGHT_PAREN; |
| 199 | if (next_char == '{') |
| 200 | return Token::LEFT_BRACE; |
| 201 | if (next_char == '}') |
| 202 | return Token::RIGHT_BRACE; |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 203 | |
[email protected] | 51d0172 | 2014-03-26 16:57:07 | [diff] [blame] | 204 | if (next_char == '.') |
| 205 | return Token::DOT; |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 206 | if (next_char == ',') |
| 207 | return Token::COMMA; |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 208 | |
| 209 | if (next_char == '#') |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 210 | return Token::UNCLASSIFIED_COMMENT; |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 211 | |
| 212 | // For the case of '-' differentiate between a negative number and anything |
| 213 | // else. |
| 214 | if (next_char == '-') { |
| 215 | if (!CanIncrement()) |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 216 | return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of |
| 217 | // file. |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 218 | char following_char = input_[cur_ + 1]; |
[email protected] | 8ae5d42 | 2014-05-13 20:17:42 | [diff] [blame] | 219 | if (IsAsciiDigit(following_char)) |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 220 | return Token::INTEGER; |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 221 | return Token::UNCLASSIFIED_OPERATOR; |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 222 | } |
| 223 | |
| 224 | return Token::INVALID; |
| 225 | } |
| 226 | |
| 227 | void Tokenizer::AdvanceToEndOfToken(const Location& location, |
| 228 | Token::Type type) { |
| 229 | switch (type) { |
| 230 | case Token::INTEGER: |
| 231 | do { |
| 232 | Advance(); |
[email protected] | 8ae5d42 | 2014-05-13 20:17:42 | [diff] [blame] | 233 | } while (!at_end() && IsAsciiDigit(cur_char())); |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 234 | if (!at_end()) { |
| 235 | // Require the char after a number to be some kind of space, scope, |
| 236 | // or operator. |
| 237 | char c = cur_char(); |
| 238 | if (!IsCurrentWhitespace() && !CouldBeOperator(c) && |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 239 | !IsScoperChar(c) && c != ',') { |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 240 | *err_ = Err(GetCurrentLocation(), |
[email protected] | df15e82d | 2014-05-15 19:41:58 | [diff] [blame] | 241 | "This is not a valid number.", |
| 242 | "Learn to count."); |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 243 | // Highlight the number. |
| 244 | err_->AppendRange(LocationRange(location, GetCurrentLocation())); |
| 245 | } |
| 246 | } |
| 247 | break; |
| 248 | |
| 249 | case Token::STRING: { |
| 250 | char initial = cur_char(); |
| 251 | Advance(); // Advance past initial " |
| 252 | for (;;) { |
| 253 | if (at_end()) { |
[email protected] | df15e82d | 2014-05-15 19:41:58 | [diff] [blame] | 254 | *err_ = Err(LocationRange(location, GetCurrentLocation()), |
| 255 | "Unterminated string literal.", |
| 256 | "Don't leave me hanging like this!"); |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 257 | break; |
| 258 | } |
| 259 | if (IsCurrentStringTerminator(initial)) { |
| 260 | Advance(); // Skip past last " |
| 261 | break; |
| 262 | } else if (cur_char() == '\n') { |
[email protected] | df15e82d | 2014-05-15 19:41:58 | [diff] [blame] | 263 | *err_ = Err(LocationRange(location, GetCurrentLocation()), |
| 264 | "Newline in string constant."); |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 265 | } |
| 266 | Advance(); |
| 267 | } |
| 268 | break; |
| 269 | } |
| 270 | |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 271 | case Token::UNCLASSIFIED_OPERATOR: |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 272 | // Some operators are two characters, some are one. |
| 273 | if (CouldBeTwoCharOperatorBegin(cur_char())) { |
| 274 | if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1])) |
| 275 | Advance(); |
| 276 | } |
| 277 | Advance(); |
| 278 | break; |
| 279 | |
| 280 | case Token::IDENTIFIER: |
| 281 | while (!at_end() && IsIdentifierContinuingChar(cur_char())) |
| 282 | Advance(); |
| 283 | break; |
| 284 | |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 285 | case Token::LEFT_BRACKET: |
| 286 | case Token::RIGHT_BRACKET: |
| 287 | case Token::LEFT_BRACE: |
| 288 | case Token::RIGHT_BRACE: |
| 289 | case Token::LEFT_PAREN: |
| 290 | case Token::RIGHT_PAREN: |
[email protected] | 51d0172 | 2014-03-26 16:57:07 | [diff] [blame] | 291 | case Token::DOT: |
[email protected] | f38ddec | 2013-08-15 23:59:11 | [diff] [blame] | 292 | case Token::COMMA: |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 293 | Advance(); // All are one char. |
| 294 | break; |
| 295 | |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 296 | case Token::UNCLASSIFIED_COMMENT: |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 297 | // Eat to EOL. |
| 298 | while (!at_end() && !IsCurrentNewline()) |
| 299 | Advance(); |
| 300 | break; |
| 301 | |
| 302 | case Token::INVALID: |
[email protected] | 6ac871ea | 2013-08-19 21:04:50 | [diff] [blame] | 303 | default: |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 304 | *err_ = Err(location, "Everything is all messed up", |
| 305 | "Please insert system disk in drive A: and press any key."); |
| 306 | NOTREACHED(); |
| 307 | return; |
| 308 | } |
| 309 | } |
| 310 | |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 311 | bool Tokenizer::AtStartOfLine(size_t location) const { |
| 312 | while (location > 0) { |
| 313 | --location; |
| 314 | char c = input_[location]; |
| 315 | if (c == '\n') |
| 316 | return true; |
| 317 | if (c != ' ') |
| 318 | return false; |
| 319 | } |
| 320 | return true; |
| 321 | } |
| 322 | |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 323 | bool Tokenizer::IsCurrentWhitespace() const { |
| 324 | DCHECK(!at_end()); |
| 325 | char c = input_[cur_]; |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 326 | // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal. |
| 327 | return c == 0x0A || c == 0x0D || c == 0x20; |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 328 | } |
| 329 | |
| 330 | bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { |
| 331 | DCHECK(!at_end()); |
| 332 | if (cur_char() != quote_char) |
| 333 | return false; |
| 334 | |
| 335 | // Check for escaping. \" is not a string terminator, but \\" is. Count |
| 336 | // the number of preceeding backslashes. |
| 337 | int num_backslashes = 0; |
| 338 | for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) |
| 339 | num_backslashes++; |
| 340 | |
| 341 | // Even backslashes mean that they were escaping each other and don't count |
| 342 | // as escaping this quote. |
| 343 | return (num_backslashes % 2) == 0; |
| 344 | } |
| 345 | |
| 346 | bool Tokenizer::IsCurrentNewline() const { |
| 347 | return IsNewline(input_, cur_); |
| 348 | } |
| 349 | |
| 350 | void Tokenizer::Advance() { |
| 351 | DCHECK(cur_ < input_.size()); |
| 352 | if (IsCurrentNewline()) { |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 353 | line_number_++; |
| 354 | char_in_line_ = 1; |
| 355 | } else { |
| 356 | char_in_line_++; |
| 357 | } |
| 358 | cur_++; |
| 359 | } |
| 360 | |
| 361 | Location Tokenizer::GetCurrentLocation() const { |
scottmg | be5d451 | 2014-09-24 02:29:12 | [diff] [blame] | 362 | return Location( |
| 363 | input_file_, line_number_, char_in_line_, static_cast<int>(cur_)); |
[email protected] | 96ea63d | 2013-07-30 10:17:07 | [diff] [blame] | 364 | } |
| 365 | |
| 366 | Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { |
| 367 | std::string help; |
| 368 | if (cur_char() == ';') { |
| 369 | // Semicolon. |
| 370 | help = "Semicolons are not needed, delete this one."; |
| 371 | } else if (cur_char() == '\t') { |
| 372 | // Tab. |
| 373 | help = "You got a tab character in here. Tabs are evil. " |
| 374 | "Convert to spaces."; |
| 375 | } else if (cur_char() == '/' && cur_ + 1 < input_.size() && |
| 376 | (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { |
| 377 | // Different types of comments. |
| 378 | help = "Comments should start with # instead"; |
| 379 | } else { |
| 380 | help = "I have no idea what this is."; |
| 381 | } |
| 382 | |
| 383 | return Err(location, "Invalid token.", help); |
| 384 | } |