Blame - tools/gn/tokenizer.cc - chromium/src

blob: 6051e1b5bb9a4e51836c6bce77c1826b7fc8db39 [file] [log] [blame]

[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	1	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "tools/gn/tokenizer.h"
				6
				7	#include "base/logging.h"
				8	#include "tools/gn/input_file.h"
				9
				10	namespace {
				11
				12	bool IsNumberChar(char c) {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	13	return c >= '0' && c <= '9';
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	14	}
				15
				16	bool CouldBeTwoCharOperatorBegin(char c) {
				17	return c == '<' \|\| c == '>' \|\| c == '!' \|\| c == '=' \|\| c == '-' \|\|
				18	c == '+' \|\| c == '\|' \|\| c == '&';
				19	}
				20
				21	bool CouldBeTwoCharOperatorEnd(char c) {
				22	return c == '=' \|\| c == '\|' \|\| c == '&';
				23	}
				24
				25	bool CouldBeOneCharOperator(char c) {
				26	return c == '=' \|\| c == '<' \|\| c == '>' \|\| c == '+' \|\| c == '!' \|\|
				27	c == ':' \|\| c == '\|' \|\| c == '&' \|\| c == '-';
				28	}
				29
				30	bool CouldBeOperator(char c) {
				31	return CouldBeOneCharOperator(c) \|\| CouldBeTwoCharOperatorBegin(c);
				32	}
				33
// Returns true when |c| is an opening or closing parenthesis, square bracket,
// or curly brace.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
				37
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	38	Token::Type GetSpecificOperatorType(base::StringPiece value) {
				39	if (value == "=")
				40	return Token::EQUAL;
				41	if (value == "+")
				42	return Token::PLUS;
				43	if (value == "-")
				44	return Token::MINUS;
				45	if (value == "+=")
				46	return Token::PLUS_EQUALS;
				47	if (value == "-=")
				48	return Token::MINUS_EQUALS;
				49	if (value == "==")
				50	return Token::EQUAL_EQUAL;
				51	if (value == "!=")
				52	return Token::NOT_EQUAL;
				53	if (value == "<=")
				54	return Token::LESS_EQUAL;
				55	if (value == ">=")
				56	return Token::GREATER_EQUAL;
				57	if (value == "<")
				58	return Token::LESS_THAN;
				59	if (value == ">")
				60	return Token::GREATER_THAN;
				61	if (value == "&&")
				62	return Token::BOOLEAN_AND;
				63	if (value == "\|\|")
				64	return Token::BOOLEAN_OR;
				65	if (value == "!")
				66	return Token::BANG;
				67	NOTREACHED();
				68	return Token::INVALID;
				69	}
				70
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	71	} // namespace
				72
				73	Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
				74	: input_file_(input_file),
				75	input_(input_file->contents()),
				76	err_(err),
				77	cur_(0),
				78	line_number_(1),
				79	char_in_line_(1) {
				80	}
				81
				82	Tokenizer::~Tokenizer() {
				83	}
				84
				85	// static
				86	std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
				87	Tokenizer t(input_file, err);
				88	return t.Run();
				89	}
				90
				91	std::vector<Token> Tokenizer::Run() {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	92	DCHECK(tokens_.empty());
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	93	while (!done()) {
				94	AdvanceToNextToken();
				95	if (done())
				96	break;
				97	Location location = GetCurrentLocation();
				98
				99	Token::Type type = ClassifyCurrent();
				100	if (type == Token::INVALID) {
				101	*err_ = GetErrorForInvalidToken(location);
				102	break;
				103	}
				104	size_t token_begin = cur_;
				105	AdvanceToEndOfToken(location, type);
				106	if (has_error())
				107	break;
				108	size_t token_end = cur_;
				109
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	110	base::StringPiece token_value(&input_.data()[token_begin],
				111	token_end - token_begin);
				112
				113	if (type == Token::UNCLASSIFIED_OPERATOR)
				114	type = GetSpecificOperatorType(token_value);
				115	if (type == Token::IDENTIFIER) {
				116	if (token_value == "if")
				117	type = Token::IF;
				118	if (token_value == "else")
				119	type = Token::ELSE;
				120	}
				121
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	122	// TODO(brettw) This just strips comments from the token stream. This
				123	// is probably wrong, they should be removed at a later stage so we can
				124	// do things like rewrite the file. But this makes the parser simpler and
				125	// is OK for now.
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	126	if (type != Token::COMMENT)
				127	tokens_.push_back(Token(location, type, token_value));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	128	}
				129	if (err_->has_error())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	130	tokens_.clear();
				131	return tokens_;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	132	}
				133
				134	// static
				135	size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
				136	int cur_line = 1;
				137	size_t cur_byte = 0;
				138
				139	DCHECK(n > 0);
				140
				141	if (n == 1)
				142	return 0;
				143
				144	while (cur_byte < buf.size()) {
				145	if (IsNewline(buf, cur_byte)) {
				146	cur_line++;
				147	if (cur_line == n)
				148	return cur_byte + 1;
				149	}
				150	cur_byte++;
				151	}
				152	return -1;
				153	}
				154
				155	// static
				156	bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
				157	DCHECK(offset < buffer.size());
				158	// We may need more logic here to handle different line ending styles.
				159	return buffer[offset] == '\n';
				160	}
				161
				162
				163	void Tokenizer::AdvanceToNextToken() {
				164	while (!at_end() && IsCurrentWhitespace())
				165	Advance();
				166	}
				167
				168	Token::Type Tokenizer::ClassifyCurrent() const {
				169	DCHECK(!at_end());
				170	char next_char = cur_char();
				171	if (next_char >= '0' && next_char <= '9')
				172	return Token::INTEGER;
				173	if (next_char == '"')
				174	return Token::STRING;
				175
				176	// Note: '-' handled specially below.
				177	if (next_char != '-' && CouldBeOperator(next_char))
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	178	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	179
				180	if (IsIdentifierFirstChar(next_char))
				181	return Token::IDENTIFIER;
				182
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	183	if (next_char == '[')
				184	return Token::LEFT_BRACKET;
				185	if (next_char == ']')
				186	return Token::RIGHT_BRACKET;
				187	if (next_char == '(')
				188	return Token::LEFT_PAREN;
				189	if (next_char == ')')
				190	return Token::RIGHT_PAREN;
				191	if (next_char == '{')
				192	return Token::LEFT_BRACE;
				193	if (next_char == '}')
				194	return Token::RIGHT_BRACE;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	195
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	196	if (next_char == ',')
				197	return Token::COMMA;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	198
				199	if (next_char == '#')
				200	return Token::COMMENT;
				201
				202	// For the case of '-' differentiate between a negative number and anything
				203	// else.
				204	if (next_char == '-') {
				205	if (!CanIncrement())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	206	return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
				207	// file.
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	208	char following_char = input_[cur_ + 1];
				209	if (following_char >= '0' && following_char <= '9')
				210	return Token::INTEGER;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	211	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	212	}
				213
				214	return Token::INVALID;
				215	}
				216
				217	void Tokenizer::AdvanceToEndOfToken(const Location& location,
				218	Token::Type type) {
				219	switch (type) {
				220	case Token::INTEGER:
				221	do {
				222	Advance();
				223	} while (!at_end() && IsNumberChar(cur_char()));
				224	if (!at_end()) {
				225	// Require the char after a number to be some kind of space, scope,
				226	// or operator.
				227	char c = cur_char();
				228	if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	229	!IsScoperChar(c) && c != ',') {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	230	*err_ = Err(GetCurrentLocation(),
				231	"This is not a valid number.",
				232	"Learn to count.");
				233	// Highlight the number.
				234	err_->AppendRange(LocationRange(location, GetCurrentLocation()));
				235	}
				236	}
				237	break;
				238
				239	case Token::STRING: {
				240	char initial = cur_char();
				241	Advance(); // Advance past initial "
				242	for (;;) {
				243	if (at_end()) {
				244	*err_ = Err(LocationRange(location,
				245	Location(input_file_, line_number_, char_in_line_)),
				246	"Unterminated string literal.",
				247	"Don't leave me hanging like this!");
				248	break;
				249	}
				250	if (IsCurrentStringTerminator(initial)) {
				251	Advance(); // Skip past last "
				252	break;
				253	} else if (cur_char() == '\n') {
				254	*err_ = Err(LocationRange(location,
				255	GetCurrentLocation()),
				256	"Newline in string constant.");
				257	}
				258	Advance();
				259	}
				260	break;
				261	}
				262
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	263	case Token::UNCLASSIFIED_OPERATOR:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	264	// Some operators are two characters, some are one.
				265	if (CouldBeTwoCharOperatorBegin(cur_char())) {
				266	if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
				267	Advance();
				268	}
				269	Advance();
				270	break;
				271
				272	case Token::IDENTIFIER:
				273	while (!at_end() && IsIdentifierContinuingChar(cur_char()))
				274	Advance();
				275	break;
				276
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	277	case Token::LEFT_BRACKET:
				278	case Token::RIGHT_BRACKET:
				279	case Token::LEFT_BRACE:
				280	case Token::RIGHT_BRACE:
				281	case Token::LEFT_PAREN:
				282	case Token::RIGHT_PAREN:
				283	case Token::COMMA:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	284	Advance(); // All are one char.
				285	break;
				286
				287	case Token::COMMENT:
				288	// Eat to EOL.
				289	while (!at_end() && !IsCurrentNewline())
				290	Advance();
				291	break;
				292
				293	case Token::INVALID:
[email protected]	6ac871ea	2013-08-19 21:04:50	[diff] [blame^]	294	default:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	295	*err_ = Err(location, "Everything is all messed up",
				296	"Please insert system disk in drive A: and press any key.");
				297	NOTREACHED();
				298	return;
				299	}
				300	}
				301
				302	bool Tokenizer::IsCurrentWhitespace() const {
				303	DCHECK(!at_end());
				304	char c = input_[cur_];
				305	// Note that tab (0x09) is illegal.
				306	return c == 0x0A \|\| c == 0x0B \|\| c == 0x0C \|\| c == 0x0D \|\| c == 0x20;
				307	}
				308
				309	bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
				310	DCHECK(!at_end());
				311	if (cur_char() != quote_char)
				312	return false;
				313
				314	// Check for escaping. \" is not a string terminator, but \\" is. Count
				315	// the number of preceeding backslashes.
				316	int num_backslashes = 0;
				317	for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
				318	num_backslashes++;
				319
				320	// Even backslashes mean that they were escaping each other and don't count
				321	// as escaping this quote.
				322	return (num_backslashes % 2) == 0;
				323	}
				324
				325	bool Tokenizer::IsCurrentNewline() const {
				326	return IsNewline(input_, cur_);
				327	}
				328
				329	void Tokenizer::Advance() {
				330	DCHECK(cur_ < input_.size());
				331	if (IsCurrentNewline()) {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	332	line_number_++;
				333	char_in_line_ = 1;
				334	} else {
				335	char_in_line_++;
				336	}
				337	cur_++;
				338	}
				339
				340	Location Tokenizer::GetCurrentLocation() const {
				341	return Location(input_file_, line_number_, char_in_line_);
				342	}
				343
				344	Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
				345	std::string help;
				346	if (cur_char() == ';') {
				347	// Semicolon.
				348	help = "Semicolons are not needed, delete this one.";
				349	} else if (cur_char() == '\t') {
				350	// Tab.
				351	help = "You got a tab character in here. Tabs are evil. "
				352	"Convert to spaces.";
				353	} else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
				354	(input_[cur_ + 1] == '/' \|\| input_[cur_ + 1] == '*')) {
				355	// Different types of comments.
				356	help = "Comments should start with # instead";
				357	} else {
				358	help = "I have no idea what this is.";
				359	}
				360
				361	return Err(location, "Invalid token.", help);
				362	}