Blame - tools/gn/tokenizer.cc - chromium/src

blob: c0890060b37e80cf931901abf0b2b3bb68aec344 [file] [log] [blame]

[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	1	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "tools/gn/tokenizer.h"
				6
				7	#include "base/logging.h"
				8	#include "tools/gn/input_file.h"
				9
				10	namespace {
				11
// Returns true if |c| can be the first character of a two-character operator
// (for example the '<' in "<=" or the '&' in "&&").
bool CouldBeTwoCharOperatorBegin(char c) {
  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
         c == '+' || c == '|' || c == '&';
}
				16
				17	bool CouldBeTwoCharOperatorEnd(char c) {
				18	return c == '=' \|\| c == '\|' \|\| c == '&';
				19	}
				20
				21	bool CouldBeOneCharOperator(char c) {
				22	return c == '=' \|\| c == '<' \|\| c == '>' \|\| c == '+' \|\| c == '!' \|\|
				23	c == ':' \|\| c == '\|' \|\| c == '&' \|\| c == '-';
				24	}
				25
				26	bool CouldBeOperator(char c) {
				27	return CouldBeOneCharOperator(c) \|\| CouldBeTwoCharOperatorBegin(c);
				28	}
				29
// Returns true if |c| is an opening or closing parenthesis, square bracket, or
// curly brace.
bool IsScoperChar(char c) {
  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
}
				33
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	34	Token::Type GetSpecificOperatorType(base::StringPiece value) {
				35	if (value == "=")
				36	return Token::EQUAL;
				37	if (value == "+")
				38	return Token::PLUS;
				39	if (value == "-")
				40	return Token::MINUS;
				41	if (value == "+=")
				42	return Token::PLUS_EQUALS;
				43	if (value == "-=")
				44	return Token::MINUS_EQUALS;
				45	if (value == "==")
				46	return Token::EQUAL_EQUAL;
				47	if (value == "!=")
				48	return Token::NOT_EQUAL;
				49	if (value == "<=")
				50	return Token::LESS_EQUAL;
				51	if (value == ">=")
				52	return Token::GREATER_EQUAL;
				53	if (value == "<")
				54	return Token::LESS_THAN;
				55	if (value == ">")
				56	return Token::GREATER_THAN;
				57	if (value == "&&")
				58	return Token::BOOLEAN_AND;
				59	if (value == "\|\|")
				60	return Token::BOOLEAN_OR;
				61	if (value == "!")
				62	return Token::BANG;
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	63	if (value == ".")
				64	return Token::DOT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	65	return Token::INVALID;
				66	}
				67
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	68	} // namespace
				69
				70	Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
				71	: input_file_(input_file),
				72	input_(input_file->contents()),
				73	err_(err),
				74	cur_(0),
				75	line_number_(1),
				76	char_in_line_(1) {
				77	}
				78
				79	Tokenizer::~Tokenizer() {
				80	}
				81
				82	// static
				83	std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
				84	Tokenizer t(input_file, err);
				85	return t.Run();
				86	}
				87
				88	std::vector<Token> Tokenizer::Run() {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	89	DCHECK(tokens_.empty());
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	90	while (!done()) {
				91	AdvanceToNextToken();
				92	if (done())
				93	break;
				94	Location location = GetCurrentLocation();
				95
				96	Token::Type type = ClassifyCurrent();
				97	if (type == Token::INVALID) {
				98	*err_ = GetErrorForInvalidToken(location);
				99	break;
				100	}
				101	size_t token_begin = cur_;
				102	AdvanceToEndOfToken(location, type);
				103	if (has_error())
				104	break;
				105	size_t token_end = cur_;
				106
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	107	base::StringPiece token_value(&input_.data()[token_begin],
				108	token_end - token_begin);
				109
				110	if (type == Token::UNCLASSIFIED_OPERATOR)
				111	type = GetSpecificOperatorType(token_value);
				112	if (type == Token::IDENTIFIER) {
				113	if (token_value == "if")
				114	type = Token::IF;
[email protected]	ed7d2be2	2013-08-20 17:23:15	[diff] [blame]	115	else if (token_value == "else")
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	116	type = Token::ELSE;
[email protected]	ed7d2be2	2013-08-20 17:23:15	[diff] [blame]	117	else if (token_value == "true")
				118	type = Token::TRUE_TOKEN;
				119	else if (token_value == "false")
				120	type = Token::FALSE_TOKEN;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	121	}
				122
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	123	// TODO(brettw) This just strips comments from the token stream. This
				124	// is probably wrong, they should be removed at a later stage so we can
				125	// do things like rewrite the file. But this makes the parser simpler and
				126	// is OK for now.
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	127	if (type != Token::COMMENT)
				128	tokens_.push_back(Token(location, type, token_value));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	129	}
				130	if (err_->has_error())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	131	tokens_.clear();
				132	return tokens_;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	133	}
				134
				135	// static
				136	size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
				137	int cur_line = 1;
				138	size_t cur_byte = 0;
				139
				140	DCHECK(n > 0);
				141
				142	if (n == 1)
				143	return 0;
				144
				145	while (cur_byte < buf.size()) {
				146	if (IsNewline(buf, cur_byte)) {
				147	cur_line++;
				148	if (cur_line == n)
				149	return cur_byte + 1;
				150	}
				151	cur_byte++;
				152	}
				153	return -1;
				154	}
				155
				156	// static
				157	bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
				158	DCHECK(offset < buffer.size());
				159	// We may need more logic here to handle different line ending styles.
				160	return buffer[offset] == '\n';
				161	}
				162
				163
				164	void Tokenizer::AdvanceToNextToken() {
				165	while (!at_end() && IsCurrentWhitespace())
				166	Advance();
				167	}
				168
				169	Token::Type Tokenizer::ClassifyCurrent() const {
				170	DCHECK(!at_end());
				171	char next_char = cur_char();
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	172	if (IsAsciiDigit(next_char))
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	173	return Token::INTEGER;
				174	if (next_char == '"')
				175	return Token::STRING;
				176
				177	// Note: '-' handled specially below.
				178	if (next_char != '-' && CouldBeOperator(next_char))
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	179	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	180
				181	if (IsIdentifierFirstChar(next_char))
				182	return Token::IDENTIFIER;
				183
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	184	if (next_char == '[')
				185	return Token::LEFT_BRACKET;
				186	if (next_char == ']')
				187	return Token::RIGHT_BRACKET;
				188	if (next_char == '(')
				189	return Token::LEFT_PAREN;
				190	if (next_char == ')')
				191	return Token::RIGHT_PAREN;
				192	if (next_char == '{')
				193	return Token::LEFT_BRACE;
				194	if (next_char == '}')
				195	return Token::RIGHT_BRACE;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	196
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	197	if (next_char == '.')
				198	return Token::DOT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	199	if (next_char == ',')
				200	return Token::COMMA;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	201
				202	if (next_char == '#')
				203	return Token::COMMENT;
				204
				205	// For the case of '-' differentiate between a negative number and anything
				206	// else.
				207	if (next_char == '-') {
				208	if (!CanIncrement())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	209	return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
				210	// file.
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	211	char following_char = input_[cur_ + 1];
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	212	if (IsAsciiDigit(following_char))
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	213	return Token::INTEGER;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	214	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	215	}
				216
				217	return Token::INVALID;
				218	}
				219
				220	void Tokenizer::AdvanceToEndOfToken(const Location& location,
				221	Token::Type type) {
				222	switch (type) {
				223	case Token::INTEGER:
				224	do {
				225	Advance();
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	226	} while (!at_end() && IsAsciiDigit(cur_char()));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	227	if (!at_end()) {
				228	// Require the char after a number to be some kind of space, scope,
				229	// or operator.
				230	char c = cur_char();
				231	if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	232	!IsScoperChar(c) && c != ',') {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	233	*err_ = Err(GetCurrentLocation(),
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame^]	234	"This is not a valid number.",
				235	"Learn to count.");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	236	// Highlight the number.
				237	err_->AppendRange(LocationRange(location, GetCurrentLocation()));
				238	}
				239	}
				240	break;
				241
				242	case Token::STRING: {
				243	char initial = cur_char();
				244	Advance(); // Advance past initial "
				245	for (;;) {
				246	if (at_end()) {
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame^]	247	*err_ = Err(LocationRange(location, GetCurrentLocation()),
				248	"Unterminated string literal.",
				249	"Don't leave me hanging like this!");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	250	break;
				251	}
				252	if (IsCurrentStringTerminator(initial)) {
				253	Advance(); // Skip past last "
				254	break;
				255	} else if (cur_char() == '\n') {
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame^]	256	*err_ = Err(LocationRange(location, GetCurrentLocation()),
				257	"Newline in string constant.");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	258	}
				259	Advance();
				260	}
				261	break;
				262	}
				263
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	264	case Token::UNCLASSIFIED_OPERATOR:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	265	// Some operators are two characters, some are one.
				266	if (CouldBeTwoCharOperatorBegin(cur_char())) {
				267	if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
				268	Advance();
				269	}
				270	Advance();
				271	break;
				272
				273	case Token::IDENTIFIER:
				274	while (!at_end() && IsIdentifierContinuingChar(cur_char()))
				275	Advance();
				276	break;
				277
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	278	case Token::LEFT_BRACKET:
				279	case Token::RIGHT_BRACKET:
				280	case Token::LEFT_BRACE:
				281	case Token::RIGHT_BRACE:
				282	case Token::LEFT_PAREN:
				283	case Token::RIGHT_PAREN:
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	284	case Token::DOT:
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	285	case Token::COMMA:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	286	Advance(); // All are one char.
				287	break;
				288
				289	case Token::COMMENT:
				290	// Eat to EOL.
				291	while (!at_end() && !IsCurrentNewline())
				292	Advance();
				293	break;
				294
				295	case Token::INVALID:
[email protected]	6ac871ea	2013-08-19 21:04:50	[diff] [blame]	296	default:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	297	*err_ = Err(location, "Everything is all messed up",
				298	"Please insert system disk in drive A: and press any key.");
				299	NOTREACHED();
				300	return;
				301	}
				302	}
				303
				304	bool Tokenizer::IsCurrentWhitespace() const {
				305	DCHECK(!at_end());
				306	char c = input_[cur_];
				307	// Note that tab (0x09) is illegal.
				308	return c == 0x0A \|\| c == 0x0B \|\| c == 0x0C \|\| c == 0x0D \|\| c == 0x20;
				309	}
				310
				311	bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
				312	DCHECK(!at_end());
				313	if (cur_char() != quote_char)
				314	return false;
				315
				316	// Check for escaping. \" is not a string terminator, but \\" is. Count
				317	// the number of preceeding backslashes.
				318	int num_backslashes = 0;
				319	for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
				320	num_backslashes++;
				321
				322	// Even backslashes mean that they were escaping each other and don't count
				323	// as escaping this quote.
				324	return (num_backslashes % 2) == 0;
				325	}
				326
				327	bool Tokenizer::IsCurrentNewline() const {
				328	return IsNewline(input_, cur_);
				329	}
				330
				331	void Tokenizer::Advance() {
				332	DCHECK(cur_ < input_.size());
				333	if (IsCurrentNewline()) {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	334	line_number_++;
				335	char_in_line_ = 1;
				336	} else {
				337	char_in_line_++;
				338	}
				339	cur_++;
				340	}
				341
				342	Location Tokenizer::GetCurrentLocation() const {
				343	return Location(input_file_, line_number_, char_in_line_);
				344	}
				345
				346	Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
				347	std::string help;
				348	if (cur_char() == ';') {
				349	// Semicolon.
				350	help = "Semicolons are not needed, delete this one.";
				351	} else if (cur_char() == '\t') {
				352	// Tab.
				353	help = "You got a tab character in here. Tabs are evil. "
				354	"Convert to spaces.";
				355	} else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
				356	(input_[cur_ + 1] == '/' \|\| input_[cur_ + 1] == '*')) {
				357	// Different types of comments.
				358	help = "Comments should start with # instead";
				359	} else {
				360	help = "I have no idea what this is.";
				361	}
				362
				363	return Err(location, "Invalid token.", help);
				364	}