Blame - tools/gn/tokenizer.cc - chromium/src

blob: 0568becbc8dcb51f92808b74c14a83de5cc45f32 [file] [log] [blame]

[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	1	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "tools/gn/tokenizer.h"
				6
				7	#include "base/logging.h"
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	8	#include "base/strings/string_util.h"
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	9	#include "tools/gn/input_file.h"
				10
				11	namespace {
				12
// Returns true if |c| can be the first character of a two-character operator
// such as "<=", "+=" or "&&".
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
				17
				18	bool CouldBeTwoCharOperatorEnd(char c) {
				19	return c == '=' \|\| c == '\|' \|\| c == '&';
				20	}
				21
				22	bool CouldBeOneCharOperator(char c) {
				23	return c == '=' \|\| c == '<' \|\| c == '>' \|\| c == '+' \|\| c == '!' \|\|
				24	c == ':' \|\| c == '\|' \|\| c == '&' \|\| c == '-';
				25	}
				26
				27	bool CouldBeOperator(char c) {
				28	return CouldBeOneCharOperator(c) \|\| CouldBeTwoCharOperatorBegin(c);
				29	}
				30
// Returns true if |c| is an opening or closing parenthesis, square bracket or
// curly brace.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
				34
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	35	Token::Type GetSpecificOperatorType(base::StringPiece value) {
				36	if (value == "=")
				37	return Token::EQUAL;
				38	if (value == "+")
				39	return Token::PLUS;
				40	if (value == "-")
				41	return Token::MINUS;
				42	if (value == "+=")
				43	return Token::PLUS_EQUALS;
				44	if (value == "-=")
				45	return Token::MINUS_EQUALS;
				46	if (value == "==")
				47	return Token::EQUAL_EQUAL;
				48	if (value == "!=")
				49	return Token::NOT_EQUAL;
				50	if (value == "<=")
				51	return Token::LESS_EQUAL;
				52	if (value == ">=")
				53	return Token::GREATER_EQUAL;
				54	if (value == "<")
				55	return Token::LESS_THAN;
				56	if (value == ">")
				57	return Token::GREATER_THAN;
				58	if (value == "&&")
				59	return Token::BOOLEAN_AND;
				60	if (value == "\|\|")
				61	return Token::BOOLEAN_OR;
				62	if (value == "!")
				63	return Token::BANG;
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	64	if (value == ".")
				65	return Token::DOT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	66	return Token::INVALID;
				67	}
				68
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	69	} // namespace
				70
				71	Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
				72	: input_file_(input_file),
				73	input_(input_file->contents()),
				74	err_(err),
				75	cur_(0),
				76	line_number_(1),
tfarina	21aff0d	2016-01-06 19:50:08	[diff] [blame^]	77	column_number_(1) {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	78	}
				79
				80	Tokenizer::~Tokenizer() {
				81	}
				82
				83	// static
				84	std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
				85	Tokenizer t(input_file, err);
				86	return t.Run();
				87	}
				88
				89	std::vector<Token> Tokenizer::Run() {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	90	DCHECK(tokens_.empty());
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	91	while (!done()) {
				92	AdvanceToNextToken();
				93	if (done())
				94	break;
				95	Location location = GetCurrentLocation();
				96
				97	Token::Type type = ClassifyCurrent();
				98	if (type == Token::INVALID) {
				99	*err_ = GetErrorForInvalidToken(location);
				100	break;
				101	}
				102	size_t token_begin = cur_;
				103	AdvanceToEndOfToken(location, type);
				104	if (has_error())
				105	break;
				106	size_t token_end = cur_;
				107
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	108	base::StringPiece token_value(&input_.data()[token_begin],
				109	token_end - token_begin);
				110
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	111	if (type == Token::UNCLASSIFIED_OPERATOR) {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	112	type = GetSpecificOperatorType(token_value);
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	113	} else if (type == Token::IDENTIFIER) {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	114	if (token_value == "if")
				115	type = Token::IF;
[email protected]	ed7d2be2	2013-08-20 17:23:15	[diff] [blame]	116	else if (token_value == "else")
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	117	type = Token::ELSE;
[email protected]	ed7d2be2	2013-08-20 17:23:15	[diff] [blame]	118	else if (token_value == "true")
				119	type = Token::TRUE_TOKEN;
				120	else if (token_value == "false")
				121	type = Token::FALSE_TOKEN;
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	122	} else if (type == Token::UNCLASSIFIED_COMMENT) {
scottmg	7b80f17	2014-09-24 03:35:13	[diff] [blame]	123	if (AtStartOfLine(token_begin) &&
				124	// If it's a standalone comment, but is a continuation of a comment on
				125	// a previous line, then instead make it a continued suffix comment.
				126	(tokens_.empty() \|\| tokens_.back().type() != Token::SUFFIX_COMMENT \|\|
				127	tokens_.back().location().line_number() + 1 !=
				128	location.line_number() \|\|
tfarina	21aff0d	2016-01-06 19:50:08	[diff] [blame^]	129	tokens_.back().location().column_number() !=
				130	location.column_number())) {
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	131	type = Token::LINE_COMMENT;
brettw	693c6056	2015-08-07 19:55:17	[diff] [blame]	132	if (!at_end()) // Could be EOF.
				133	Advance(); // The current \n.
scottmg	2dd93e8b	2014-09-26 04:07:05	[diff] [blame]	134	// If this comment is separated from the next syntax element, then we
				135	// want to tag it as a block comment. This will become a standalone
				136	// statement at the parser level to keep this comment separate, rather
				137	// than attached to the subsequent statement.
				138	while (!at_end() && IsCurrentWhitespace()) {
				139	if (IsCurrentNewline()) {
				140	type = Token::BLOCK_COMMENT;
				141	break;
				142	}
				143	Advance();
				144	}
scottmg	7b80f17	2014-09-24 03:35:13	[diff] [blame]	145	} else {
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	146	type = Token::SUFFIX_COMMENT;
scottmg	7b80f17	2014-09-24 03:35:13	[diff] [blame]	147	}
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	148	}
				149
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	150	tokens_.push_back(Token(location, type, token_value));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	151	}
				152	if (err_->has_error())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	153	tokens_.clear();
				154	return tokens_;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	155	}
				156
				157	// static
				158	size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	159	DCHECK_GT(n, 0);
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	160
				161	if (n == 1)
				162	return 0;
				163
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	164	int cur_line = 1;
				165	size_t cur_byte = 0;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	166	while (cur_byte < buf.size()) {
				167	if (IsNewline(buf, cur_byte)) {
				168	cur_line++;
				169	if (cur_line == n)
				170	return cur_byte + 1;
				171	}
				172	cur_byte++;
				173	}
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	174	return static_cast<size_t>(-1);
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	175	}
				176
				177	// static
				178	bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
				179	DCHECK(offset < buffer.size());
				180	// We may need more logic here to handle different line ending styles.
				181	return buffer[offset] == '\n';
				182	}
				183
tfarina	0c60e3a9	2015-12-01 00:10:50	[diff] [blame]	184	// static
				185	bool Tokenizer::IsIdentifierFirstChar(char c) {
				186	return base::IsAsciiAlpha(c) \|\| c == '_';
				187	}
				188
				189	// static
				190	bool Tokenizer::IsIdentifierContinuingChar(char c) {
				191	// Also allow digits after the first char.
				192	return IsIdentifierFirstChar(c) \|\| base::IsAsciiDigit(c);
				193	}
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	194
				195	void Tokenizer::AdvanceToNextToken() {
				196	while (!at_end() && IsCurrentWhitespace())
				197	Advance();
				198	}
				199
				200	Token::Type Tokenizer::ClassifyCurrent() const {
				201	DCHECK(!at_end());
				202	char next_char = cur_char();
brettw	b341306	2015-06-24 00:39:02	[diff] [blame]	203	if (base::IsAsciiDigit(next_char))
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	204	return Token::INTEGER;
				205	if (next_char == '"')
				206	return Token::STRING;
				207
				208	// Note: '-' handled specially below.
				209	if (next_char != '-' && CouldBeOperator(next_char))
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	210	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	211
				212	if (IsIdentifierFirstChar(next_char))
				213	return Token::IDENTIFIER;
				214
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	215	if (next_char == '[')
				216	return Token::LEFT_BRACKET;
				217	if (next_char == ']')
				218	return Token::RIGHT_BRACKET;
				219	if (next_char == '(')
				220	return Token::LEFT_PAREN;
				221	if (next_char == ')')
				222	return Token::RIGHT_PAREN;
				223	if (next_char == '{')
				224	return Token::LEFT_BRACE;
				225	if (next_char == '}')
				226	return Token::RIGHT_BRACE;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	227
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	228	if (next_char == '.')
				229	return Token::DOT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	230	if (next_char == ',')
				231	return Token::COMMA;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	232
				233	if (next_char == '#')
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	234	return Token::UNCLASSIFIED_COMMENT;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	235
				236	// For the case of '-' differentiate between a negative number and anything
				237	// else.
				238	if (next_char == '-') {
				239	if (!CanIncrement())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	240	return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
				241	// file.
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	242	char following_char = input_[cur_ + 1];
brettw	b341306	2015-06-24 00:39:02	[diff] [blame]	243	if (base::IsAsciiDigit(following_char))
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	244	return Token::INTEGER;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	245	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	246	}
				247
				248	return Token::INVALID;
				249	}
				250
				251	void Tokenizer::AdvanceToEndOfToken(const Location& location,
				252	Token::Type type) {
				253	switch (type) {
				254	case Token::INTEGER:
				255	do {
				256	Advance();
brettw	b341306	2015-06-24 00:39:02	[diff] [blame]	257	} while (!at_end() && base::IsAsciiDigit(cur_char()));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	258	if (!at_end()) {
				259	// Require the char after a number to be some kind of space, scope,
				260	// or operator.
				261	char c = cur_char();
				262	if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	263	!IsScoperChar(c) && c != ',') {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	264	*err_ = Err(GetCurrentLocation(),
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	265	"This is not a valid number.",
				266	"Learn to count.");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	267	// Highlight the number.
				268	err_->AppendRange(LocationRange(location, GetCurrentLocation()));
				269	}
				270	}
				271	break;
				272
				273	case Token::STRING: {
				274	char initial = cur_char();
				275	Advance(); // Advance past initial "
				276	for (;;) {
				277	if (at_end()) {
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	278	*err_ = Err(LocationRange(location, GetCurrentLocation()),
				279	"Unterminated string literal.",
				280	"Don't leave me hanging like this!");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	281	break;
				282	}
				283	if (IsCurrentStringTerminator(initial)) {
				284	Advance(); // Skip past last "
				285	break;
tfarina	9d6f544	2014-12-18 04:42:21	[diff] [blame]	286	} else if (IsCurrentNewline()) {
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	287	*err_ = Err(LocationRange(location, GetCurrentLocation()),
				288	"Newline in string constant.");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	289	}
				290	Advance();
				291	}
				292	break;
				293	}
				294
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	295	case Token::UNCLASSIFIED_OPERATOR:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	296	// Some operators are two characters, some are one.
				297	if (CouldBeTwoCharOperatorBegin(cur_char())) {
				298	if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
				299	Advance();
				300	}
				301	Advance();
				302	break;
				303
				304	case Token::IDENTIFIER:
				305	while (!at_end() && IsIdentifierContinuingChar(cur_char()))
				306	Advance();
				307	break;
				308
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	309	case Token::LEFT_BRACKET:
				310	case Token::RIGHT_BRACKET:
				311	case Token::LEFT_BRACE:
				312	case Token::RIGHT_BRACE:
				313	case Token::LEFT_PAREN:
				314	case Token::RIGHT_PAREN:
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	315	case Token::DOT:
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	316	case Token::COMMA:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	317	Advance(); // All are one char.
				318	break;
				319
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	320	case Token::UNCLASSIFIED_COMMENT:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	321	// Eat to EOL.
				322	while (!at_end() && !IsCurrentNewline())
				323	Advance();
				324	break;
				325
				326	case Token::INVALID:
[email protected]	6ac871ea	2013-08-19 21:04:50	[diff] [blame]	327	default:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	328	*err_ = Err(location, "Everything is all messed up",
				329	"Please insert system disk in drive A: and press any key.");
				330	NOTREACHED();
				331	return;
				332	}
				333	}
				334
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	335	bool Tokenizer::AtStartOfLine(size_t location) const {
				336	while (location > 0) {
				337	--location;
				338	char c = input_[location];
				339	if (c == '\n')
				340	return true;
				341	if (c != ' ')
				342	return false;
				343	}
				344	return true;
				345	}
				346
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	347	bool Tokenizer::IsCurrentWhitespace() const {
				348	DCHECK(!at_end());
				349	char c = input_[cur_];
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	350	// Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
				351	return c == 0x0A \|\| c == 0x0D \|\| c == 0x20;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	352	}
				353
				354	bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
				355	DCHECK(!at_end());
				356	if (cur_char() != quote_char)
				357	return false;
				358
				359	// Check for escaping. \" is not a string terminator, but \\" is. Count
				360	// the number of preceeding backslashes.
				361	int num_backslashes = 0;
				362	for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
				363	num_backslashes++;
				364
				365	// Even backslashes mean that they were escaping each other and don't count
				366	// as escaping this quote.
				367	return (num_backslashes % 2) == 0;
				368	}
				369
				370	bool Tokenizer::IsCurrentNewline() const {
				371	return IsNewline(input_, cur_);
				372	}
				373
				374	void Tokenizer::Advance() {
				375	DCHECK(cur_ < input_.size());
				376	if (IsCurrentNewline()) {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	377	line_number_++;
tfarina	21aff0d	2016-01-06 19:50:08	[diff] [blame^]	378	column_number_ = 1;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	379	} else {
tfarina	21aff0d	2016-01-06 19:50:08	[diff] [blame^]	380	column_number_++;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	381	}
				382	cur_++;
				383	}
				384
				385	Location Tokenizer::GetCurrentLocation() const {
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	386	return Location(
tfarina	21aff0d	2016-01-06 19:50:08	[diff] [blame^]	387	input_file_, line_number_, column_number_, static_cast<int>(cur_));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	388	}
				389
				390	Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
				391	std::string help;
				392	if (cur_char() == ';') {
				393	// Semicolon.
				394	help = "Semicolons are not needed, delete this one.";
				395	} else if (cur_char() == '\t') {
				396	// Tab.
				397	help = "You got a tab character in here. Tabs are evil. "
				398	"Convert to spaces.";
				399	} else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
				400	(input_[cur_ + 1] == '/' \|\| input_[cur_ + 1] == '*')) {
				401	// Different types of comments.
				402	help = "Comments should start with # instead";
tfarina	dd49500	2015-03-23 18:34:19	[diff] [blame]	403	} else if (cur_char() == '\'') {
				404	help = "Strings are delimited by \" characters, not apostrophes.";
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	405	} else {
				406	help = "I have no idea what this is.";
				407	}
				408
				409	return Err(location, "Invalid token.", help);
				410	}