Blame - tools/gn/tokenizer.cc - chromium/src

blob: b33e29a66391f1e39ad9b6169139f0dcfacff0bf [file] [log] [blame]

[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	1	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "tools/gn/tokenizer.h"
				6
				7	#include "base/logging.h"
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	8	#include "base/strings/string_util.h"
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	9	#include "tools/gn/input_file.h"
				10
				11	namespace {
				12
// Returns true when |c| is a character that can open a two-character
// operator such as "<=", "!=", "+=", "&&" or "||".
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
				17
				18	bool CouldBeTwoCharOperatorEnd(char c) {
				19	return c == '=' \|\| c == '\|' \|\| c == '&';
				20	}
				21
				22	bool CouldBeOneCharOperator(char c) {
				23	return c == '=' \|\| c == '<' \|\| c == '>' \|\| c == '+' \|\| c == '!' \|\|
				24	c == ':' \|\| c == '\|' \|\| c == '&' \|\| c == '-';
				25	}
				26
				27	bool CouldBeOperator(char c) {
				28	return CouldBeOneCharOperator(c) \|\| CouldBeTwoCharOperatorBegin(c);
				29	}
				30
// Returns true for the bracketing characters: parentheses, square brackets
// and curly braces.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
				34
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	35	Token::Type GetSpecificOperatorType(base::StringPiece value) {
				36	if (value == "=")
				37	return Token::EQUAL;
				38	if (value == "+")
				39	return Token::PLUS;
				40	if (value == "-")
				41	return Token::MINUS;
				42	if (value == "+=")
				43	return Token::PLUS_EQUALS;
				44	if (value == "-=")
				45	return Token::MINUS_EQUALS;
				46	if (value == "==")
				47	return Token::EQUAL_EQUAL;
				48	if (value == "!=")
				49	return Token::NOT_EQUAL;
				50	if (value == "<=")
				51	return Token::LESS_EQUAL;
				52	if (value == ">=")
				53	return Token::GREATER_EQUAL;
				54	if (value == "<")
				55	return Token::LESS_THAN;
				56	if (value == ">")
				57	return Token::GREATER_THAN;
				58	if (value == "&&")
				59	return Token::BOOLEAN_AND;
				60	if (value == "\|\|")
				61	return Token::BOOLEAN_OR;
				62	if (value == "!")
				63	return Token::BANG;
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	64	if (value == ".")
				65	return Token::DOT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	66	return Token::INVALID;
				67	}
				68
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	69	} // namespace
				70
				71	Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
				72	: input_file_(input_file),
				73	input_(input_file->contents()),
				74	err_(err),
				75	cur_(0),
				76	line_number_(1),
				77	char_in_line_(1) {
				78	}
				79
				80	Tokenizer::~Tokenizer() {
				81	}
				82
				83	// static
				84	std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
				85	Tokenizer t(input_file, err);
				86	return t.Run();
				87	}
				88
				89	std::vector<Token> Tokenizer::Run() {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	90	DCHECK(tokens_.empty());
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	91	while (!done()) {
				92	AdvanceToNextToken();
				93	if (done())
				94	break;
				95	Location location = GetCurrentLocation();
				96
				97	Token::Type type = ClassifyCurrent();
				98	if (type == Token::INVALID) {
				99	*err_ = GetErrorForInvalidToken(location);
				100	break;
				101	}
				102	size_t token_begin = cur_;
				103	AdvanceToEndOfToken(location, type);
				104	if (has_error())
				105	break;
				106	size_t token_end = cur_;
				107
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	108	base::StringPiece token_value(&input_.data()[token_begin],
				109	token_end - token_begin);
				110
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	111	if (type == Token::UNCLASSIFIED_OPERATOR) {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	112	type = GetSpecificOperatorType(token_value);
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	113	} else if (type == Token::IDENTIFIER) {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	114	if (token_value == "if")
				115	type = Token::IF;
[email protected]	ed7d2be2	2013-08-20 17:23:15	[diff] [blame]	116	else if (token_value == "else")
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	117	type = Token::ELSE;
[email protected]	ed7d2be2	2013-08-20 17:23:15	[diff] [blame]	118	else if (token_value == "true")
				119	type = Token::TRUE_TOKEN;
				120	else if (token_value == "false")
				121	type = Token::FALSE_TOKEN;
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	122	} else if (type == Token::UNCLASSIFIED_COMMENT) {
scottmg	7b80f17	2014-09-24 03:35:13	[diff] [blame]	123	if (AtStartOfLine(token_begin) &&
				124	// If it's a standalone comment, but is a continuation of a comment on
				125	// a previous line, then instead make it a continued suffix comment.
				126	(tokens_.empty() \|\| tokens_.back().type() != Token::SUFFIX_COMMENT \|\|
				127	tokens_.back().location().line_number() + 1 !=
				128	location.line_number() \|\|
				129	tokens_.back().location().char_offset() != location.char_offset())) {
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	130	type = Token::LINE_COMMENT;
scottmg	2dd93e8b	2014-09-26 04:07:05	[diff] [blame^]	131	Advance(); // The current \n.
				132	// If this comment is separated from the next syntax element, then we
				133	// want to tag it as a block comment. This will become a standalone
				134	// statement at the parser level to keep this comment separate, rather
				135	// than attached to the subsequent statement.
				136	while (!at_end() && IsCurrentWhitespace()) {
				137	if (IsCurrentNewline()) {
				138	type = Token::BLOCK_COMMENT;
				139	break;
				140	}
				141	Advance();
				142	}
scottmg	7b80f17	2014-09-24 03:35:13	[diff] [blame]	143	} else {
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	144	type = Token::SUFFIX_COMMENT;
scottmg	7b80f17	2014-09-24 03:35:13	[diff] [blame]	145	}
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	146	}
				147
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	148	tokens_.push_back(Token(location, type, token_value));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	149	}
				150	if (err_->has_error())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	151	tokens_.clear();
				152	return tokens_;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	153	}
				154
				155	// static
				156	size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	157	DCHECK_GT(n, 0);
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	158
				159	if (n == 1)
				160	return 0;
				161
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	162	int cur_line = 1;
				163	size_t cur_byte = 0;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	164	while (cur_byte < buf.size()) {
				165	if (IsNewline(buf, cur_byte)) {
				166	cur_line++;
				167	if (cur_line == n)
				168	return cur_byte + 1;
				169	}
				170	cur_byte++;
				171	}
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	172	return static_cast<size_t>(-1);
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	173	}
				174
				175	// static
				176	bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
				177	DCHECK(offset < buffer.size());
				178	// We may need more logic here to handle different line ending styles.
				179	return buffer[offset] == '\n';
				180	}
				181
				182
				183	void Tokenizer::AdvanceToNextToken() {
				184	while (!at_end() && IsCurrentWhitespace())
				185	Advance();
				186	}
				187
				188	Token::Type Tokenizer::ClassifyCurrent() const {
				189	DCHECK(!at_end());
				190	char next_char = cur_char();
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	191	if (IsAsciiDigit(next_char))
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	192	return Token::INTEGER;
				193	if (next_char == '"')
				194	return Token::STRING;
				195
				196	// Note: '-' handled specially below.
				197	if (next_char != '-' && CouldBeOperator(next_char))
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	198	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	199
				200	if (IsIdentifierFirstChar(next_char))
				201	return Token::IDENTIFIER;
				202
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	203	if (next_char == '[')
				204	return Token::LEFT_BRACKET;
				205	if (next_char == ']')
				206	return Token::RIGHT_BRACKET;
				207	if (next_char == '(')
				208	return Token::LEFT_PAREN;
				209	if (next_char == ')')
				210	return Token::RIGHT_PAREN;
				211	if (next_char == '{')
				212	return Token::LEFT_BRACE;
				213	if (next_char == '}')
				214	return Token::RIGHT_BRACE;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	215
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	216	if (next_char == '.')
				217	return Token::DOT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	218	if (next_char == ',')
				219	return Token::COMMA;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	220
				221	if (next_char == '#')
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	222	return Token::UNCLASSIFIED_COMMENT;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	223
				224	// For the case of '-' differentiate between a negative number and anything
				225	// else.
				226	if (next_char == '-') {
				227	if (!CanIncrement())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	228	return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
				229	// file.
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	230	char following_char = input_[cur_ + 1];
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	231	if (IsAsciiDigit(following_char))
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	232	return Token::INTEGER;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	233	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	234	}
				235
				236	return Token::INVALID;
				237	}
				238
				239	void Tokenizer::AdvanceToEndOfToken(const Location& location,
				240	Token::Type type) {
				241	switch (type) {
				242	case Token::INTEGER:
				243	do {
				244	Advance();
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	245	} while (!at_end() && IsAsciiDigit(cur_char()));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	246	if (!at_end()) {
				247	// Require the char after a number to be some kind of space, scope,
				248	// or operator.
				249	char c = cur_char();
				250	if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	251	!IsScoperChar(c) && c != ',') {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	252	*err_ = Err(GetCurrentLocation(),
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	253	"This is not a valid number.",
				254	"Learn to count.");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	255	// Highlight the number.
				256	err_->AppendRange(LocationRange(location, GetCurrentLocation()));
				257	}
				258	}
				259	break;
				260
				261	case Token::STRING: {
				262	char initial = cur_char();
				263	Advance(); // Advance past initial "
				264	for (;;) {
				265	if (at_end()) {
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	266	*err_ = Err(LocationRange(location, GetCurrentLocation()),
				267	"Unterminated string literal.",
				268	"Don't leave me hanging like this!");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	269	break;
				270	}
				271	if (IsCurrentStringTerminator(initial)) {
				272	Advance(); // Skip past last "
				273	break;
				274	} else if (cur_char() == '\n') {
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	275	*err_ = Err(LocationRange(location, GetCurrentLocation()),
				276	"Newline in string constant.");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	277	}
				278	Advance();
				279	}
				280	break;
				281	}
				282
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	283	case Token::UNCLASSIFIED_OPERATOR:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	284	// Some operators are two characters, some are one.
				285	if (CouldBeTwoCharOperatorBegin(cur_char())) {
				286	if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
				287	Advance();
				288	}
				289	Advance();
				290	break;
				291
				292	case Token::IDENTIFIER:
				293	while (!at_end() && IsIdentifierContinuingChar(cur_char()))
				294	Advance();
				295	break;
				296
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	297	case Token::LEFT_BRACKET:
				298	case Token::RIGHT_BRACKET:
				299	case Token::LEFT_BRACE:
				300	case Token::RIGHT_BRACE:
				301	case Token::LEFT_PAREN:
				302	case Token::RIGHT_PAREN:
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	303	case Token::DOT:
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	304	case Token::COMMA:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	305	Advance(); // All are one char.
				306	break;
				307
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	308	case Token::UNCLASSIFIED_COMMENT:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	309	// Eat to EOL.
				310	while (!at_end() && !IsCurrentNewline())
				311	Advance();
				312	break;
				313
				314	case Token::INVALID:
[email protected]	6ac871ea	2013-08-19 21:04:50	[diff] [blame]	315	default:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	316	*err_ = Err(location, "Everything is all messed up",
				317	"Please insert system disk in drive A: and press any key.");
				318	NOTREACHED();
				319	return;
				320	}
				321	}
				322
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	323	bool Tokenizer::AtStartOfLine(size_t location) const {
				324	while (location > 0) {
				325	--location;
				326	char c = input_[location];
				327	if (c == '\n')
				328	return true;
				329	if (c != ' ')
				330	return false;
				331	}
				332	return true;
				333	}
				334
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	335	bool Tokenizer::IsCurrentWhitespace() const {
				336	DCHECK(!at_end());
				337	char c = input_[cur_];
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	338	// Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
				339	return c == 0x0A \|\| c == 0x0D \|\| c == 0x20;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	340	}
				341
				342	bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
				343	DCHECK(!at_end());
				344	if (cur_char() != quote_char)
				345	return false;
				346
				347	// Check for escaping. \" is not a string terminator, but \\" is. Count
				348	// the number of preceeding backslashes.
				349	int num_backslashes = 0;
				350	for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
				351	num_backslashes++;
				352
				353	// Even backslashes mean that they were escaping each other and don't count
				354	// as escaping this quote.
				355	return (num_backslashes % 2) == 0;
				356	}
				357
				358	bool Tokenizer::IsCurrentNewline() const {
				359	return IsNewline(input_, cur_);
				360	}
				361
				362	void Tokenizer::Advance() {
				363	DCHECK(cur_ < input_.size());
				364	if (IsCurrentNewline()) {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	365	line_number_++;
				366	char_in_line_ = 1;
				367	} else {
				368	char_in_line_++;
				369	}
				370	cur_++;
				371	}
				372
				373	Location Tokenizer::GetCurrentLocation() const {
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	374	return Location(
				375	input_file_, line_number_, char_in_line_, static_cast<int>(cur_));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	376	}
				377
				378	Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
				379	std::string help;
				380	if (cur_char() == ';') {
				381	// Semicolon.
				382	help = "Semicolons are not needed, delete this one.";
				383	} else if (cur_char() == '\t') {
				384	// Tab.
				385	help = "You got a tab character in here. Tabs are evil. "
				386	"Convert to spaces.";
				387	} else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
				388	(input_[cur_ + 1] == '/' \|\| input_[cur_ + 1] == '*')) {
				389	// Different types of comments.
				390	help = "Comments should start with # instead";
				391	} else {
				392	help = "I have no idea what this is.";
				393	}
				394
				395	return Err(location, "Invalid token.", help);
				396	}