Blame - tools/gn/tokenizer.cc - chromium/src

blob: bc0b6384df0f1ef517275e6348ddde8d52903ea9 [file] [log] [blame]

[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	1	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "tools/gn/tokenizer.h"
				6
				7	#include "base/logging.h"
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	8	#include "base/strings/string_util.h"
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	9	#include "tools/gn/input_file.h"
				10
				11	namespace {
				12
// Returns true if |c| is one of the characters that may start a two-character
// operator: '<', '>', '!', '=', '-', '+', '|' or '&'.
bool CouldBeTwoCharOperatorBegin(char c) {
  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
         c == '+' || c == '|' || c == '&';
}
				17
				18	bool CouldBeTwoCharOperatorEnd(char c) {
				19	return c == '=' \|\| c == '\|' \|\| c == '&';
				20	}
				21
				22	bool CouldBeOneCharOperator(char c) {
				23	return c == '=' \|\| c == '<' \|\| c == '>' \|\| c == '+' \|\| c == '!' \|\|
				24	c == ':' \|\| c == '\|' \|\| c == '&' \|\| c == '-';
				25	}
				26
				27	bool CouldBeOperator(char c) {
				28	return CouldBeOneCharOperator(c) \|\| CouldBeTwoCharOperatorBegin(c);
				29	}
				30
// Returns true if |c| is an opening or closing parenthesis, square bracket or
// curly brace.
bool IsScoperChar(char c) {
  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
}
				34
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	35	Token::Type GetSpecificOperatorType(base::StringPiece value) {
				36	if (value == "=")
				37	return Token::EQUAL;
				38	if (value == "+")
				39	return Token::PLUS;
				40	if (value == "-")
				41	return Token::MINUS;
				42	if (value == "+=")
				43	return Token::PLUS_EQUALS;
				44	if (value == "-=")
				45	return Token::MINUS_EQUALS;
				46	if (value == "==")
				47	return Token::EQUAL_EQUAL;
				48	if (value == "!=")
				49	return Token::NOT_EQUAL;
				50	if (value == "<=")
				51	return Token::LESS_EQUAL;
				52	if (value == ">=")
				53	return Token::GREATER_EQUAL;
				54	if (value == "<")
				55	return Token::LESS_THAN;
				56	if (value == ">")
				57	return Token::GREATER_THAN;
				58	if (value == "&&")
				59	return Token::BOOLEAN_AND;
				60	if (value == "\|\|")
				61	return Token::BOOLEAN_OR;
				62	if (value == "!")
				63	return Token::BANG;
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	64	if (value == ".")
				65	return Token::DOT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	66	return Token::INVALID;
				67	}
				68
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	69	} // namespace
				70
				71	Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
				72	: input_file_(input_file),
				73	input_(input_file->contents()),
				74	err_(err),
				75	cur_(0),
				76	line_number_(1),
				77	char_in_line_(1) {
				78	}
				79
				80	Tokenizer::~Tokenizer() {
				81	}
				82
				83	// static
				84	std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
				85	Tokenizer t(input_file, err);
				86	return t.Run();
				87	}
				88
				89	std::vector<Token> Tokenizer::Run() {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	90	DCHECK(tokens_.empty());
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	91	while (!done()) {
				92	AdvanceToNextToken();
				93	if (done())
				94	break;
				95	Location location = GetCurrentLocation();
				96
				97	Token::Type type = ClassifyCurrent();
				98	if (type == Token::INVALID) {
				99	*err_ = GetErrorForInvalidToken(location);
				100	break;
				101	}
				102	size_t token_begin = cur_;
				103	AdvanceToEndOfToken(location, type);
				104	if (has_error())
				105	break;
				106	size_t token_end = cur_;
				107
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	108	base::StringPiece token_value(&input_.data()[token_begin],
				109	token_end - token_begin);
				110
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	111	if (type == Token::UNCLASSIFIED_OPERATOR) {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	112	type = GetSpecificOperatorType(token_value);
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	113	} else if (type == Token::IDENTIFIER) {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	114	if (token_value == "if")
				115	type = Token::IF;
[email protected]	ed7d2be2	2013-08-20 17:23:15	[diff] [blame]	116	else if (token_value == "else")
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	117	type = Token::ELSE;
[email protected]	ed7d2be2	2013-08-20 17:23:15	[diff] [blame]	118	else if (token_value == "true")
				119	type = Token::TRUE_TOKEN;
				120	else if (token_value == "false")
				121	type = Token::FALSE_TOKEN;
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	122	} else if (type == Token::UNCLASSIFIED_COMMENT) {
scottmg	7b80f17	2014-09-24 03:35:13	[diff] [blame^]	123	if (AtStartOfLine(token_begin) &&
				124	// If it's a standalone comment, but is a continuation of a comment on
				125	// a previous line, then instead make it a continued suffix comment.
				126	(tokens_.empty() \|\| tokens_.back().type() != Token::SUFFIX_COMMENT \|\|
				127	tokens_.back().location().line_number() + 1 !=
				128	location.line_number() \|\|
				129	tokens_.back().location().char_offset() != location.char_offset())) {
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	130	type = Token::LINE_COMMENT;
scottmg	7b80f17	2014-09-24 03:35:13	[diff] [blame^]	131	} else {
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	132	type = Token::SUFFIX_COMMENT;
scottmg	7b80f17	2014-09-24 03:35:13	[diff] [blame^]	133	}
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	134	}
				135
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	136	tokens_.push_back(Token(location, type, token_value));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	137	}
				138	if (err_->has_error())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	139	tokens_.clear();
				140	return tokens_;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	141	}
				142
				143	// static
				144	size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	145	DCHECK_GT(n, 0);
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	146
				147	if (n == 1)
				148	return 0;
				149
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	150	int cur_line = 1;
				151	size_t cur_byte = 0;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	152	while (cur_byte < buf.size()) {
				153	if (IsNewline(buf, cur_byte)) {
				154	cur_line++;
				155	if (cur_line == n)
				156	return cur_byte + 1;
				157	}
				158	cur_byte++;
				159	}
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	160	return static_cast<size_t>(-1);
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	161	}
				162
				163	// static
				164	bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
				165	DCHECK(offset < buffer.size());
				166	// We may need more logic here to handle different line ending styles.
				167	return buffer[offset] == '\n';
				168	}
				169
				170
				171	void Tokenizer::AdvanceToNextToken() {
				172	while (!at_end() && IsCurrentWhitespace())
				173	Advance();
				174	}
				175
				176	Token::Type Tokenizer::ClassifyCurrent() const {
				177	DCHECK(!at_end());
				178	char next_char = cur_char();
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	179	if (IsAsciiDigit(next_char))
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	180	return Token::INTEGER;
				181	if (next_char == '"')
				182	return Token::STRING;
				183
				184	// Note: '-' handled specially below.
				185	if (next_char != '-' && CouldBeOperator(next_char))
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	186	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	187
				188	if (IsIdentifierFirstChar(next_char))
				189	return Token::IDENTIFIER;
				190
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	191	if (next_char == '[')
				192	return Token::LEFT_BRACKET;
				193	if (next_char == ']')
				194	return Token::RIGHT_BRACKET;
				195	if (next_char == '(')
				196	return Token::LEFT_PAREN;
				197	if (next_char == ')')
				198	return Token::RIGHT_PAREN;
				199	if (next_char == '{')
				200	return Token::LEFT_BRACE;
				201	if (next_char == '}')
				202	return Token::RIGHT_BRACE;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	203
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	204	if (next_char == '.')
				205	return Token::DOT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	206	if (next_char == ',')
				207	return Token::COMMA;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	208
				209	if (next_char == '#')
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	210	return Token::UNCLASSIFIED_COMMENT;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	211
				212	// For the case of '-' differentiate between a negative number and anything
				213	// else.
				214	if (next_char == '-') {
				215	if (!CanIncrement())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	216	return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
				217	// file.
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	218	char following_char = input_[cur_ + 1];
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	219	if (IsAsciiDigit(following_char))
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	220	return Token::INTEGER;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	221	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	222	}
				223
				224	return Token::INVALID;
				225	}
				226
				227	void Tokenizer::AdvanceToEndOfToken(const Location& location,
				228	Token::Type type) {
				229	switch (type) {
				230	case Token::INTEGER:
				231	do {
				232	Advance();
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	233	} while (!at_end() && IsAsciiDigit(cur_char()));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	234	if (!at_end()) {
				235	// Require the char after a number to be some kind of space, scope,
				236	// or operator.
				237	char c = cur_char();
				238	if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	239	!IsScoperChar(c) && c != ',') {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	240	*err_ = Err(GetCurrentLocation(),
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	241	"This is not a valid number.",
				242	"Learn to count.");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	243	// Highlight the number.
				244	err_->AppendRange(LocationRange(location, GetCurrentLocation()));
				245	}
				246	}
				247	break;
				248
				249	case Token::STRING: {
				250	char initial = cur_char();
				251	Advance(); // Advance past initial "
				252	for (;;) {
				253	if (at_end()) {
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	254	*err_ = Err(LocationRange(location, GetCurrentLocation()),
				255	"Unterminated string literal.",
				256	"Don't leave me hanging like this!");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	257	break;
				258	}
				259	if (IsCurrentStringTerminator(initial)) {
				260	Advance(); // Skip past last "
				261	break;
				262	} else if (cur_char() == '\n') {
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	263	*err_ = Err(LocationRange(location, GetCurrentLocation()),
				264	"Newline in string constant.");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	265	}
				266	Advance();
				267	}
				268	break;
				269	}
				270
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	271	case Token::UNCLASSIFIED_OPERATOR:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	272	// Some operators are two characters, some are one.
				273	if (CouldBeTwoCharOperatorBegin(cur_char())) {
				274	if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
				275	Advance();
				276	}
				277	Advance();
				278	break;
				279
				280	case Token::IDENTIFIER:
				281	while (!at_end() && IsIdentifierContinuingChar(cur_char()))
				282	Advance();
				283	break;
				284
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	285	case Token::LEFT_BRACKET:
				286	case Token::RIGHT_BRACKET:
				287	case Token::LEFT_BRACE:
				288	case Token::RIGHT_BRACE:
				289	case Token::LEFT_PAREN:
				290	case Token::RIGHT_PAREN:
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	291	case Token::DOT:
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	292	case Token::COMMA:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	293	Advance(); // All are one char.
				294	break;
				295
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	296	case Token::UNCLASSIFIED_COMMENT:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	297	// Eat to EOL.
				298	while (!at_end() && !IsCurrentNewline())
				299	Advance();
				300	break;
				301
				302	case Token::INVALID:
[email protected]	6ac871ea	2013-08-19 21:04:50	[diff] [blame]	303	default:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	304	*err_ = Err(location, "Everything is all messed up",
				305	"Please insert system disk in drive A: and press any key.");
				306	NOTREACHED();
				307	return;
				308	}
				309	}
				310
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	311	bool Tokenizer::AtStartOfLine(size_t location) const {
				312	while (location > 0) {
				313	--location;
				314	char c = input_[location];
				315	if (c == '\n')
				316	return true;
				317	if (c != ' ')
				318	return false;
				319	}
				320	return true;
				321	}
				322
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	323	bool Tokenizer::IsCurrentWhitespace() const {
				324	DCHECK(!at_end());
				325	char c = input_[cur_];
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	326	// Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
				327	return c == 0x0A \|\| c == 0x0D \|\| c == 0x20;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	328	}
				329
				330	bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
				331	DCHECK(!at_end());
				332	if (cur_char() != quote_char)
				333	return false;
				334
				335	// Check for escaping. \" is not a string terminator, but \\" is. Count
				336	// the number of preceeding backslashes.
				337	int num_backslashes = 0;
				338	for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
				339	num_backslashes++;
				340
				341	// Even backslashes mean that they were escaping each other and don't count
				342	// as escaping this quote.
				343	return (num_backslashes % 2) == 0;
				344	}
				345
				346	bool Tokenizer::IsCurrentNewline() const {
				347	return IsNewline(input_, cur_);
				348	}
				349
				350	void Tokenizer::Advance() {
				351	DCHECK(cur_ < input_.size());
				352	if (IsCurrentNewline()) {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	353	line_number_++;
				354	char_in_line_ = 1;
				355	} else {
				356	char_in_line_++;
				357	}
				358	cur_++;
				359	}
				360
				361	Location Tokenizer::GetCurrentLocation() const {
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame]	362	return Location(
				363	input_file_, line_number_, char_in_line_, static_cast<int>(cur_));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	364	}
				365
				366	Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
				367	std::string help;
				368	if (cur_char() == ';') {
				369	// Semicolon.
				370	help = "Semicolons are not needed, delete this one.";
				371	} else if (cur_char() == '\t') {
				372	// Tab.
				373	help = "You got a tab character in here. Tabs are evil. "
				374	"Convert to spaces.";
				375	} else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
				376	(input_[cur_ + 1] == '/' \|\| input_[cur_ + 1] == '*')) {
				377	// Different types of comments.
				378	help = "Comments should start with # instead";
				379	} else {
				380	help = "I have no idea what this is.";
				381	}
				382
				383	return Err(location, "Invalid token.", help);
				384	}