Blame - tools/gn/tokenizer.cc - chromium/src

blob: b4f364f2a6e382125abcd5a9e852cc6efd8afa83 [file] [log] [blame]

[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	1	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "tools/gn/tokenizer.h"
				6
				7	#include "base/logging.h"
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame^]	8	#include "base/strings/string_util.h"
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	9	#include "tools/gn/input_file.h"
				10
				11	namespace {
				12
// Returns true when |c| can be the first character of a two-character
// operator such as "<=", "!=", "+=", "&&" or "||".
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
				17
				18	bool CouldBeTwoCharOperatorEnd(char c) {
				19	return c == '=' \|\| c == '\|' \|\| c == '&';
				20	}
				21
				22	bool CouldBeOneCharOperator(char c) {
				23	return c == '=' \|\| c == '<' \|\| c == '>' \|\| c == '+' \|\| c == '!' \|\|
				24	c == ':' \|\| c == '\|' \|\| c == '&' \|\| c == '-';
				25	}
				26
				27	bool CouldBeOperator(char c) {
				28	return CouldBeOneCharOperator(c) \|\| CouldBeTwoCharOperatorBegin(c);
				29	}
				30
// Returns true when |c| is one of the bracketing characters: parentheses,
// square brackets or curly braces.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
				34
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	35	Token::Type GetSpecificOperatorType(base::StringPiece value) {
				36	if (value == "=")
				37	return Token::EQUAL;
				38	if (value == "+")
				39	return Token::PLUS;
				40	if (value == "-")
				41	return Token::MINUS;
				42	if (value == "+=")
				43	return Token::PLUS_EQUALS;
				44	if (value == "-=")
				45	return Token::MINUS_EQUALS;
				46	if (value == "==")
				47	return Token::EQUAL_EQUAL;
				48	if (value == "!=")
				49	return Token::NOT_EQUAL;
				50	if (value == "<=")
				51	return Token::LESS_EQUAL;
				52	if (value == ">=")
				53	return Token::GREATER_EQUAL;
				54	if (value == "<")
				55	return Token::LESS_THAN;
				56	if (value == ">")
				57	return Token::GREATER_THAN;
				58	if (value == "&&")
				59	return Token::BOOLEAN_AND;
				60	if (value == "\|\|")
				61	return Token::BOOLEAN_OR;
				62	if (value == "!")
				63	return Token::BANG;
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	64	if (value == ".")
				65	return Token::DOT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	66	return Token::INVALID;
				67	}
				68
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	69	} // namespace
				70
				71	Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
				72	: input_file_(input_file),
				73	input_(input_file->contents()),
				74	err_(err),
				75	cur_(0),
				76	line_number_(1),
				77	char_in_line_(1) {
				78	}
				79
				80	Tokenizer::~Tokenizer() {
				81	}
				82
				83	// static
				84	std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
				85	Tokenizer t(input_file, err);
				86	return t.Run();
				87	}
				88
				89	std::vector<Token> Tokenizer::Run() {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	90	DCHECK(tokens_.empty());
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	91	while (!done()) {
				92	AdvanceToNextToken();
				93	if (done())
				94	break;
				95	Location location = GetCurrentLocation();
				96
				97	Token::Type type = ClassifyCurrent();
				98	if (type == Token::INVALID) {
				99	*err_ = GetErrorForInvalidToken(location);
				100	break;
				101	}
				102	size_t token_begin = cur_;
				103	AdvanceToEndOfToken(location, type);
				104	if (has_error())
				105	break;
				106	size_t token_end = cur_;
				107
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	108	base::StringPiece token_value(&input_.data()[token_begin],
				109	token_end - token_begin);
				110
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame^]	111	if (type == Token::UNCLASSIFIED_OPERATOR) {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	112	type = GetSpecificOperatorType(token_value);
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame^]	113	} else if (type == Token::IDENTIFIER) {
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	114	if (token_value == "if")
				115	type = Token::IF;
[email protected]	ed7d2be2	2013-08-20 17:23:15	[diff] [blame]	116	else if (token_value == "else")
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	117	type = Token::ELSE;
[email protected]	ed7d2be2	2013-08-20 17:23:15	[diff] [blame]	118	else if (token_value == "true")
				119	type = Token::TRUE_TOKEN;
				120	else if (token_value == "false")
				121	type = Token::FALSE_TOKEN;
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame^]	122	} else if (type == Token::UNCLASSIFIED_COMMENT) {
				123	if (AtStartOfLine(token_begin))
				124	type = Token::LINE_COMMENT;
				125	else
				126	type = Token::SUFFIX_COMMENT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	127	}
				128
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame^]	129	tokens_.push_back(Token(location, type, token_value));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	130	}
				131	if (err_->has_error())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	132	tokens_.clear();
				133	return tokens_;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	134	}
				135
				136	// static
				137	size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	138	DCHECK_GT(n, 0);
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	139
				140	if (n == 1)
				141	return 0;
				142
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	143	int cur_line = 1;
				144	size_t cur_byte = 0;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	145	while (cur_byte < buf.size()) {
				146	if (IsNewline(buf, cur_byte)) {
				147	cur_line++;
				148	if (cur_line == n)
				149	return cur_byte + 1;
				150	}
				151	cur_byte++;
				152	}
[email protected]	481c3e8	2014-07-18 01:40:47	[diff] [blame]	153	return static_cast<size_t>(-1);
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	154	}
				155
				156	// static
				157	bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
				158	DCHECK(offset < buffer.size());
				159	// We may need more logic here to handle different line ending styles.
				160	return buffer[offset] == '\n';
				161	}
				162
				163
				164	void Tokenizer::AdvanceToNextToken() {
				165	while (!at_end() && IsCurrentWhitespace())
				166	Advance();
				167	}
				168
				169	Token::Type Tokenizer::ClassifyCurrent() const {
				170	DCHECK(!at_end());
				171	char next_char = cur_char();
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	172	if (IsAsciiDigit(next_char))
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	173	return Token::INTEGER;
				174	if (next_char == '"')
				175	return Token::STRING;
				176
				177	// Note: '-' handled specially below.
				178	if (next_char != '-' && CouldBeOperator(next_char))
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	179	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	180
				181	if (IsIdentifierFirstChar(next_char))
				182	return Token::IDENTIFIER;
				183
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	184	if (next_char == '[')
				185	return Token::LEFT_BRACKET;
				186	if (next_char == ']')
				187	return Token::RIGHT_BRACKET;
				188	if (next_char == '(')
				189	return Token::LEFT_PAREN;
				190	if (next_char == ')')
				191	return Token::RIGHT_PAREN;
				192	if (next_char == '{')
				193	return Token::LEFT_BRACE;
				194	if (next_char == '}')
				195	return Token::RIGHT_BRACE;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	196
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	197	if (next_char == '.')
				198	return Token::DOT;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	199	if (next_char == ',')
				200	return Token::COMMA;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	201
				202	if (next_char == '#')
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame^]	203	return Token::UNCLASSIFIED_COMMENT;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	204
				205	// For the case of '-' differentiate between a negative number and anything
				206	// else.
				207	if (next_char == '-') {
				208	if (!CanIncrement())
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	209	return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
				210	// file.
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	211	char following_char = input_[cur_ + 1];
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	212	if (IsAsciiDigit(following_char))
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	213	return Token::INTEGER;
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	214	return Token::UNCLASSIFIED_OPERATOR;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	215	}
				216
				217	return Token::INVALID;
				218	}
				219
				220	void Tokenizer::AdvanceToEndOfToken(const Location& location,
				221	Token::Type type) {
				222	switch (type) {
				223	case Token::INTEGER:
				224	do {
				225	Advance();
[email protected]	8ae5d42	2014-05-13 20:17:42	[diff] [blame]	226	} while (!at_end() && IsAsciiDigit(cur_char()));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	227	if (!at_end()) {
				228	// Require the char after a number to be some kind of space, scope,
				229	// or operator.
				230	char c = cur_char();
				231	if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	232	!IsScoperChar(c) && c != ',') {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	233	*err_ = Err(GetCurrentLocation(),
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	234	"This is not a valid number.",
				235	"Learn to count.");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	236	// Highlight the number.
				237	err_->AppendRange(LocationRange(location, GetCurrentLocation()));
				238	}
				239	}
				240	break;
				241
				242	case Token::STRING: {
				243	char initial = cur_char();
				244	Advance(); // Advance past initial "
				245	for (;;) {
				246	if (at_end()) {
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	247	*err_ = Err(LocationRange(location, GetCurrentLocation()),
				248	"Unterminated string literal.",
				249	"Don't leave me hanging like this!");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	250	break;
				251	}
				252	if (IsCurrentStringTerminator(initial)) {
				253	Advance(); // Skip past last "
				254	break;
				255	} else if (cur_char() == '\n') {
[email protected]	df15e82d	2014-05-15 19:41:58	[diff] [blame]	256	*err_ = Err(LocationRange(location, GetCurrentLocation()),
				257	"Newline in string constant.");
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	258	}
				259	Advance();
				260	}
				261	break;
				262	}
				263
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	264	case Token::UNCLASSIFIED_OPERATOR:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	265	// Some operators are two characters, some are one.
				266	if (CouldBeTwoCharOperatorBegin(cur_char())) {
				267	if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
				268	Advance();
				269	}
				270	Advance();
				271	break;
				272
				273	case Token::IDENTIFIER:
				274	while (!at_end() && IsIdentifierContinuingChar(cur_char()))
				275	Advance();
				276	break;
				277
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	278	case Token::LEFT_BRACKET:
				279	case Token::RIGHT_BRACKET:
				280	case Token::LEFT_BRACE:
				281	case Token::RIGHT_BRACE:
				282	case Token::LEFT_PAREN:
				283	case Token::RIGHT_PAREN:
[email protected]	51d0172	2014-03-26 16:57:07	[diff] [blame]	284	case Token::DOT:
[email protected]	f38ddec	2013-08-15 23:59:11	[diff] [blame]	285	case Token::COMMA:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	286	Advance(); // All are one char.
				287	break;
				288
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame^]	289	case Token::UNCLASSIFIED_COMMENT:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	290	// Eat to EOL.
				291	while (!at_end() && !IsCurrentNewline())
				292	Advance();
				293	break;
				294
				295	case Token::INVALID:
[email protected]	6ac871ea	2013-08-19 21:04:50	[diff] [blame]	296	default:
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	297	*err_ = Err(location, "Everything is all messed up",
				298	"Please insert system disk in drive A: and press any key.");
				299	NOTREACHED();
				300	return;
				301	}
				302	}
				303
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame^]	304	bool Tokenizer::AtStartOfLine(size_t location) const {
				305	while (location > 0) {
				306	--location;
				307	char c = input_[location];
				308	if (c == '\n')
				309	return true;
				310	if (c != ' ')
				311	return false;
				312	}
				313	return true;
				314	}
				315
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	316	bool Tokenizer::IsCurrentWhitespace() const {
				317	DCHECK(!at_end());
				318	char c = input_[cur_];
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame^]	319	// Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
				320	return c == 0x0A \|\| c == 0x0D \|\| c == 0x20;
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	321	}
				322
				323	bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
				324	DCHECK(!at_end());
				325	if (cur_char() != quote_char)
				326	return false;
				327
				328	// Check for escaping. \" is not a string terminator, but \\" is. Count
				329	// the number of preceeding backslashes.
				330	int num_backslashes = 0;
				331	for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
				332	num_backslashes++;
				333
				334	// Even backslashes mean that they were escaping each other and don't count
				335	// as escaping this quote.
				336	return (num_backslashes % 2) == 0;
				337	}
				338
				339	bool Tokenizer::IsCurrentNewline() const {
				340	return IsNewline(input_, cur_);
				341	}
				342
				343	void Tokenizer::Advance() {
				344	DCHECK(cur_ < input_.size());
				345	if (IsCurrentNewline()) {
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	346	line_number_++;
				347	char_in_line_ = 1;
				348	} else {
				349	char_in_line_++;
				350	}
				351	cur_++;
				352	}
				353
				354	Location Tokenizer::GetCurrentLocation() const {
scottmg	be5d451	2014-09-24 02:29:12	[diff] [blame^]	355	return Location(
				356	input_file_, line_number_, char_in_line_, static_cast<int>(cur_));
[email protected]	96ea63d	2013-07-30 10:17:07	[diff] [blame]	357	}
				358
				359	Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
				360	std::string help;
				361	if (cur_char() == ';') {
				362	// Semicolon.
				363	help = "Semicolons are not needed, delete this one.";
				364	} else if (cur_char() == '\t') {
				365	// Tab.
				366	help = "You got a tab character in here. Tabs are evil. "
				367	"Convert to spaces.";
				368	} else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
				369	(input_[cur_ + 1] == '/' \|\| input_[cur_ + 1] == '*')) {
				370	// Different types of comments.
				371	help = "Comments should start with # instead";
				372	} else {
				373	help = "I have no idea what this is.";
				374	}
				375
				376	return Err(location, "Invalid token.", help);
				377	}