Blame - base/string_tokenizer.h - chromium/src.git

blob: f4f4c28f4bcbd9d4baee9ec6ee58465e2d0cba05 [file] [log] [blame]

license.bot	bf09a50	2008-08-24 00:55:55	[diff] [blame]	1	// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
initial.commit	d7cae12	2008-07-26 21:49:38	[diff] [blame]	4
[email protected]	036d877	2008-09-06 01:00:53	[diff] [blame]	5	#ifndef BASE_STRING_TOKENIZER_H_
				6	#define BASE_STRING_TOKENIZER_H_
initial.commit	d7cae12	2008-07-26 21:49:38	[diff] [blame]	7
				8	#include <string>
				9
				10	// StringTokenizerT is a simple string tokenizer class. It works like an
				11	// iterator that with each step (see the GetNext method) updates members that
				12	// refer to the next token in the input string. The user may optionally
				13	// configure the tokenizer to return delimiters.
				14	//
				15	// EXAMPLE 1:
				16	//
				17	// std::string input = "this is a test";
				18	// StringTokenizer t(input, " ");
				19	// while (t.GetNext()) {
				20	// printf("%s\n", t.token().c_str());
				21	// }
				22	//
				23	// Output:
				24	//
				25	// this
				26	// is
				27	// a
				28	// test
				29	//
				30	// EXAMPLE 2:
				31	//
				32	// std::string input = "no-cache=\"foo, bar\", private";
				33	// StringTokenizer t(input, ", ");
				34	// t.set_quote_chars("\"");
				35	// while (t.GetNext()) {
				36	// printf("%s\n", t.token().c_str());
				37	// }
				38	//
				39	// Output:
				40	//
				41	// no-cache="foo, bar"
				42	// private
				43	//
				44	//
				45	// EXAMPLE 3:
				46	//
				47	// bool next_is_option = false, next_is_value = false;
				48	// std::string input = "text/html; charset=UTF-8; foo=bar";
				49	// StringTokenizer t(input, "; =");
				50	// t.set_options(StringTokenizer::RETURN_DELIMS);
				51	// while (t.GetNext()) {
				52	// if (t.token_is_delim()) {
				53	// switch (*t.token_begin()) {
				54	// case ';':
				55	// next_is_option = true;
				56	// break;
				57	// case '=':
				58	// next_is_value = true;
				59	// break;
				60	// }
				61	// } else {
				62	// const char* label;
				63	// if (next_is_option) {
				64	// label = "option-name";
				65	// next_is_option = false;
				66	// } else if (next_is_value) {
				67	// label = "option-value";
				68	// next_is_value = false;
				69	// } else {
				70	// label = "mime-type";
				71	// }
				72	// printf("%s: %s\n", label, t.token().c_str());
				73	// }
				74	// }
				75	//
				76	//
// Generic tokenizer over a character range.
//
//   str             the string type used for delimiters, quote characters and
//                   returned tokens (e.g. std::string, std::wstring).
//   const_iterator  the iterator type used to walk the input range.
//
// The tokenizer does NOT copy its input: it only stores iterators into it.
// The input range must therefore outlive the tokenizer. In particular, do
// not construct a tokenizer from a temporary string.
template <class str, class const_iterator>
class StringTokenizerT {
 public:
  typedef typename str::value_type char_type;

  // Options that may be passed to set_options().
  enum {
    // Specifies the delimiters should be returned as tokens.
    RETURN_DELIMS = 1 << 0,
  };

  // Tokenizes |string|, splitting on any character contained in |delims|.
  // |delims| is copied; |string| is not and must outlive this object.
  StringTokenizerT(const str& string,
                   const str& delims) {
    Init(string.begin(), string.end(), delims);
  }

  // Tokenizes the range [string_begin, string_end), splitting on any
  // character contained in |delims|. |delims| is copied; the range is not
  // and must remain valid for as long as this object is used.
  StringTokenizerT(const_iterator string_begin,
                   const_iterator string_end,
                   const str& delims) {
    Init(string_begin, string_end, delims);
  }

  // Set the options for this tokenizer. By default, this is 0.
  void set_options(int options) { options_ = options; }

  // Set the characters to regard as quotes. By default, this is empty. When
  // a quote char is encountered, the tokenizer will switch into a mode where
  // it ignores delimiters that it finds. It switches out of this mode once it
  // finds another instance of the same quote char. If a backslash is
  // encountered within a quoted string, then the next character is skipped.
  void set_quote_chars(const str& quotes) { quotes_ = quotes; }

  // Advances the tokenizer to the next token. Returns false once the input
  // is exhausted, true otherwise. This method must be called (and must have
  // returned true) before the token* accessors yield a meaningful token.
  //
  // Without RETURN_DELIMS, delimiter characters are skipped silently. With
  // RETURN_DELIMS, each delimiter character is reported as its own
  // one-character token, for which token_is_delim() returns true.
  bool GetNext() {
    // Quote/escape tracking starts fresh for every token.
    AdvanceState state;
    token_is_delim_ = false;
    for (;;) {
      token_begin_ = token_end_;
      if (token_end_ == end_)
        return false;
      ++token_end_;
      if (AdvanceOne(&state, *token_begin_))
        break;  // Found the first character of a regular token.
      if (options_ & RETURN_DELIMS) {
        token_is_delim_ = true;
        return true;
      }
      // else skip over delim
    }
    // Extend the token until the next unquoted delimiter or end of input.
    while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
      ++token_end_;
    return true;
  }

  // Returns true if token is a delimiter. When the tokenizer is configured
  // with the RETURN_DELIMS option, this method can be used to check if the
  // returned token is actually a delimiter. Returns false before the first
  // call to GetNext().
  bool token_is_delim() const { return token_is_delim_; }

  // If GetNext() returned true, then these methods may be used to read the
  // value of the token. Before the first call to GetNext() they describe an
  // empty range positioned at the start of the input.
  const_iterator token_begin() const { return token_begin_; }
  const_iterator token_end() const { return token_end_; }
  str token() const { return str(token_begin_, token_end_); }

 private:
  // Shared constructor logic. Puts every member into a well-defined state so
  // that the accessors are safe to call even before the first GetNext().
  void Init(const_iterator string_begin,
            const_iterator string_end,
            const str& delims) {
    token_begin_ = string_begin;
    token_end_ = string_begin;
    end_ = string_end;
    delims_ = delims;
    options_ = 0;
    token_is_delim_ = false;
  }

  // Returns true if |c| is one of the configured delimiter characters.
  bool IsDelim(char_type c) const {
    return delims_.find(c) != str::npos;
  }

  // Returns true if |c| is one of the configured quote characters.
  bool IsQuote(char_type c) const {
    return quotes_.find(c) != str::npos;
  }

  // Per-token scanning state used by AdvanceOne().
  struct AdvanceState {
    bool in_quote;         // Currently inside a quoted section.
    bool in_escape;        // Previous char was a backslash inside a quote.
    char_type quote_char;  // The quote char that opened the current section.
    AdvanceState() : in_quote(false), in_escape(false), quote_char() {}
  };

  // Feeds one character into the scanner.
  // Returns true if a delimiter was not hit.
  bool AdvanceOne(AdvanceState* state, char_type c) {
    if (state->in_quote) {
      if (state->in_escape) {
        // The escaped character is consumed verbatim.
        state->in_escape = false;
      } else if (c == '\\') {
        state->in_escape = true;
      } else if (c == state->quote_char) {
        // Only the same quote char that opened the section closes it.
        state->in_quote = false;
      }
    } else {
      if (IsDelim(c))
        return false;
      // Remember |c| so that, if it opens a quoted section, the matching
      // closing quote can be recognized later.
      state->quote_char = c;
      state->in_quote = IsQuote(c);
    }
    return true;
  }

  const_iterator token_begin_;  // Start of the current token.
  const_iterator token_end_;    // One past the end of the current token.
  const_iterator end_;          // End of the input range.
  str delims_;                  // Characters treated as delimiters.
  str quotes_;                  // Characters treated as quotes.
  int options_;                 // Bitmask of the option enum values above.
  bool token_is_delim_;         // True if the current token is a delimiter.
};
				195
[email protected]	036d877	2008-09-06 01:00:53	[diff] [blame]	196	typedef StringTokenizerT<std::string, std::string::const_iterator>
				197	StringTokenizer;
				198	typedef StringTokenizerT<std::wstring, std::wstring::const_iterator>
				199	WStringTokenizer;
				200	typedef StringTokenizerT<std::string, const char*> CStringTokenizer;
initial.commit	d7cae12	2008-07-26 21:49:38	[diff] [blame]	201
[email protected]	036d877	2008-09-06 01:00:53	[diff] [blame]	202	#endif // BASE_STRING_TOKENIZER_H_
license.bot	bf09a50	2008-08-24 00:55:55	[diff] [blame]	203