Blame - chrome_frame/html_utils.cc - chromium/src.git

blob: 0f7d8ab45d0d185183c6c38cf58692199388520f [file] [log] [blame]

[email protected]	f55bd486	2010-05-27 15:38:07	[diff] [blame]	1	// Copyright (c) 2010 The Chromium Authors. All rights reserved.
[email protected]	1632eb2	2009-10-01 18:14:12	[diff] [blame]	2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4	//
				5	#include "chrome_frame/html_utils.h"
				6
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	7	#include <atlbase.h>
				8	#include <urlmon.h>
				9
[email protected]	1632eb2	2009-10-01 18:14:12	[diff] [blame]	10	#include "base/string_util.h"
				11	#include "base/string_tokenizer.h"
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	12	#include "chrome_frame/utils.h"
[email protected]	bc2ff519	2010-06-01 22:05:45	[diff] [blame]	13	#include "net/base/net_util.h"
[email protected]	e67a73f4	2010-08-31 15:05:02	[diff] [blame^]	14	#include "webkit/glue/user_agent.h"
[email protected]	1632eb2	2009-10-01 18:14:12	[diff] [blame]	15
// Characters treated as quote delimiters; HTMLScanner's constructor copies
// this into quotes_, which IsQuote() searches.
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	16	const wchar_t kQuotes[] = L"\"'";
// Header name that HasFrameBustingHeader() looks for (compared
// case-insensitively there).
[email protected]	d578d30	2009-11-19 02:25:42	[diff] [blame]	17	const char kXFrameOptionsHeader[] = "X-Frame-Options";
// The one value of the header above that HasFrameBustingHeader() does not
// count as frame busting.
				18	const char kXFrameOptionsValueAllowAll[] = "allowall";
[email protected]	1632eb2	2009-10-01 18:14:12	[diff] [blame]	19
				20	HTMLScanner::StringRange::StringRange() {
				21	}
				22
				23	HTMLScanner::StringRange::StringRange(StrPos start, StrPos end)
				24	: start_(start), end_(end) {
				25	}
				26
				27	bool HTMLScanner::StringRange::LowerCaseEqualsASCII(const char* other) const {
				28	return ::LowerCaseEqualsASCII(start_, end_, other);
				29	}
				30
				31	bool HTMLScanner::StringRange::Equals(const wchar_t* other) const {
				32	int ret = wcsncmp(&start_[0], other, end_ - start_);
				33	if (ret == 0)
				34	ret = (other[end_ - start_] == L'\0') ? 0 : -1;
				35	return ret == 0;
				36	}
				37
				38	std::wstring HTMLScanner::StringRange::Copy() const {
				39	return std::wstring(start_, end_);
				40	}
				41
				42	bool HTMLScanner::StringRange::GetTagName(std::wstring* tag_name) const {
				43	if (*start_ != L'<') {
				44	LOG(ERROR) << "Badly formatted tag found";
				45	return false;
				46	}
				47
				48	StrPos name_start = start_;
				49	name_start++;
				50	while (name_start < end_ && IsWhitespace(*name_start))
				51	name_start++;
				52
				53	if (name_start >= end_) {
				54	// We seem to have a degenerate tag (i.e. < >). Return false here.
				55	return false;
				56	}
				57
				58	StrPos name_end = name_start + 1;
				59	while (name_end < end_ && !IsWhitespace(*name_end))
				60	name_end++;
				61
				62	if (name_end > end_) {
				63	// This looks like an improperly formatted tab ('<foo'). Return false here.
				64	return false;
				65	}
				66
				67	tag_name->assign(name_start, name_end);
				68	return true;
				69	}
				70
				71
				72	bool HTMLScanner::StringRange::GetTagAttribute(const wchar_t* attribute_name,
				73	StringRange* attribute_value) const {
				74	if (NULL == attribute_name \|\| NULL == attribute_value) {
				75	NOTREACHED();
				76	return false;
				77	}
				78
				79	// Use this so we can use the convenience method LowerCaseEqualsASCII()
				80	// from string_util.h.
				81	std::string search_name_ascii(WideToASCII(attribute_name));
				82
				83	WStringTokenizer tokenizer(start_, end_, L" =/");
				84	tokenizer.set_options(WStringTokenizer::RETURN_DELIMS);
				85
				86	// Set up the quote chars so that we get quoted attribute values as single
				87	// tokens.
				88	tokenizer.set_quote_chars(L"\"'");
				89
				90	const bool PARSE_STATE_NAME = true;
				91	const bool PARSE_STATE_VALUE = false;
				92	bool parse_state = PARSE_STATE_NAME;
				93
				94	// Used to skip the first token, which is the tag name.
				95	bool first_token_skipped = false;
				96
				97	// This is set during a loop iteration in which an '=' sign was spotted.
				98	// It is used to filter out degenerate tags such as:
				99	// <meta foo==bar>
				100	bool last_token_was_delim = false;
				101
				102	// Set this if the attribute name has been found that we might then
				103	// pick up the value in the next loop iteration.
				104	bool attribute_name_found = false;
				105
				106	while (tokenizer.GetNext()) {
				107	// If we have a whitespace delimiter, just keep going. Cases of this should
				108	// be reduced by the CollapseWhitespace call. If we have an '=' character,
				109	// we update our state and reiterate.
				110	if (tokenizer.token_is_delim()) {
				111	if (*tokenizer.token_begin() == L'=') {
				112	if (last_token_was_delim) {
				113	// Looks like we have a badly formed tag, just stop parsing now.
				114	return false;
				115	}
				116	parse_state = !parse_state;
				117	last_token_was_delim = true;
				118	}
				119	continue;
				120	}
				121
				122	last_token_was_delim = false;
				123
				124	// The first non-delimiter token is the tag name, which we don't want.
				125	if (!first_token_skipped) {
				126	first_token_skipped = true;
				127	continue;
				128	}
				129
				130	if (PARSE_STATE_NAME == parse_state) {
				131	// We have a tag name, check to see if it matches our target name:
				132	if (::LowerCaseEqualsASCII(tokenizer.token_begin(), tokenizer.token_end(),
				133	search_name_ascii.c_str())) {
				134	attribute_name_found = true;
				135	continue;
				136	}
				137	} else if (PARSE_STATE_VALUE == parse_state && attribute_name_found) {
				138	attribute_value->start_ = tokenizer.token_begin();
				139	attribute_value->end_ = tokenizer.token_end();
				140
				141	// Unquote the attribute value if need be.
				142	attribute_value->UnQuote();
				143
				144	return true;
				145	} else if (PARSE_STATE_VALUE == parse_state) {
				146	// If we haven't found the attribute name we want yet, ignore this token
				147	// and go back to looking for our name.
				148	parse_state = PARSE_STATE_NAME;
				149	}
				150	}
				151
				152	return false;
				153	}
				154
				155	bool HTMLScanner::StringRange::UnQuote() {
				156	if (start_ + 2 > end_) {
				157	// String's too short to be quoted, bail.
				158	return false;
				159	}
				160
				161	if ((start_ == L'\'' && (end_ - 1) == L'\'') \|\|
				162	(start_ == L'"' && (end_ - 1) == L'"')) {
				163	start_ = start_ + 1;
				164	end_ = end_ - 1;
				165	return true;
				166	}
				167
				168	return false;
				169	}
				170
				171	HTMLScanner::HTMLScanner(const wchar_t* html_string)
				172	: html_string_(CollapseWhitespace(html_string, true)),
				173	quotes_(kQuotes) {
				174	}
				175
				176	void HTMLScanner::GetTagsByName(const wchar_t* name, StringRangeList* tag_list,
				177	const wchar_t* stop_tag) {
				178	DCHECK(NULL != name);
				179	DCHECK(NULL != tag_list);
				180	DCHECK(NULL != stop_tag);
				181
				182	StringRange remaining_html(html_string_.begin(), html_string_.end());
				183
				184	std::wstring search_name(name);
				185	TrimWhitespace(search_name, TRIM_ALL, &search_name);
				186
				187	// Use this so we can use the convenience method LowerCaseEqualsASCII()
				188	// from string_util.h.
				189	std::string search_name_ascii(WideToASCII(search_name));
				190	std::string stop_tag_ascii(WideToASCII(stop_tag));
				191
				192	StringRange current_tag;
				193	std::wstring current_name;
				194	while (NextTag(&remaining_html, &current_tag)) {
				195	if (current_tag.GetTagName(&current_name)) {
				196	if (LowerCaseEqualsASCII(current_name, search_name_ascii.c_str())) {
				197	tag_list->push_back(current_tag);
				198	} else if (LowerCaseEqualsASCII(current_name, stop_tag_ascii.c_str())) {
				199	// We hit the stop tag so it's time to go home.
				200	break;
				201	}
				202	}
				203	}
				204	}
				205
				// Per-tag scanning state used by HTMLScanner::NextTag() while it looks for
				// the '>' that closes a non-comment tag.
				struct ScanState {
				  // True while the scan position is inside a quoted string.
				  bool in_quote;
				  // True when the previous character inside a quoted string was a backslash.
				  bool in_escape;
				  // The character most recently tested as a possible opening quote; while
				  // |in_quote| is true this is the character that closes the string.
				  wchar_t quote_char;
				  // All members start in a defined state; |quote_char| used to be left
				  // uninitialized.
				  ScanState() : in_quote(false), in_escape(false), quote_char(L'\0') {}
				};
				212
				213	bool HTMLScanner::IsQuote(wchar_t c) {
				214	return quotes_.find(c) != std::wstring::npos;
				215	}
				216
				217	bool HTMLScanner::IsHTMLCommentClose(StringRange* html_string, StrPos pos) {
				218	if (pos < html_string->end_ && pos > html_string->start_ + 2 &&
				219	*pos == L'>') {
				220	return (pos-1) == L'-' && (pos-2) == L'-';
				221	}
				222	return false;
				223	}
				224
				225	bool HTMLScanner::NextTag(StringRange* html_string, StringRange* tag) {
				226	DCHECK(NULL != html_string);
				227	DCHECK(NULL != tag);
				228
				229	tag->start_ = html_string->start_;
				230	while (tag->start_ < html_string->end_ && *tag->start_ != L'<') {
				231	tag->start_++;
				232	}
				233
				234	// we went past the end of the string.
				235	if (tag->start_ >= html_string->end_) {
				236	return false;
				237	}
				238
				239	tag->end_ = tag->start_ + 1;
				240
				241	// Get the tag name to see if we are in an HTML comment. If we are, then
				242	// don't consider quotes. This should work for example:
				243	// <!-- foo ' --> <meta foo='bar'>
				244	std::wstring tag_name;
				245	StringRange start_range(tag->start_, html_string->end_);
				246	start_range.GetTagName(&tag_name);
				247	if (StartsWith(tag_name, L"!--", true)) {
				248	// We're inside a comment tag, keep going until we get out of it.
				249	while (tag->end_ < html_string->end_ &&
				250	!IsHTMLCommentClose(html_string, tag->end_)) {
				251	tag->end_++;
				252	}
				253	} else {
				254	// Properly handle quoted strings within non-comment tags by maintaining
				255	// some state while scanning. Specifically, we have to maintain state on
				256	// whether we are inside a string, what the string terminating character
				257	// will be and whether we are inside an escape sequence.
				258	ScanState state;
				259	while (tag->end_ < html_string->end_) {
				260	if (state.in_quote) {
				261	if (state.in_escape) {
				262	state.in_escape = false;
				263	} else if (*tag->end_ == '\\') {
				264	state.in_escape = true;
				265	} else if (*tag->end_ == state.quote_char) {
				266	state.in_quote = false;
				267	}
				268	} else {
				269	state.in_quote = IsQuote(state.quote_char = *tag->end_);
				270	}
				271
				272	if (!state.in_quote && *tag->end_ == L'>') {
				273	break;
				274	}
				275	tag->end_++;
				276	}
				277	}
				278
				279	// We hit the end_ but found no matching tag closure. Consider this an
				280	// incomplete tag and do not report it.
				281	if (tag->end_ >= html_string->end_)
				282	return false;
				283
				284	// Modify html_string to point to just beyond the end_ of the current tag.
				285	html_string->start_ = tag->end_ + 1;
				286
				287	return true;
				288	}
				289
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	290	namespace http_utils {
				291
				// Product token: GetChromeFrameUserAgent() uses it as the name part of the
				// "<name>/<version>" string it builds, and AddChromeFrameToUserAgentValue()
				// searches for it to detect a value that is already tagged.
				292	const char kChromeFrameUserAgent[] = "chromeframe";
				// Storage for the string returned by GetChromeFrameUserAgent(); starts
				// zero-filled and is populated the first time that function runs.
[email protected]	e67a73f4	2010-08-31 15:05:02	[diff] [blame^]	293	static char g_cf_user_agent[100] = {0};
				// Storage for the string returned by GetChromeUserAgent(); starts
				// zero-filled and is populated the first time that function runs.
				294	static char g_chrome_user_agent[255] = {0};
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	295
				296	const char* GetChromeFrameUserAgent() {
[email protected]	e67a73f4	2010-08-31 15:05:02	[diff] [blame^]	297	if (!g_cf_user_agent[0]) {
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	298	_pAtlModule->m_csStaticDataInitAndTypeInfo.Lock();
[email protected]	e67a73f4	2010-08-31 15:05:02	[diff] [blame^]	299	if (!g_cf_user_agent[0]) {
[email protected]	a38056a	2010-01-22 21:36:41	[diff] [blame]	300	uint32 high_version = 0, low_version = 0;
				301	GetModuleVersion(reinterpret_cast<HMODULE>(&__ImageBase), &high_version,
				302	&low_version);
[email protected]	e67a73f4	2010-08-31 15:05:02	[diff] [blame^]	303	wsprintfA(g_cf_user_agent, "%s/%i.%i.%i.%i", kChromeFrameUserAgent,
[email protected]	a38056a	2010-01-22 21:36:41	[diff] [blame]	304	HIWORD(high_version), LOWORD(high_version),
				305	HIWORD(low_version), LOWORD(low_version));
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	306	}
				307	_pAtlModule->m_csStaticDataInitAndTypeInfo.Unlock();
				308	}
[email protected]	e67a73f4	2010-08-31 15:05:02	[diff] [blame^]	309	return g_cf_user_agent;
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	310	}
				311
				312	std::string AddChromeFrameToUserAgentValue(const std::string& value) {
				313	if (value.empty()) {
				314	DLOG(WARNING) << "empty user agent value";
				315	return "";
				316	}
				317
				318	DCHECK_EQ(false, StartsWithASCII(value, "User-Agent:", true));
				319
				320	if (value.find(kChromeFrameUserAgent) != std::string::npos) {
				321	// Our user agent has already been added.
				322	return value;
				323	}
				324
				325	std::string ret(value);
				326	ret += " ";
				327	ret += GetChromeFrameUserAgent();
				328
				329	return ret;
				330	}
				331
				332	std::string GetDefaultUserAgentHeaderWithCFTag() {
				333	std::string ua(GetDefaultUserAgent());
				334	return "User-Agent: " + AddChromeFrameToUserAgentValue(ua);
				335	}
				336
[email protected]	e67a73f4	2010-08-31 15:05:02	[diff] [blame^]	337	const char* GetChromeUserAgent() {
				338	if (!g_chrome_user_agent[0]) {
				339	_pAtlModule->m_csStaticDataInitAndTypeInfo.Lock();
				340	if (!g_chrome_user_agent[0]) {
				341	std::string ua;
				342	webkit_glue::BuildUserAgent(false, &ua);
				343	DCHECK(ua.length() < arraysize(g_chrome_user_agent));
				344	lstrcpynA(g_chrome_user_agent, ua.c_str(),
				345	arraysize(g_chrome_user_agent) - 1);
				346	}
				347	_pAtlModule->m_csStaticDataInitAndTypeInfo.Unlock();
				348	}
				349	return g_chrome_user_agent;
				350	}
				351
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	352	std::string GetDefaultUserAgent() {
				353	std::string ret;
[email protected]	bc2ff519	2010-06-01 22:05:45	[diff] [blame]	354	DWORD size = MAX_PATH;
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	355	HRESULT hr = E_OUTOFMEMORY;
				356	for (int retries = 1; hr == E_OUTOFMEMORY && retries <= 10; ++retries) {
				357	hr = ::ObtainUserAgentString(0, WriteInto(&ret, size + 1), &size);
				358	if (hr == E_OUTOFMEMORY) {
				359	size = MAX_PATH * retries;
				360	} else if (SUCCEEDED(hr)) {
				361	// Truncate the extra allocation.
[email protected]	bc2ff519	2010-06-01 22:05:45	[diff] [blame]	362	DCHECK_GT(size, 0U);
				363	ret.resize(size - 1);
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	364	}
				365	}
				366
				367	if (FAILED(hr)) {
				368	NOTREACHED() << StringPrintf("ObtainUserAgentString==0x%08X", hr);
[email protected]	f55bd486	2010-05-27 15:38:07	[diff] [blame]	369	return std::string();
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	370	}
				371
				372	return ret;
				373	}
				374
[email protected]	d578d30	2009-11-19 02:25:42	[diff] [blame]	375	bool HasFrameBustingHeader(const std::string& http_headers) {
[email protected]	bc2ff519	2010-06-01 22:05:45	[diff] [blame]	376	// NOTE: We cannot use net::GetSpecificHeader() here since when there are
				377	// multiple instances of a header that returns the first value seen, and we
				378	// need to look at all instances.
				379	net::HttpUtil::HeadersIterator it(http_headers.begin(), http_headers.end(),
				380	"\r\n");
[email protected]	d578d30	2009-11-19 02:25:42	[diff] [blame]	381	while (it.GetNext()) {
[email protected]	bc2ff519	2010-06-01 22:05:45	[diff] [blame]	382	if (!lstrcmpiA(it.name().c_str(), kXFrameOptionsHeader) &&
				383	lstrcmpiA(it.values().c_str(), kXFrameOptionsValueAllowAll))
				384	return true;
[email protected]	d578d30	2009-11-19 02:25:42	[diff] [blame]	385	}
[email protected]	d578d30	2009-11-19 02:25:42	[diff] [blame]	386	return false;
				387	}
				388
[email protected]	3f55e87	2009-10-17 04:48:37	[diff] [blame]	389	} // namespace http_utils