Blame - url/url_canon_query.cc - chromium/src.git

blob: cee8774c48924fce34212ca050f3537cbe66e9c5 [file] [log] [blame]

[email protected]	e7bba5f8	2013-04-10 20:10:52	[diff] [blame^]	1	// Copyright 2007, Google Inc.
				2	// All rights reserved.
				3	//
				4	// Redistribution and use in source and binary forms, with or without
				5	// modification, are permitted provided that the following conditions are
				6	// met:
				7	//
				8	// * Redistributions of source code must retain the above copyright
				9	// notice, this list of conditions and the following disclaimer.
				10	// * Redistributions in binary form must reproduce the above
				11	// copyright notice, this list of conditions and the following disclaimer
				12	// in the documentation and/or other materials provided with the
				13	// distribution.
				14	// * Neither the name of Google Inc. nor the names of its
				15	// contributors may be used to endorse or promote products derived from
				16	// this software without specific prior written permission.
				17	//
				18	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				19	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				20	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				21	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				22	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				23	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				24	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				25	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				26	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				27	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				28	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				29
				30	#include "googleurl/src/url_canon.h"
				31	#include "googleurl/src/url_canon_internal.h"
				32
				33	// Query canonicalization in IE
				34	// ----------------------------
				35	// IE is very permissive for query parameters specified in links on the page
				36	// (in contrast to links that it constructs itself based on form data). It does
				37	// not unescape any character. It does not reject any escape sequence (be they
				38	// invalid like "%2y" or freaky like %00).
				39	//
				40	// IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09),
				41	// LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier
				42	// layer since they are removed from all portions of the URL). All other
				43	// characters are passed unmodified. Invalid UTF-16 sequences are preserved as
				44	// well, with each character in the input being converted to UTF-8. It is the
				45	// server's job to make sense of this invalid query.
				46	//
				47	// Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page)
				48	// are converted to the invalid character and sent as unescaped UTF-8 (0xef,
				49	// 0xbf, 0xbd). This may not be canonicalization, the parser may generate these
				50	// strings before the URL handler ever sees them.
				51	//
				52	// Our query canonicalization
				53	// --------------------------
				54	// We escape all non-ASCII characters and control characters, like Firefox.
				55	// This is more conformant to the URL spec, and there do not seem to be many
				56	// problems relating to Firefox's behavior.
				57	//
				58	// Like IE, we will never unescape (although the application may want to try
				59	// unescaping to present the user with a more understandable URL). We will
				60	// replace all invalid sequences (including invalid UTF-16 sequences, which IE
				61	// doesn't) with the "invalid character," and we will escape it.
				62
				63	namespace url_canon {
				64
				65	namespace {
				66
				67	// Returns true if the characters starting at \|begin\| and going until \|end\|
				68	// (non-inclusive) are all representable in 7-bits.
				69	template<typename CHAR, typename UCHAR>
				70	bool IsAllASCII(const CHAR* spec, const url_parse::Component& query) {
				71	int end = query.end();
				72	for (int i = query.begin; i < end; i++) {
				73	if (static_cast<UCHAR>(spec[i]) >= 0x80)
				74	return false;
				75	}
				76	return true;
				77	}
				78
				79	// Appends the given string to the output, escaping characters that do not
				80	// match the given \|type\| in SharedCharTypes. This version will accept 8 or 16
				81	// bit characters, but assumes that they have only 7-bit values. It also assumes
				82	// that all UTF-8 values are correct, so doesn't bother checking
				83	template<typename CHAR>
				84	void AppendRaw8BitQueryString(const CHAR* source, int length,
				85	CanonOutput* output) {
				86	for (int i = 0; i < length; i++) {
				87	if (!IsQueryChar(static_cast<unsigned char>(source[i])))
				88	AppendEscapedChar(static_cast<unsigned char>(source[i]), output);
				89	else // Doesn't need escaping.
				90	output->push_back(static_cast<char>(source[i]));
				91	}
				92	}
				93
				94	// Runs the converter on the given UTF-8 input. Since the converter expects
				95	// UTF-16, we have to convert first. The converter must be non-NULL.
				96	void RunConverter(const char* spec,
				97	const url_parse::Component& query,
				98	CharsetConverter* converter,
				99	CanonOutput* output) {
				100	// This function will replace any misencoded values with the invalid
				101	// character. This is what we want so we don't have to check for error.
				102	RawCanonOutputW<1024> utf16;
				103	ConvertUTF8ToUTF16(&spec[query.begin], query.len, &utf16);
				104	converter->ConvertFromUTF16(utf16.data(), utf16.length(), output);
				105	}
				106
				107	// Runs the converter with the given UTF-16 input. We don't have to do
				108	// anything, but this overriddden function allows us to use the same code
				109	// for both UTF-8 and UTF-16 input.
				110	void RunConverter(const char16* spec,
				111	const url_parse::Component& query,
				112	CharsetConverter* converter,
				113	CanonOutput* output) {
				114	converter->ConvertFromUTF16(&spec[query.begin], query.len, output);
				115	}
				116
				117	template<typename CHAR, typename UCHAR>
				118	void DoConvertToQueryEncoding(const CHAR* spec,
				119	const url_parse::Component& query,
				120	CharsetConverter* converter,
				121	CanonOutput* output) {
				122	if (IsAllASCII<CHAR, UCHAR>(spec, query)) {
				123	// Easy: the input can just appended with no character set conversions.
				124	AppendRaw8BitQueryString(&spec[query.begin], query.len, output);
				125
				126	} else {
				127	// Harder: convert to the proper encoding first.
				128	if (converter) {
				129	// Run the converter to get an 8-bit string, then append it, escaping
				130	// necessary values.
				131	RawCanonOutput<1024> eight_bit;
				132	RunConverter(spec, query, converter, &eight_bit);
				133	AppendRaw8BitQueryString(eight_bit.data(), eight_bit.length(), output);
				134
				135	} else {
				136	// No converter, do our own UTF-8 conversion.
				137	AppendStringOfType(&spec[query.begin], query.len, CHAR_QUERY, output);
				138	}
				139	}
				140	}
				141
				142	template<typename CHAR, typename UCHAR>
				143	void DoCanonicalizeQuery(const CHAR* spec,
				144	const url_parse::Component& query,
				145	CharsetConverter* converter,
				146	CanonOutput* output,
				147	url_parse::Component* out_query) {
				148	if (query.len < 0) {
				149	*out_query = url_parse::Component();
				150	return;
				151	}
				152
				153	output->push_back('?');
				154	out_query->begin = output->length();
				155
				156	DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output);
				157
				158	out_query->len = output->length() - out_query->begin;
				159	}
				160
				161	} // namespace
				162
				163	void CanonicalizeQuery(const char* spec,
				164	const url_parse::Component& query,
				165	CharsetConverter* converter,
				166	CanonOutput* output,
				167	url_parse::Component* out_query) {
				168	DoCanonicalizeQuery<char, unsigned char>(spec, query, converter,
				169	output, out_query);
				170	}
				171
				172	void CanonicalizeQuery(const char16* spec,
				173	const url_parse::Component& query,
				174	CharsetConverter* converter,
				175	CanonOutput* output,
				176	url_parse::Component* out_query) {
				177	DoCanonicalizeQuery<char16, char16>(spec, query, converter,
				178	output, out_query);
				179	}
				180
				181	void ConvertUTF16ToQueryEncoding(const char16* input,
				182	const url_parse::Component& query,
				183	CharsetConverter* converter,
				184	CanonOutput* output) {
				185	DoConvertToQueryEncoding<char16, char16>(input, query,
				186	converter, output);
				187	}
				188
				189	} // namespace url_canon