Blame - components/safe_browsing_db/util.cc - chromium/src.git

blob: 18dd9f88daebe45d854ea8fc152f0328c23dab86 [file] [log] [blame]

vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	1	// Copyright (c) 2015 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
				4
				5	#include "components/safe_browsing_db/util.h"
				6
avi	f57136c1	2015-12-25 23:27:45	[diff] [blame]	7	#include <stddef.h>
				8
				9	#include "base/macros.h"
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	10	#include "base/strings/string_util.h"
joenotcharles	a6c026ad	2016-01-13 16:10:09	[diff] [blame]	11	#include "base/trace_event/trace_event.h"
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	12	#include "crypto/sha2.h"
				13	#include "net/base/escape.h"
				14	#include "url/gurl.h"
				15	#include "url/url_util.h"
				16
vakh	9a474d83	2015-11-13 01:43:09	[diff] [blame]	17	namespace safe_browsing {
				18
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	19	// Utility functions -----------------------------------------------------------
				20
				21	namespace {
				22	bool IsKnownList(const std::string& name) {
vakh	9a474d83	2015-11-13 01:43:09	[diff] [blame]	23	for (size_t i = 0; i < arraysize(kAllLists); ++i) {
				24	if (!strcmp(kAllLists[i], name.c_str())) {
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	25	return true;
				26	}
				27	}
				28	return false;
				29	}
				30	} // namespace
				31
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	32	// SBCachedFullHashResult ------------------------------------------------------
				33
				34	SBCachedFullHashResult::SBCachedFullHashResult() {}
				35
				36	SBCachedFullHashResult::SBCachedFullHashResult(
				37	const base::Time& in_expire_after)
				38	: expire_after(in_expire_after) {}
				39
vmpstr	b6449d51	2016-02-25 23:55:40	[diff] [blame^]	40	SBCachedFullHashResult::SBCachedFullHashResult(
				41	const SBCachedFullHashResult& other) = default;
				42
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	43	SBCachedFullHashResult::~SBCachedFullHashResult() {}
				44
// Names of the Safe Browsing lists that the browser can process.
const char kMalwareList[] = "goog-malware-shavar";
const char kPhishingList[] = "goog-phish-shavar";
const char kBinUrlList[] = "goog-badbinurl-shavar";
const char kUnwantedUrlList[] = "goog-unwanted-shavar";
const char kResourceBlacklist[] = "goog-badresource-shavar";
const char kExtensionBlacklist[] = "goog-badcrxids-digestvar";
const char kIPBlacklist[] = "goog-badip-digest256";
const char kCsdWhiteList[] = "goog-csdwhite-sha256";
const char kInclusionWhitelist[] = "goog-csdinclusionwhite-sha256";
const char kDownloadWhiteList[] = "goog-downloadwhite-digest256";
const char kModuleWhitelist[] = "goog-whitemodule-digest256";

// Every list name above, collected in one array. The order of the entries is
// kept stable; IsKnownList() scans this array.
const char* kAllLists[11] = {
    kMalwareList,
    kPhishingList,
    kBinUrlList,
    kCsdWhiteList,
    kDownloadWhiteList,
    kExtensionBlacklist,
    kIPBlacklist,
    kUnwantedUrlList,
    kInclusionWhitelist,
    kModuleWhitelist,
    kResourceBlacklist,
};
				63
				64	ListType GetListId(const base::StringPiece& name) {
				65	ListType id;
				66	if (name == kMalwareList) {
				67	id = MALWARE;
				68	} else if (name == kPhishingList) {
				69	id = PHISH;
				70	} else if (name == kBinUrlList) {
				71	id = BINURL;
				72	} else if (name == kCsdWhiteList) {
				73	id = CSDWHITELIST;
				74	} else if (name == kDownloadWhiteList) {
				75	id = DOWNLOADWHITELIST;
				76	} else if (name == kExtensionBlacklist) {
				77	id = EXTENSIONBLACKLIST;
				78	} else if (name == kIPBlacklist) {
				79	id = IPBLACKLIST;
				80	} else if (name == kUnwantedUrlList) {
				81	id = UNWANTEDURL;
				82	} else if (name == kInclusionWhitelist) {
				83	id = INCLUSIONWHITELIST;
proberge	a933956	2016-02-17 18:40:20	[diff] [blame]	84	} else if (name == kModuleWhitelist) {
				85	id = MODULEWHITELIST;
veranika	fbe7992	2016-02-19 16:58:06	[diff] [blame]	86	} else if (name == kResourceBlacklist) {
				87	id = RESOURCEBLACKLIST;
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	88	} else {
				89	id = INVALID;
				90	}
				91	return id;
				92	}
				93
				94	bool GetListName(ListType list_id, std::string* list) {
				95	switch (list_id) {
				96	case MALWARE:
				97	*list = kMalwareList;
				98	break;
				99	case PHISH:
				100	*list = kPhishingList;
				101	break;
				102	case BINURL:
				103	*list = kBinUrlList;
				104	break;
				105	case CSDWHITELIST:
				106	*list = kCsdWhiteList;
				107	break;
				108	case DOWNLOADWHITELIST:
				109	*list = kDownloadWhiteList;
				110	break;
				111	case EXTENSIONBLACKLIST:
				112	*list = kExtensionBlacklist;
				113	break;
				114	case IPBLACKLIST:
				115	*list = kIPBlacklist;
				116	break;
				117	case UNWANTEDURL:
				118	*list = kUnwantedUrlList;
				119	break;
				120	case INCLUSIONWHITELIST:
				121	*list = kInclusionWhitelist;
				122	break;
proberge	a933956	2016-02-17 18:40:20	[diff] [blame]	123	case MODULEWHITELIST:
				124	*list = kModuleWhitelist;
veranika	fbe7992	2016-02-19 16:58:06	[diff] [blame]	125	case RESOURCEBLACKLIST:
				126	*list = kResourceBlacklist;
proberge	a933956	2016-02-17 18:40:20	[diff] [blame]	127	break;
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	128	default:
				129	return false;
				130	}
				131	DCHECK(IsKnownList(*list));
				132	return true;
				133	}
				134
				135
				136	SBFullHash SBFullHashForString(const base::StringPiece& str) {
				137	SBFullHash h;
				138	crypto::SHA256HashString(str, &h.full_hash, sizeof(h.full_hash));
				139	return h;
				140	}
				141
				142	SBFullHash StringToSBFullHash(const std::string& hash_in) {
				143	DCHECK_EQ(crypto::kSHA256Length, hash_in.size());
				144	SBFullHash hash_out;
				145	memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length);
				146	return hash_out;
				147	}
				148
				149	std::string SBFullHashToString(const SBFullHash& hash) {
				150	DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash));
				151	return std::string(hash.full_hash, sizeof(hash.full_hash));
				152	}
				153
				154
				155	std::string Unescape(const std::string& url) {
				156	std::string unescaped_str(url);
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	157	const int kMaxLoopIterations = 1024;
georgesak	0770d08	2015-12-04 16:21:57	[diff] [blame]	158	size_t old_size = 0;
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	159	int loop_var = 0;
				160	do {
georgesak	0770d08	2015-12-04 16:21:57	[diff] [blame]	161	old_size = unescaped_str.size();
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	162	unescaped_str = net::UnescapeURLComponent(
georgesak	0770d08	2015-12-04 16:21:57	[diff] [blame]	163	unescaped_str, net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS \|
				164	net::UnescapeRule::SPACES \|
				165	net::UnescapeRule::URL_SPECIAL_CHARS);
				166	} while (old_size != unescaped_str.size() &&
				167	++loop_var <= kMaxLoopIterations);
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	168
				169	return unescaped_str;
				170	}
				171
				172	std::string Escape(const std::string& url) {
				173	std::string escaped_str;
georgesak	0770d08	2015-12-04 16:21:57	[diff] [blame]	174	// The escaped string is larger so allocate double the length to reduce the
				175	// chance of the string being grown.
				176	escaped_str.reserve(url.length() * 2);
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	177	const char* kHexString = "0123456789ABCDEF";
				178	for (size_t i = 0; i < url.length(); i++) {
				179	unsigned char c = static_cast<unsigned char>(url[i]);
				180	if (c <= ' ' \|\| c > '~' \|\| c == '#' \|\| c == '%') {
georgesak	0770d08	2015-12-04 16:21:57	[diff] [blame]	181	escaped_str += '%';
				182	escaped_str += kHexString[c >> 4];
				183	escaped_str += kHexString[c & 0xf];
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	184	} else {
georgesak	0770d08	2015-12-04 16:21:57	[diff] [blame]	185	escaped_str += c;
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	186	}
				187	}
				188
				189	return escaped_str;
				190	}
				191
georgesak	3e440e019	2015-12-02 16:24:16	[diff] [blame]	192	std::string RemoveConsecutiveChars(base::StringPiece str, const char c) {
				193	std::string output;
				194	// Output is at most the length of the original string.
				195	output.reserve(str.size());
				196
				197	size_t i = 0;
				198	while (i < str.size()) {
				199	output.append(1, str[i++]);
				200	if (str[i - 1] == c) {
				201	while (i < str.size() && str[i] == c) {
				202	i++;
				203	}
				204	}
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	205	}
				206
				207	return output;
				208	}
				209
				210	// Canonicalizes url as per Google Safe Browsing Specification.
				211	// See section 6.1 in
				212	// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
				213	void CanonicalizeUrl(const GURL& url,
				214	std::string* canonicalized_hostname,
				215	std::string* canonicalized_path,
				216	std::string* canonicalized_query) {
				217	DCHECK(url.is_valid());
				218
				219	// We only canonicalize "normal" URLs.
				220	if (!url.IsStandard())
				221	return;
				222
				223	// Following canonicalization steps are excluded since url parsing takes care
				224	// of those :-
				225	// 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
				226	// (Exclude escaped version of these chars).
				227	// 2. Normalize hostname to 4 dot-seperated decimal values.
				228	// 3. Lowercase hostname.
				229	// 4. Resolve path sequences "/../" and "/./".
				230
				231	// That leaves us with the following :-
				232	// 1. Remove fragment in URL.
				233	GURL url_without_fragment;
				234	GURL::Replacements f_replacements;
				235	f_replacements.ClearRef();
				236	f_replacements.ClearUsername();
				237	f_replacements.ClearPassword();
				238	url_without_fragment = url.ReplaceComponents(f_replacements);
				239
				240	// 2. Do URL unescaping until no more hex encoded characters exist.
				241	std::string url_unescaped_str(Unescape(url_without_fragment.spec()));
				242	url::Parsed parsed;
				243	url::ParseStandardURL(url_unescaped_str.data(), url_unescaped_str.length(),
				244	&parsed);
				245
				246	// 3. In hostname, remove all leading and trailing dots.
georgesak	3f9f4d7	2015-12-03 20:57:08	[diff] [blame]	247	base::StringPiece host;
				248	if (parsed.host.len > 0)
				249	host.set(url_unescaped_str.data() + parsed.host.begin, parsed.host.len);
				250
				251	base::StringPiece host_without_end_dots =
				252	base::TrimString(host, ".", base::TrimPositions::TRIM_ALL);
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	253
				254	// 4. In hostname, replace consecutive dots with a single dot.
				255	std::string host_without_consecutive_dots(RemoveConsecutiveChars(
				256	host_without_end_dots, '.'));
				257
				258	// 5. In path, replace runs of consecutive slashes with a single slash.
georgesak	3f9f4d7	2015-12-03 20:57:08	[diff] [blame]	259	base::StringPiece path;
				260	if (parsed.path.len > 0)
				261	path.set(url_unescaped_str.data() + parsed.path.begin, parsed.path.len);
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	262	std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/'));
				263
				264	url::Replacements<char> hp_replacements;
				265	hp_replacements.SetHost(
				266	host_without_consecutive_dots.data(),
				267	url::Component(0, host_without_consecutive_dots.length()));
				268	hp_replacements.SetPath(
				269	path_without_consecutive_slash.data(),
				270	url::Component(0, path_without_consecutive_slash.length()));
				271
				272	std::string url_unescaped_with_can_hostpath;
				273	url::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);
				274	url::Parsed temp_parsed;
				275	url::ReplaceComponents(url_unescaped_str.data(),
				276	url_unescaped_str.length(),
				277	parsed,
				278	hp_replacements,
				279	NULL,
				280	&output,
				281	&temp_parsed);
				282	output.Complete();
				283
				284	// 6. Step needed to revert escaping done in url::ReplaceComponents.
				285	url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);
				286
				287	// 7. After performing all above steps, percent-escape all chars in url which
				288	// are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
				289	std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));
				290	url::Parsed final_parsed;
				291	url::ParseStandardURL(escaped_canon_url_str.data(),
				292	escaped_canon_url_str.length(),
				293	&final_parsed);
				294
				295	if (canonicalized_hostname && final_parsed.host.len > 0) {
				296	*canonicalized_hostname =
				297	escaped_canon_url_str.substr(final_parsed.host.begin,
				298	final_parsed.host.len);
				299	}
				300	if (canonicalized_path && final_parsed.path.len > 0) {
				301	*canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,
				302	final_parsed.path.len);
				303	}
				304	if (canonicalized_query && final_parsed.query.len > 0) {
				305	*canonicalized_query = escaped_canon_url_str.substr(
				306	final_parsed.query.begin, final_parsed.query.len);
				307	}
				308	}
				309
kcarattini	365d506	2015-12-10 06:05:29	[diff] [blame]	310	void UrlToFullHashes(const GURL& url,
				311	bool include_whitelist_hashes,
				312	std::vector<SBFullHash>* full_hashes) {
joenotcharles	a6c026ad	2016-01-13 16:10:09	[diff] [blame]	313	// Include this function in traces because it's not cheap so it should be
				314	// called sparingly.
joenotcharles	81ebeb8	2016-01-14 06:40:38	[diff] [blame]	315	TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),
joenotcharles	a6c026ad	2016-01-13 16:10:09	[diff] [blame]	316	"include_whitelist_hashes", include_whitelist_hashes);
kcarattini	365d506	2015-12-10 06:05:29	[diff] [blame]	317	std::vector<std::string> hosts;
				318	if (url.HostIsIPAddress()) {
				319	hosts.push_back(url.host());
				320	} else {
				321	GenerateHostsToCheck(url, &hosts);
				322	}
				323
				324	std::vector<std::string> paths;
				325	GeneratePathsToCheck(url, &paths);
				326
				327	for (const std::string& host : hosts) {
				328	for (const std::string& path : paths) {
				329	full_hashes->push_back(
				330	SBFullHashForString(host + path));
				331
				332	// We may have /foo as path-prefix in the whitelist which should
				333	// also match with /foo/bar and /foo?bar. Hence, for every path
				334	// that ends in '/' we also add the path without the slash.
				335	if (include_whitelist_hashes && path.size() > 1 &&
				336	path[path.size() - 1] == '/') {
				337	full_hashes->push_back(SBFullHashForString(
				338	host + path.substr(0, path.size() - 1)));
				339	}
				340	}
				341	}
				342	}
				343
vakh	d0a17ec	2015-11-05 23:14:34	[diff] [blame]	344	void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
				345	hosts->clear();
				346
				347	std::string canon_host;
				348	CanonicalizeUrl(url, &canon_host, NULL, NULL);
				349
				350	const std::string host = canon_host; // const sidesteps GCC bugs below!
				351	if (host.empty())
				352	return;
				353
				354	// Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
				355	// hostnames formed by starting with the last 5 components and successively
				356	// removing the leading component. The last component isn't examined alone,
				357	// since it's the TLD or a subcomponent thereof.
				358	//
				359	// Note that we don't need to be clever about stopping at the "real" eTLD --
				360	// the data on the server side has been filtered to ensure it will not
				361	// blacklist a whole TLD, and it's not significantly slower on our side to
				362	// just check too much.
				363	//
				364	// Also note that because we have a simple blacklist, not some sort of complex
				365	// whitelist-in-blacklist or vice versa, it doesn't matter what order we check
				366	// these in.
				367	const size_t kMaxHostsToCheck = 4;
				368	bool skipped_last_component = false;
				369	for (std::string::const_reverse_iterator i(host.rbegin());
				370	i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
				371	if (*i == '.') {
				372	if (skipped_last_component)
				373	hosts->push_back(std::string(i.base(), host.end()));
				374	else
				375	skipped_last_component = true;
				376	}
				377	}
				378	hosts->push_back(host);
				379	}
				380
				381	void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
				382	paths->clear();
				383
				384	std::string canon_path;
				385	std::string canon_query;
				386	CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
				387
				388	const std::string path = canon_path; // const sidesteps GCC bugs below!
				389	const std::string query = canon_query;
				390	if (path.empty())
				391	return;
				392
				393	// Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
				394	// the query parameters, and also up to 4 paths formed by starting at the root
				395	// and adding more path components.
				396	//
				397	// As with the hosts above, it doesn't matter what order we check these in.
				398	const size_t kMaxPathsToCheck = 4;
				399	for (std::string::const_iterator i(path.begin());
				400	i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
				401	if (*i == '/')
				402	paths->push_back(std::string(path.begin(), i + 1));
				403	}
				404
				405	if (!paths->empty() && paths->back() != path)
				406	paths->push_back(path);
				407
				408	if (!query.empty())
				409	paths->push_back(path + "?" + query);
				410	}
				411
				412	void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {
				413	std::vector<std::string> hosts, paths;
				414	GenerateHostsToCheck(url, &hosts);
				415	GeneratePathsToCheck(url, &paths);
				416	for (size_t h = 0; h < hosts.size(); ++h) {
				417	for (size_t p = 0; p < paths.size(); ++p) {
				418	urls->push_back(hosts[h] + paths[p]);
				419	}
				420	}
				421	}
				422
				423	} // namespace safe_browsing