blob: 18dd9f88daebe45d854ea8fc152f0328c23dab86 [file] [log] [blame]
vakhd0a17ec2015-11-05 23:14:341// Copyright (c) 2015 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/safe_browsing_db/util.h"
6
avif57136c12015-12-25 23:27:457#include <stddef.h>
8
9#include "base/macros.h"
vakhd0a17ec2015-11-05 23:14:3410#include "base/strings/string_util.h"
joenotcharlesa6c026ad2016-01-13 16:10:0911#include "base/trace_event/trace_event.h"
vakhd0a17ec2015-11-05 23:14:3412#include "crypto/sha2.h"
13#include "net/base/escape.h"
14#include "url/gurl.h"
15#include "url/url_util.h"
16
vakh9a474d832015-11-13 01:43:0917namespace safe_browsing {
18
vakhd0a17ec2015-11-05 23:14:3419// Utility functions -----------------------------------------------------------
20
21namespace {
22bool IsKnownList(const std::string& name) {
vakh9a474d832015-11-13 01:43:0923 for (size_t i = 0; i < arraysize(kAllLists); ++i) {
24 if (!strcmp(kAllLists[i], name.c_str())) {
vakhd0a17ec2015-11-05 23:14:3425 return true;
26 }
27 }
28 return false;
29}
30} // namespace
31
vakhd0a17ec2015-11-05 23:14:3432// SBCachedFullHashResult ------------------------------------------------------
33
34SBCachedFullHashResult::SBCachedFullHashResult() {}
35
36SBCachedFullHashResult::SBCachedFullHashResult(
37 const base::Time& in_expire_after)
38 : expire_after(in_expire_after) {}
39
vmpstrb6449d512016-02-25 23:55:4040SBCachedFullHashResult::SBCachedFullHashResult(
41 const SBCachedFullHashResult& other) = default;
42
vakhd0a17ec2015-11-05 23:14:3443SBCachedFullHashResult::~SBCachedFullHashResult() {}
44
// Names of the lists that the browser is able to process.
const char kMalwareList[] = "goog-malware-shavar";
const char kPhishingList[] = "goog-phish-shavar";
const char kBinUrlList[] = "goog-badbinurl-shavar";
const char kCsdWhiteList[] = "goog-csdwhite-sha256";
const char kDownloadWhiteList[] = "goog-downloadwhite-digest256";
const char kExtensionBlacklist[] = "goog-badcrxids-digestvar";
const char kIPBlacklist[] = "goog-badip-digest256";
const char kUnwantedUrlList[] = "goog-unwanted-shavar";
const char kInclusionWhitelist[] = "goog-csdinclusionwhite-sha256";
const char kModuleWhitelist[] = "goog-whitemodule-digest256";
const char kResourceBlacklist[] = "goog-badresource-shavar";

// Every list name above, in declaration order. The explicit bound must be
// kept in sync with the number of entries.
const char* kAllLists[11] = {
    kMalwareList,
    kPhishingList,
    kBinUrlList,
    kCsdWhiteList,
    kDownloadWhiteList,
    kExtensionBlacklist,
    kIPBlacklist,
    kUnwantedUrlList,
    kInclusionWhitelist,
    kModuleWhitelist,
    kResourceBlacklist,
};
63
64ListType GetListId(const base::StringPiece& name) {
65 ListType id;
66 if (name == kMalwareList) {
67 id = MALWARE;
68 } else if (name == kPhishingList) {
69 id = PHISH;
70 } else if (name == kBinUrlList) {
71 id = BINURL;
72 } else if (name == kCsdWhiteList) {
73 id = CSDWHITELIST;
74 } else if (name == kDownloadWhiteList) {
75 id = DOWNLOADWHITELIST;
76 } else if (name == kExtensionBlacklist) {
77 id = EXTENSIONBLACKLIST;
78 } else if (name == kIPBlacklist) {
79 id = IPBLACKLIST;
80 } else if (name == kUnwantedUrlList) {
81 id = UNWANTEDURL;
82 } else if (name == kInclusionWhitelist) {
83 id = INCLUSIONWHITELIST;
probergea9339562016-02-17 18:40:2084 } else if (name == kModuleWhitelist) {
85 id = MODULEWHITELIST;
veranikafbe79922016-02-19 16:58:0686 } else if (name == kResourceBlacklist) {
87 id = RESOURCEBLACKLIST;
vakhd0a17ec2015-11-05 23:14:3488 } else {
89 id = INVALID;
90 }
91 return id;
92}
93
94bool GetListName(ListType list_id, std::string* list) {
95 switch (list_id) {
96 case MALWARE:
97 *list = kMalwareList;
98 break;
99 case PHISH:
100 *list = kPhishingList;
101 break;
102 case BINURL:
103 *list = kBinUrlList;
104 break;
105 case CSDWHITELIST:
106 *list = kCsdWhiteList;
107 break;
108 case DOWNLOADWHITELIST:
109 *list = kDownloadWhiteList;
110 break;
111 case EXTENSIONBLACKLIST:
112 *list = kExtensionBlacklist;
113 break;
114 case IPBLACKLIST:
115 *list = kIPBlacklist;
116 break;
117 case UNWANTEDURL:
118 *list = kUnwantedUrlList;
119 break;
120 case INCLUSIONWHITELIST:
121 *list = kInclusionWhitelist;
122 break;
probergea9339562016-02-17 18:40:20123 case MODULEWHITELIST:
124 *list = kModuleWhitelist;
veranikafbe79922016-02-19 16:58:06125 case RESOURCEBLACKLIST:
126 *list = kResourceBlacklist;
probergea9339562016-02-17 18:40:20127 break;
vakhd0a17ec2015-11-05 23:14:34128 default:
129 return false;
130 }
131 DCHECK(IsKnownList(*list));
132 return true;
133}
134
135
136SBFullHash SBFullHashForString(const base::StringPiece& str) {
137 SBFullHash h;
138 crypto::SHA256HashString(str, &h.full_hash, sizeof(h.full_hash));
139 return h;
140}
141
142SBFullHash StringToSBFullHash(const std::string& hash_in) {
143 DCHECK_EQ(crypto::kSHA256Length, hash_in.size());
144 SBFullHash hash_out;
145 memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length);
146 return hash_out;
147}
148
149std::string SBFullHashToString(const SBFullHash& hash) {
150 DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash));
151 return std::string(hash.full_hash, sizeof(hash.full_hash));
152}
153
154
155std::string Unescape(const std::string& url) {
156 std::string unescaped_str(url);
vakhd0a17ec2015-11-05 23:14:34157 const int kMaxLoopIterations = 1024;
georgesak0770d082015-12-04 16:21:57158 size_t old_size = 0;
vakhd0a17ec2015-11-05 23:14:34159 int loop_var = 0;
160 do {
georgesak0770d082015-12-04 16:21:57161 old_size = unescaped_str.size();
vakhd0a17ec2015-11-05 23:14:34162 unescaped_str = net::UnescapeURLComponent(
georgesak0770d082015-12-04 16:21:57163 unescaped_str, net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS |
164 net::UnescapeRule::SPACES |
165 net::UnescapeRule::URL_SPECIAL_CHARS);
166 } while (old_size != unescaped_str.size() &&
167 ++loop_var <= kMaxLoopIterations);
vakhd0a17ec2015-11-05 23:14:34168
169 return unescaped_str;
170}
171
// Percent-escapes |url|: every byte that is <= ' ' (0x20), > '~' (0x7E), '#'
// or '%' is replaced by "%XX", where XX is the byte value written with two
// uppercase hexadecimal digits. All other bytes are copied unchanged.
std::string Escape(const std::string& url) {
  static const char kUpperHexDigits[] = "0123456789ABCDEF";

  std::string result;
  // Escaping can only grow the string; reserving twice the input length makes
  // a reallocation while appending less likely.
  result.reserve(url.size() * 2);

  for (const char raw : url) {
    const unsigned char byte = static_cast<unsigned char>(raw);
    const bool is_printable = byte > ' ' && byte <= '~';
    if (is_printable && byte != '#' && byte != '%') {
      result.push_back(raw);
      continue;
    }
    result.push_back('%');
    result.push_back(kUpperHexDigits[byte >> 4]);
    result.push_back(kUpperHexDigits[byte & 0xf]);
  }

  return result;
}
191
georgesak3e440e0192015-12-02 16:24:16192std::string RemoveConsecutiveChars(base::StringPiece str, const char c) {
193 std::string output;
194 // Output is at most the length of the original string.
195 output.reserve(str.size());
196
197 size_t i = 0;
198 while (i < str.size()) {
199 output.append(1, str[i++]);
200 if (str[i - 1] == c) {
201 while (i < str.size() && str[i] == c) {
202 i++;
203 }
204 }
vakhd0a17ec2015-11-05 23:14:34205 }
206
207 return output;
208}
209
210// Canonicalizes url as per Google Safe Browsing Specification.
211// See section 6.1 in
212// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
213void CanonicalizeUrl(const GURL& url,
214 std::string* canonicalized_hostname,
215 std::string* canonicalized_path,
216 std::string* canonicalized_query) {
217 DCHECK(url.is_valid());
218
219 // We only canonicalize "normal" URLs.
220 if (!url.IsStandard())
221 return;
222
223 // Following canonicalization steps are excluded since url parsing takes care
224 // of those :-
225 // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
226 // (Exclude escaped version of these chars).
227 // 2. Normalize hostname to 4 dot-seperated decimal values.
228 // 3. Lowercase hostname.
229 // 4. Resolve path sequences "/../" and "/./".
230
231 // That leaves us with the following :-
232 // 1. Remove fragment in URL.
233 GURL url_without_fragment;
234 GURL::Replacements f_replacements;
235 f_replacements.ClearRef();
236 f_replacements.ClearUsername();
237 f_replacements.ClearPassword();
238 url_without_fragment = url.ReplaceComponents(f_replacements);
239
240 // 2. Do URL unescaping until no more hex encoded characters exist.
241 std::string url_unescaped_str(Unescape(url_without_fragment.spec()));
242 url::Parsed parsed;
243 url::ParseStandardURL(url_unescaped_str.data(), url_unescaped_str.length(),
244 &parsed);
245
246 // 3. In hostname, remove all leading and trailing dots.
georgesak3f9f4d72015-12-03 20:57:08247 base::StringPiece host;
248 if (parsed.host.len > 0)
249 host.set(url_unescaped_str.data() + parsed.host.begin, parsed.host.len);
250
251 base::StringPiece host_without_end_dots =
252 base::TrimString(host, ".", base::TrimPositions::TRIM_ALL);
vakhd0a17ec2015-11-05 23:14:34253
254 // 4. In hostname, replace consecutive dots with a single dot.
255 std::string host_without_consecutive_dots(RemoveConsecutiveChars(
256 host_without_end_dots, '.'));
257
258 // 5. In path, replace runs of consecutive slashes with a single slash.
georgesak3f9f4d72015-12-03 20:57:08259 base::StringPiece path;
260 if (parsed.path.len > 0)
261 path.set(url_unescaped_str.data() + parsed.path.begin, parsed.path.len);
vakhd0a17ec2015-11-05 23:14:34262 std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/'));
263
264 url::Replacements<char> hp_replacements;
265 hp_replacements.SetHost(
266 host_without_consecutive_dots.data(),
267 url::Component(0, host_without_consecutive_dots.length()));
268 hp_replacements.SetPath(
269 path_without_consecutive_slash.data(),
270 url::Component(0, path_without_consecutive_slash.length()));
271
272 std::string url_unescaped_with_can_hostpath;
273 url::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);
274 url::Parsed temp_parsed;
275 url::ReplaceComponents(url_unescaped_str.data(),
276 url_unescaped_str.length(),
277 parsed,
278 hp_replacements,
279 NULL,
280 &output,
281 &temp_parsed);
282 output.Complete();
283
284 // 6. Step needed to revert escaping done in url::ReplaceComponents.
285 url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);
286
287 // 7. After performing all above steps, percent-escape all chars in url which
288 // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
289 std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));
290 url::Parsed final_parsed;
291 url::ParseStandardURL(escaped_canon_url_str.data(),
292 escaped_canon_url_str.length(),
293 &final_parsed);
294
295 if (canonicalized_hostname && final_parsed.host.len > 0) {
296 *canonicalized_hostname =
297 escaped_canon_url_str.substr(final_parsed.host.begin,
298 final_parsed.host.len);
299 }
300 if (canonicalized_path && final_parsed.path.len > 0) {
301 *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,
302 final_parsed.path.len);
303 }
304 if (canonicalized_query && final_parsed.query.len > 0) {
305 *canonicalized_query = escaped_canon_url_str.substr(
306 final_parsed.query.begin, final_parsed.query.len);
307 }
308}
309
kcarattini365d5062015-12-10 06:05:29310void UrlToFullHashes(const GURL& url,
311 bool include_whitelist_hashes,
312 std::vector<SBFullHash>* full_hashes) {
joenotcharlesa6c026ad2016-01-13 16:10:09313 // Include this function in traces because it's not cheap so it should be
314 // called sparingly.
joenotcharles81ebeb82016-01-14 06:40:38315 TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(),
joenotcharlesa6c026ad2016-01-13 16:10:09316 "include_whitelist_hashes", include_whitelist_hashes);
kcarattini365d5062015-12-10 06:05:29317 std::vector<std::string> hosts;
318 if (url.HostIsIPAddress()) {
319 hosts.push_back(url.host());
320 } else {
321 GenerateHostsToCheck(url, &hosts);
322 }
323
324 std::vector<std::string> paths;
325 GeneratePathsToCheck(url, &paths);
326
327 for (const std::string& host : hosts) {
328 for (const std::string& path : paths) {
329 full_hashes->push_back(
330 SBFullHashForString(host + path));
331
332 // We may have /foo as path-prefix in the whitelist which should
333 // also match with /foo/bar and /foo?bar. Hence, for every path
334 // that ends in '/' we also add the path without the slash.
335 if (include_whitelist_hashes && path.size() > 1 &&
336 path[path.size() - 1] == '/') {
337 full_hashes->push_back(SBFullHashForString(
338 host + path.substr(0, path.size() - 1)));
339 }
340 }
341 }
342}
343
vakhd0a17ec2015-11-05 23:14:34344void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) {
345 hosts->clear();
346
347 std::string canon_host;
348 CanonicalizeUrl(url, &canon_host, NULL, NULL);
349
350 const std::string host = canon_host; // const sidesteps GCC bugs below!
351 if (host.empty())
352 return;
353
354 // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4
355 // hostnames formed by starting with the last 5 components and successively
356 // removing the leading component. The last component isn't examined alone,
357 // since it's the TLD or a subcomponent thereof.
358 //
359 // Note that we don't need to be clever about stopping at the "real" eTLD --
360 // the data on the server side has been filtered to ensure it will not
361 // blacklist a whole TLD, and it's not significantly slower on our side to
362 // just check too much.
363 //
364 // Also note that because we have a simple blacklist, not some sort of complex
365 // whitelist-in-blacklist or vice versa, it doesn't matter what order we check
366 // these in.
367 const size_t kMaxHostsToCheck = 4;
368 bool skipped_last_component = false;
369 for (std::string::const_reverse_iterator i(host.rbegin());
370 i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) {
371 if (*i == '.') {
372 if (skipped_last_component)
373 hosts->push_back(std::string(i.base(), host.end()));
374 else
375 skipped_last_component = true;
376 }
377 }
378 hosts->push_back(host);
379}
380
381void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) {
382 paths->clear();
383
384 std::string canon_path;
385 std::string canon_query;
386 CanonicalizeUrl(url, NULL, &canon_path, &canon_query);
387
388 const std::string path = canon_path; // const sidesteps GCC bugs below!
389 const std::string query = canon_query;
390 if (path.empty())
391 return;
392
393 // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without
394 // the query parameters, and also up to 4 paths formed by starting at the root
395 // and adding more path components.
396 //
397 // As with the hosts above, it doesn't matter what order we check these in.
398 const size_t kMaxPathsToCheck = 4;
399 for (std::string::const_iterator i(path.begin());
400 i != path.end() && paths->size() < kMaxPathsToCheck; ++i) {
401 if (*i == '/')
402 paths->push_back(std::string(path.begin(), i + 1));
403 }
404
405 if (!paths->empty() && paths->back() != path)
406 paths->push_back(path);
407
408 if (!query.empty())
409 paths->push_back(path + "?" + query);
410}
411
412void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) {
413 std::vector<std::string> hosts, paths;
414 GenerateHostsToCheck(url, &hosts);
415 GeneratePathsToCheck(url, &paths);
416 for (size_t h = 0; h < hosts.size(); ++h) {
417 for (size_t p = 0; p < paths.size(); ++p) {
418 urls->push_back(hosts[h] + paths[p]);
419 }
420 }
421}
422
423} // namespace safe_browsing