vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 1 | // Copyright (c) 2015 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "components/safe_browsing_db/util.h" |
| 6 | |
avi | f57136c1 | 2015-12-25 23:27:45 | [diff] [blame] | 7 | #include <stddef.h> |
| 8 | |
| 9 | #include "base/macros.h" |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 10 | #include "base/strings/string_util.h" |
joenotcharles | a6c026ad | 2016-01-13 16:10:09 | [diff] [blame] | 11 | #include "base/trace_event/trace_event.h" |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 12 | #include "crypto/sha2.h" |
| 13 | #include "net/base/escape.h" |
| 14 | #include "url/gurl.h" |
| 15 | #include "url/url_util.h" |
| 16 | |
vakh | 9a474d83 | 2015-11-13 01:43:09 | [diff] [blame] | 17 | namespace safe_browsing { |
| 18 | |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 19 | // Utility functions ----------------------------------------------------------- |
| 20 | |
| 21 | namespace { |
| 22 | bool IsKnownList(const std::string& name) { |
vakh | 9a474d83 | 2015-11-13 01:43:09 | [diff] [blame] | 23 | for (size_t i = 0; i < arraysize(kAllLists); ++i) { |
| 24 | if (!strcmp(kAllLists[i], name.c_str())) { |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 25 | return true; |
| 26 | } |
| 27 | } |
| 28 | return false; |
| 29 | } |
| 30 | } // namespace |
| 31 | |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 32 | // SBCachedFullHashResult ------------------------------------------------------ |
| 33 | |
| 34 | SBCachedFullHashResult::SBCachedFullHashResult() {} |
| 35 | |
| 36 | SBCachedFullHashResult::SBCachedFullHashResult( |
| 37 | const base::Time& in_expire_after) |
| 38 | : expire_after(in_expire_after) {} |
| 39 | |
vmpstr | b6449d51 | 2016-02-25 23:55:40 | [diff] [blame^] | 40 | SBCachedFullHashResult::SBCachedFullHashResult( |
| 41 | const SBCachedFullHashResult& other) = default; |
| 42 | |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 43 | SBCachedFullHashResult::~SBCachedFullHashResult() {} |
| 44 | |
// Names of the Safe Browsing lists that the browser is able to process.
const char kMalwareList[] = "goog-malware-shavar";
const char kPhishingList[] = "goog-phish-shavar";
const char kBinUrlList[] = "goog-badbinurl-shavar";
const char kCsdWhiteList[] = "goog-csdwhite-sha256";
const char kDownloadWhiteList[] = "goog-downloadwhite-digest256";
const char kExtensionBlacklist[] = "goog-badcrxids-digestvar";
const char kIPBlacklist[] = "goog-badip-digest256";
const char kUnwantedUrlList[] = "goog-unwanted-shavar";
const char kInclusionWhitelist[] = "goog-csdinclusionwhite-sha256";
const char kModuleWhitelist[] = "goog-whitemodule-digest256";
const char kResourceBlacklist[] = "goog-badresource-shavar";

// Every list name declared above, in declaration order. The explicit bound
// has to be bumped whenever a list is added.
const char* kAllLists[11] = {
    kMalwareList,
    kPhishingList,
    kBinUrlList,
    kCsdWhiteList,
    kDownloadWhiteList,
    kExtensionBlacklist,
    kIPBlacklist,
    kUnwantedUrlList,
    kInclusionWhitelist,
    kModuleWhitelist,
    kResourceBlacklist,
};
| 63 | |
| 64 | ListType GetListId(const base::StringPiece& name) { |
| 65 | ListType id; |
| 66 | if (name == kMalwareList) { |
| 67 | id = MALWARE; |
| 68 | } else if (name == kPhishingList) { |
| 69 | id = PHISH; |
| 70 | } else if (name == kBinUrlList) { |
| 71 | id = BINURL; |
| 72 | } else if (name == kCsdWhiteList) { |
| 73 | id = CSDWHITELIST; |
| 74 | } else if (name == kDownloadWhiteList) { |
| 75 | id = DOWNLOADWHITELIST; |
| 76 | } else if (name == kExtensionBlacklist) { |
| 77 | id = EXTENSIONBLACKLIST; |
| 78 | } else if (name == kIPBlacklist) { |
| 79 | id = IPBLACKLIST; |
| 80 | } else if (name == kUnwantedUrlList) { |
| 81 | id = UNWANTEDURL; |
| 82 | } else if (name == kInclusionWhitelist) { |
| 83 | id = INCLUSIONWHITELIST; |
proberge | a933956 | 2016-02-17 18:40:20 | [diff] [blame] | 84 | } else if (name == kModuleWhitelist) { |
| 85 | id = MODULEWHITELIST; |
veranika | fbe7992 | 2016-02-19 16:58:06 | [diff] [blame] | 86 | } else if (name == kResourceBlacklist) { |
| 87 | id = RESOURCEBLACKLIST; |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 88 | } else { |
| 89 | id = INVALID; |
| 90 | } |
| 91 | return id; |
| 92 | } |
| 93 | |
| 94 | bool GetListName(ListType list_id, std::string* list) { |
| 95 | switch (list_id) { |
| 96 | case MALWARE: |
| 97 | *list = kMalwareList; |
| 98 | break; |
| 99 | case PHISH: |
| 100 | *list = kPhishingList; |
| 101 | break; |
| 102 | case BINURL: |
| 103 | *list = kBinUrlList; |
| 104 | break; |
| 105 | case CSDWHITELIST: |
| 106 | *list = kCsdWhiteList; |
| 107 | break; |
| 108 | case DOWNLOADWHITELIST: |
| 109 | *list = kDownloadWhiteList; |
| 110 | break; |
| 111 | case EXTENSIONBLACKLIST: |
| 112 | *list = kExtensionBlacklist; |
| 113 | break; |
| 114 | case IPBLACKLIST: |
| 115 | *list = kIPBlacklist; |
| 116 | break; |
| 117 | case UNWANTEDURL: |
| 118 | *list = kUnwantedUrlList; |
| 119 | break; |
| 120 | case INCLUSIONWHITELIST: |
| 121 | *list = kInclusionWhitelist; |
| 122 | break; |
proberge | a933956 | 2016-02-17 18:40:20 | [diff] [blame] | 123 | case MODULEWHITELIST: |
| 124 | *list = kModuleWhitelist; |
veranika | fbe7992 | 2016-02-19 16:58:06 | [diff] [blame] | 125 | case RESOURCEBLACKLIST: |
| 126 | *list = kResourceBlacklist; |
proberge | a933956 | 2016-02-17 18:40:20 | [diff] [blame] | 127 | break; |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 128 | default: |
| 129 | return false; |
| 130 | } |
| 131 | DCHECK(IsKnownList(*list)); |
| 132 | return true; |
| 133 | } |
| 134 | |
| 135 | |
| 136 | SBFullHash SBFullHashForString(const base::StringPiece& str) { |
| 137 | SBFullHash h; |
| 138 | crypto::SHA256HashString(str, &h.full_hash, sizeof(h.full_hash)); |
| 139 | return h; |
| 140 | } |
| 141 | |
| 142 | SBFullHash StringToSBFullHash(const std::string& hash_in) { |
| 143 | DCHECK_EQ(crypto::kSHA256Length, hash_in.size()); |
| 144 | SBFullHash hash_out; |
| 145 | memcpy(hash_out.full_hash, hash_in.data(), crypto::kSHA256Length); |
| 146 | return hash_out; |
| 147 | } |
| 148 | |
| 149 | std::string SBFullHashToString(const SBFullHash& hash) { |
| 150 | DCHECK_EQ(crypto::kSHA256Length, sizeof(hash.full_hash)); |
| 151 | return std::string(hash.full_hash, sizeof(hash.full_hash)); |
| 152 | } |
| 153 | |
| 154 | |
| 155 | std::string Unescape(const std::string& url) { |
| 156 | std::string unescaped_str(url); |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 157 | const int kMaxLoopIterations = 1024; |
georgesak | 0770d08 | 2015-12-04 16:21:57 | [diff] [blame] | 158 | size_t old_size = 0; |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 159 | int loop_var = 0; |
| 160 | do { |
georgesak | 0770d08 | 2015-12-04 16:21:57 | [diff] [blame] | 161 | old_size = unescaped_str.size(); |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 162 | unescaped_str = net::UnescapeURLComponent( |
georgesak | 0770d08 | 2015-12-04 16:21:57 | [diff] [blame] | 163 | unescaped_str, net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS | |
| 164 | net::UnescapeRule::SPACES | |
| 165 | net::UnescapeRule::URL_SPECIAL_CHARS); |
| 166 | } while (old_size != unescaped_str.size() && |
| 167 | ++loop_var <= kMaxLoopIterations); |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 168 | |
| 169 | return unescaped_str; |
| 170 | } |
| 171 | |
// Percent-escapes every byte of |url| that is <= ' ' (0x20), > '~' (0x7E),
// '#' or '%'. Escapes are written as '%' followed by two uppercase hex digits;
// all other bytes are copied through unchanged.
std::string Escape(const std::string& url) {
  static const char kUpperHexDigits[] = "0123456789ABCDEF";

  std::string result;
  // Escaping can only grow the string; reserving twice the input length makes
  // a reallocation while appending less likely.
  result.reserve(url.length() * 2);

  for (const char raw : url) {
    const unsigned char byte = static_cast<unsigned char>(raw);
    const bool needs_escape =
        byte <= ' ' || byte > '~' || byte == '#' || byte == '%';
    if (!needs_escape) {
      result.push_back(static_cast<char>(byte));
      continue;
    }
    result.push_back('%');
    result.push_back(kUpperHexDigits[byte >> 4]);
    result.push_back(kUpperHexDigits[byte & 0xf]);
  }
  return result;
}
| 191 | |
georgesak | 3e440e019 | 2015-12-02 16:24:16 | [diff] [blame] | 192 | std::string RemoveConsecutiveChars(base::StringPiece str, const char c) { |
| 193 | std::string output; |
| 194 | // Output is at most the length of the original string. |
| 195 | output.reserve(str.size()); |
| 196 | |
| 197 | size_t i = 0; |
| 198 | while (i < str.size()) { |
| 199 | output.append(1, str[i++]); |
| 200 | if (str[i - 1] == c) { |
| 201 | while (i < str.size() && str[i] == c) { |
| 202 | i++; |
| 203 | } |
| 204 | } |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 205 | } |
| 206 | |
| 207 | return output; |
| 208 | } |
| 209 | |
// Canonicalizes url as per Google Safe Browsing Specification.
// See section 6.1 in
// http://code.google.com/p/google-safe-browsing/wiki/Protocolv2Spec.
//
// Each of |canonicalized_hostname|, |canonicalized_path| and
// |canonicalized_query| may be NULL. A non-NULL output is only assigned when
// the corresponding component of the canonicalized URL is non-empty; otherwise
// it is left untouched. Nothing is written at all when |url| is not a standard
// URL.
void CanonicalizeUrl(const GURL& url,
                     std::string* canonicalized_hostname,
                     std::string* canonicalized_path,
                     std::string* canonicalized_query) {
  DCHECK(url.is_valid());

  // We only canonicalize "normal" URLs.
  if (!url.IsStandard())
    return;

  // Following canonicalization steps are excluded since url parsing takes care
  // of those :-
  // 1. Remove any tab (0x09), CR (0x0d), and LF (0x0a) chars from url.
  //    (Exclude escaped version of these chars).
  // 2. Normalize hostname to 4 dot-separated decimal values.
  // 3. Lowercase hostname.
  // 4. Resolve path sequences "/../" and "/./".

  // That leaves us with the following :-
  // 1. Remove fragment in URL. The username and password are cleared in the
  //    same replacement pass.
  GURL url_without_fragment;
  GURL::Replacements f_replacements;
  f_replacements.ClearRef();
  f_replacements.ClearUsername();
  f_replacements.ClearPassword();
  url_without_fragment = url.ReplaceComponents(f_replacements);

  // 2. Do URL unescaping until no more hex encoded characters exist.
  std::string url_unescaped_str(Unescape(url_without_fragment.spec()));
  url::Parsed parsed;
  url::ParseStandardURL(url_unescaped_str.data(), url_unescaped_str.length(),
                        &parsed);

  // 3. In hostname, remove all leading and trailing dots.
  //    |host| stays empty when the parsed host component has no characters.
  base::StringPiece host;
  if (parsed.host.len > 0)
    host.set(url_unescaped_str.data() + parsed.host.begin, parsed.host.len);

  base::StringPiece host_without_end_dots =
      base::TrimString(host, ".", base::TrimPositions::TRIM_ALL);

  // 4. In hostname, replace consecutive dots with a single dot.
  std::string host_without_consecutive_dots(RemoveConsecutiveChars(
      host_without_end_dots, '.'));

  // 5. In path, replace runs of consecutive slashes with a single slash.
  //    |path| stays empty when the parsed path component has no characters.
  base::StringPiece path;
  if (parsed.path.len > 0)
    path.set(url_unescaped_str.data() + parsed.path.begin, parsed.path.len);
  std::string path_without_consecutive_slash(RemoveConsecutiveChars(path, '/'));

  // Substitute the cleaned-up host and path back into the unescaped URL.
  url::Replacements<char> hp_replacements;
  hp_replacements.SetHost(
      host_without_consecutive_dots.data(),
      url::Component(0, host_without_consecutive_dots.length()));
  hp_replacements.SetPath(
      path_without_consecutive_slash.data(),
      url::Component(0, path_without_consecutive_slash.length()));

  std::string url_unescaped_with_can_hostpath;
  url::StdStringCanonOutput output(&url_unescaped_with_can_hostpath);
  url::Parsed temp_parsed;
  url::ReplaceComponents(url_unescaped_str.data(),
                         url_unescaped_str.length(),
                         parsed,
                         hp_replacements,
                         NULL,
                         &output,
                         &temp_parsed);
  output.Complete();

  // 6. Step needed to revert escaping done in url::ReplaceComponents.
  url_unescaped_with_can_hostpath = Unescape(url_unescaped_with_can_hostpath);

  // 7. After performing all above steps, percent-escape all chars in url which
  // are <= ASCII 32, >= 127, #, %. Escapes must be uppercase hex characters.
  std::string escaped_canon_url_str(Escape(url_unescaped_with_can_hostpath));
  url::Parsed final_parsed;
  url::ParseStandardURL(escaped_canon_url_str.data(),
                        escaped_canon_url_str.length(),
                        &final_parsed);

  // Hand back only the components the caller asked for, and only when they are
  // non-empty in the final parse.
  if (canonicalized_hostname && final_parsed.host.len > 0) {
    *canonicalized_hostname =
        escaped_canon_url_str.substr(final_parsed.host.begin,
                                     final_parsed.host.len);
  }
  if (canonicalized_path && final_parsed.path.len > 0) {
    *canonicalized_path = escaped_canon_url_str.substr(final_parsed.path.begin,
                                                       final_parsed.path.len);
  }
  if (canonicalized_query && final_parsed.query.len > 0) {
    *canonicalized_query = escaped_canon_url_str.substr(
        final_parsed.query.begin, final_parsed.query.len);
  }
}
| 309 | |
kcarattini | 365d506 | 2015-12-10 06:05:29 | [diff] [blame] | 310 | void UrlToFullHashes(const GURL& url, |
| 311 | bool include_whitelist_hashes, |
| 312 | std::vector<SBFullHash>* full_hashes) { |
joenotcharles | a6c026ad | 2016-01-13 16:10:09 | [diff] [blame] | 313 | // Include this function in traces because it's not cheap so it should be |
| 314 | // called sparingly. |
joenotcharles | 81ebeb8 | 2016-01-14 06:40:38 | [diff] [blame] | 315 | TRACE_EVENT2("loader", "safe_browsing::UrlToFullHashes", "url", url.spec(), |
joenotcharles | a6c026ad | 2016-01-13 16:10:09 | [diff] [blame] | 316 | "include_whitelist_hashes", include_whitelist_hashes); |
kcarattini | 365d506 | 2015-12-10 06:05:29 | [diff] [blame] | 317 | std::vector<std::string> hosts; |
| 318 | if (url.HostIsIPAddress()) { |
| 319 | hosts.push_back(url.host()); |
| 320 | } else { |
| 321 | GenerateHostsToCheck(url, &hosts); |
| 322 | } |
| 323 | |
| 324 | std::vector<std::string> paths; |
| 325 | GeneratePathsToCheck(url, &paths); |
| 326 | |
| 327 | for (const std::string& host : hosts) { |
| 328 | for (const std::string& path : paths) { |
| 329 | full_hashes->push_back( |
| 330 | SBFullHashForString(host + path)); |
| 331 | |
| 332 | // We may have /foo as path-prefix in the whitelist which should |
| 333 | // also match with /foo/bar and /foo?bar. Hence, for every path |
| 334 | // that ends in '/' we also add the path without the slash. |
| 335 | if (include_whitelist_hashes && path.size() > 1 && |
| 336 | path[path.size() - 1] == '/') { |
| 337 | full_hashes->push_back(SBFullHashForString( |
| 338 | host + path.substr(0, path.size() - 1))); |
| 339 | } |
| 340 | } |
| 341 | } |
| 342 | } |
| 343 | |
vakh | d0a17ec | 2015-11-05 23:14:34 | [diff] [blame] | 344 | void GenerateHostsToCheck(const GURL& url, std::vector<std::string>* hosts) { |
| 345 | hosts->clear(); |
| 346 | |
| 347 | std::string canon_host; |
| 348 | CanonicalizeUrl(url, &canon_host, NULL, NULL); |
| 349 | |
| 350 | const std::string host = canon_host; // const sidesteps GCC bugs below! |
| 351 | if (host.empty()) |
| 352 | return; |
| 353 | |
| 354 | // Per the Safe Browsing Protocol v2 spec, we try the host, and also up to 4 |
| 355 | // hostnames formed by starting with the last 5 components and successively |
| 356 | // removing the leading component. The last component isn't examined alone, |
| 357 | // since it's the TLD or a subcomponent thereof. |
| 358 | // |
| 359 | // Note that we don't need to be clever about stopping at the "real" eTLD -- |
| 360 | // the data on the server side has been filtered to ensure it will not |
| 361 | // blacklist a whole TLD, and it's not significantly slower on our side to |
| 362 | // just check too much. |
| 363 | // |
| 364 | // Also note that because we have a simple blacklist, not some sort of complex |
| 365 | // whitelist-in-blacklist or vice versa, it doesn't matter what order we check |
| 366 | // these in. |
| 367 | const size_t kMaxHostsToCheck = 4; |
| 368 | bool skipped_last_component = false; |
| 369 | for (std::string::const_reverse_iterator i(host.rbegin()); |
| 370 | i != host.rend() && hosts->size() < kMaxHostsToCheck; ++i) { |
| 371 | if (*i == '.') { |
| 372 | if (skipped_last_component) |
| 373 | hosts->push_back(std::string(i.base(), host.end())); |
| 374 | else |
| 375 | skipped_last_component = true; |
| 376 | } |
| 377 | } |
| 378 | hosts->push_back(host); |
| 379 | } |
| 380 | |
| 381 | void GeneratePathsToCheck(const GURL& url, std::vector<std::string>* paths) { |
| 382 | paths->clear(); |
| 383 | |
| 384 | std::string canon_path; |
| 385 | std::string canon_query; |
| 386 | CanonicalizeUrl(url, NULL, &canon_path, &canon_query); |
| 387 | |
| 388 | const std::string path = canon_path; // const sidesteps GCC bugs below! |
| 389 | const std::string query = canon_query; |
| 390 | if (path.empty()) |
| 391 | return; |
| 392 | |
| 393 | // Per the Safe Browsing Protocol v2 spec, we try the exact path with/without |
| 394 | // the query parameters, and also up to 4 paths formed by starting at the root |
| 395 | // and adding more path components. |
| 396 | // |
| 397 | // As with the hosts above, it doesn't matter what order we check these in. |
| 398 | const size_t kMaxPathsToCheck = 4; |
| 399 | for (std::string::const_iterator i(path.begin()); |
| 400 | i != path.end() && paths->size() < kMaxPathsToCheck; ++i) { |
| 401 | if (*i == '/') |
| 402 | paths->push_back(std::string(path.begin(), i + 1)); |
| 403 | } |
| 404 | |
| 405 | if (!paths->empty() && paths->back() != path) |
| 406 | paths->push_back(path); |
| 407 | |
| 408 | if (!query.empty()) |
| 409 | paths->push_back(path + "?" + query); |
| 410 | } |
| 411 | |
| 412 | void GeneratePatternsToCheck(const GURL& url, std::vector<std::string>* urls) { |
| 413 | std::vector<std::string> hosts, paths; |
| 414 | GenerateHostsToCheck(url, &hosts); |
| 415 | GeneratePathsToCheck(url, &paths); |
| 416 | for (size_t h = 0; h < hosts.size(); ++h) { |
| 417 | for (size_t p = 0; p < paths.size(); ++p) { |
| 418 | urls->push_back(hosts[h] + paths[p]); |
| 419 | } |
| 420 | } |
| 421 | } |
| 422 | |
| 423 | } // namespace safe_browsing |