// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
[email protected]63f1a9e2012-05-09 12:51:005#include "chrome/common/extensions/matcher/url_matcher.h"
[email protected]fb5bcc02012-02-17 14:05:426
7#include <algorithm>
[email protected]9947d0e2012-02-23 22:36:538#include <iterator>
[email protected]fb5bcc02012-02-17 14:05:429
10#include "base/logging.h"
[email protected]c640fd7c2012-08-17 08:19:2511#include "content/public/common/url_constants.h"
[email protected]fb5bcc02012-02-17 14:05:4212#include "googleurl/src/gurl.h"
[email protected]c640fd7c2012-08-17 08:19:2513#include "googleurl/src/url_canon.h"
[email protected]fb5bcc02012-02-17 14:05:4214
15namespace extensions {
16
// This set of classes implements a mapping of URL Component Patterns, such as
// host_prefix, host_suffix, host_equals, etc., to StringPatterns
// for use in substring comparisons.
[email protected]fb5bcc02012-02-17 14:05:4220//
21// The idea of this mapping is to reduce the problem of comparing many
22// URL Component Patterns against one URL to the problem of searching many
23// substrings in one string:
24//
// ----------------------                    -----------------
// | URL Query operator | ----translate----> | StringPattern |
// ----------------------                    -----------------
//                                                   ^
//                                                   |
//                                                compare
//                                                   |
//                                                   v
// ----------------------                    -----------------
// | URL to compare     |                    |               |
// | to all URL Query   | ----translate----> | String        |
// | operators          |                    |               |
// ----------------------                    -----------------
[email protected]fb5bcc02012-02-17 14:05:4238//
39// The reason for this problem reduction is that there are efficient algorithms
40// for searching many substrings in one string (see Aho-Corasick algorithm).
41//
[email protected]5bcf3b72012-09-14 00:20:2842// Additionally, some of the same pieces are reused to implement regular
43// expression comparisons. The FilteredRE2 implementation for matching many
44// regular expressions against one string uses prefiltering, in which a set
45// of substrings (derived from the regexes) are first searched for, to reduce
46// the number of regular expressions to test; the prefiltering step also
47// uses Aho-Corasick.
48//
[email protected]fb5bcc02012-02-17 14:05:4249// Case 1: {host,path,query}_{prefix,suffix,equals} searches.
50// ==========================================================
51//
52// For searches in this class, we normalize URLs as follows:
53//
54// Step 1:
// Remove scheme, port and fragment from URL:
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
//    www.example.com/index.html?search=foo
//
// We remove the scheme and port number because they can be checked later
// in a secondary filter step. We remove the fragment (the #... part) because
// this is not guaranteed to be ASCII-7 encoded.
62//
63// Step 2:
64// Translate URL to String and add the following position markers:
65// - BU = Beginning of URL
66// - ED = End of Domain
67// - EP = End of Path
68// - EU = End of URL
69// Furthermore, the hostname is canonicalized to start with a ".".
70//
71// Position markers are represented as characters >127, which are therefore
72// guaranteed not to be part of the ASCII-7 encoded URL character set.
73//
74// -> www.example.com/index.html?search=foo becomes
75// BU .www.example.com ED /index.html EP ?search=foo EU
76//
77// -> www.example.com/index.html becomes
78// BU .www.example.com ED /index.html EP EU
79//
80// Step 3:
81// Translate URL Component Patterns as follows:
82//
83// host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
84// -> host_prefix("www.example") = BU .www.example
85//
86// host_suffix(suffix) = suffix ED
87// -> host_suffix("example.com") = example.com ED
88// -> host_suffix(".example.com") = .example.com ED
89//
90// host_equals(domain) = BU add_missing_dot_prefix(domain) ED
91// -> host_equals("www.example.com") = BU .www.example.com ED
92//
// Similarly for path and query parameters
// ({path, query}_{prefix, suffix, equals}).
94//
[email protected]5bcf3b72012-09-14 00:20:2895// With this, we can search the StringPatterns in the normalized URL.
[email protected]fb5bcc02012-02-17 14:05:4296//
97//
98// Case 2: url_{prefix,suffix,equals,contains} searches.
99// =====================================================
100//
[email protected]c640fd7c2012-08-17 08:19:25101// Step 1: as above, except that
102// - the scheme is not removed
103// - the port is not removed if it is specified and does not match the default
104// port for the given scheme.
[email protected]fb5bcc02012-02-17 14:05:42105//
106// Step 2:
107// Translate URL to String and add the following position markers:
108// - BU = Beginning of URL
109// - EU = End of URL
[email protected]fb5bcc02012-02-17 14:05:42110//
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
//    BU http://www.example.com:8080/index.html?search=foo EU
// -> http://www.example.com:80/index.html?search=foo#first_match becomes
//    BU http://www.example.com/index.html?search=foo EU
[email protected]fb5bcc02012-02-17 14:05:42115//
// url_prefix(prefix) = BU prefix
// -> url_prefix("http://www.example") = BU http://www.example
[email protected]fb5bcc02012-02-17 14:05:42118//
119// url_contains(substring) = substring
120// -> url_contains("index") = index
121//
122//
123// Case 3: {host,path,query}_contains searches.
124// ============================================
125//
126// These kinds of searches are not supported directly but can be derived
127// by a combination of a url_contains() query followed by an explicit test:
128//
129// host_contains(str) = url_contains(str) followed by test whether str occurs
[email protected]c640fd7c2012-08-17 08:19:25130// in host component of original URL.
[email protected]fb5bcc02012-02-17 14:05:42131// -> host_contains("example.co") = example.co
132// followed by gurl.host().find("example.co");
133//
134// [similarly for path_contains and query_contains].
[email protected]5bcf3b72012-09-14 00:20:28135//
136//
137// Regular expression matching (url_matches searches)
138// ==================================================
139//
140// This class also supports matching regular expressions (RE2 syntax)
141// against full URLs, which are transformed as in case 2.
[email protected]fb5bcc02012-02-17 14:05:42142
[email protected]5bcf3b72012-09-14 00:20:28143namespace {
144
145bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
146 return criterion == URLMatcherCondition::URL_MATCHES;
147}
148
149} // namespace
[email protected]fb5bcc02012-02-17 14:05:42150
151//
152// URLMatcherCondition
153//
154
155URLMatcherCondition::URLMatcherCondition()
156 : criterion_(HOST_PREFIX),
[email protected]5bcf3b72012-09-14 00:20:28157 string_pattern_(NULL) {}
[email protected]fb5bcc02012-02-17 14:05:42158
159URLMatcherCondition::~URLMatcherCondition() {}
160
161URLMatcherCondition::URLMatcherCondition(
162 Criterion criterion,
[email protected]5bcf3b72012-09-14 00:20:28163 const StringPattern* string_pattern)
[email protected]fb5bcc02012-02-17 14:05:42164 : criterion_(criterion),
[email protected]5bcf3b72012-09-14 00:20:28165 string_pattern_(string_pattern) {}
[email protected]fb5bcc02012-02-17 14:05:42166
167URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
168 : criterion_(rhs.criterion_),
[email protected]5bcf3b72012-09-14 00:20:28169 string_pattern_(rhs.string_pattern_) {}
[email protected]fb5bcc02012-02-17 14:05:42170
171URLMatcherCondition& URLMatcherCondition::operator=(
172 const URLMatcherCondition& rhs) {
173 criterion_ = rhs.criterion_;
[email protected]5bcf3b72012-09-14 00:20:28174 string_pattern_ = rhs.string_pattern_;
[email protected]fb5bcc02012-02-17 14:05:42175 return *this;
176}
177
178bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
179 if (criterion_ < rhs.criterion_) return true;
180 if (criterion_ > rhs.criterion_) return false;
[email protected]5bcf3b72012-09-14 00:20:28181 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
182 return *string_pattern_ < *rhs.string_pattern_;
183 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
184 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
[email protected]fb5bcc02012-02-17 14:05:42185 // or both are NULL.
186 return false;
187}
188
189bool URLMatcherCondition::IsFullURLCondition() const {
190 // For these criteria the SubstringMatcher needs to be executed on the
[email protected]c640fd7c2012-08-17 08:19:25191 // GURL that is canonicalized with
[email protected]fb5bcc02012-02-17 14:05:42192 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
193 switch (criterion_) {
194 case HOST_CONTAINS:
195 case PATH_CONTAINS:
196 case QUERY_CONTAINS:
197 case URL_PREFIX:
198 case URL_SUFFIX:
199 case URL_CONTAINS:
200 case URL_EQUALS:
201 return true;
202 default:
203 break;
204 }
205 return false;
206}
207
[email protected]5bcf3b72012-09-14 00:20:28208bool URLMatcherCondition::IsRegexCondition() const {
209 return IsRegexCriterion(criterion_);
210}
211
[email protected]fb5bcc02012-02-17 14:05:42212bool URLMatcherCondition::IsMatch(
[email protected]5bcf3b72012-09-14 00:20:28213 const std::set<StringPattern::ID>& matching_patterns,
[email protected]fb5bcc02012-02-17 14:05:42214 const GURL& url) const {
[email protected]5bcf3b72012-09-14 00:20:28215 DCHECK(string_pattern_);
216 if (!ContainsKey(matching_patterns, string_pattern_->id()))
[email protected]fb5bcc02012-02-17 14:05:42217 return false;
218 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
219 // a substring match on the raw URL. In case of a match, we need to verify
220 // that the match was found in the correct component of the URL.
221 switch (criterion_) {
222 case HOST_CONTAINS:
[email protected]5bcf3b72012-09-14 00:20:28223 return url.host().find(string_pattern_->pattern()) !=
[email protected]fb5bcc02012-02-17 14:05:42224 std::string::npos;
225 case PATH_CONTAINS:
[email protected]5bcf3b72012-09-14 00:20:28226 return url.path().find(string_pattern_->pattern()) !=
[email protected]fb5bcc02012-02-17 14:05:42227 std::string::npos;
228 case QUERY_CONTAINS:
[email protected]5bcf3b72012-09-14 00:20:28229 return url.query().find(string_pattern_->pattern()) !=
[email protected]fb5bcc02012-02-17 14:05:42230 std::string::npos;
231 default:
232 break;
233 }
234 return true;
235}
236
237//
238// URLMatcherConditionFactory
239//
240
namespace {
// Position markers inserted into canonicalized URLs. Each one is a
// NUL-terminated, single-character string whose character lies outside of
// 7-bit ASCII, so it is not contained in the 7-bit ASCII used in GURLs.
const char kBeginningOfURL[] = { static_cast<char>(-1), '\0' };
const char kEndOfDomain[] = { static_cast<char>(-2), '\0' };
const char kEndOfPath[] = { static_cast<char>(-3), '\0' };
const char kEndOfURL[] = { static_cast<char>(-4), '\0' };
}  // namespace
248
249URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
250
251URLMatcherConditionFactory::~URLMatcherConditionFactory() {
[email protected]5bcf3b72012-09-14 00:20:28252 STLDeleteElements(&substring_pattern_singletons_);
253 STLDeleteElements(&regex_pattern_singletons_);
[email protected]fb5bcc02012-02-17 14:05:42254}
255
256std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
[email protected]801ae922013-01-22 20:45:26257 const GURL& url) const {
[email protected]fb5bcc02012-02-17 14:05:42258 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
259 url.path() + kEndOfPath + (url.has_query() ? "?" + url.query() : "") +
260 kEndOfURL;
261}
262
263URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
264 const std::string& prefix) {
265 return CreateCondition(URLMatcherCondition::HOST_PREFIX,
266 kBeginningOfURL + CanonicalizeHostname(prefix));
267}
268
269URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
270 const std::string& suffix) {
271 return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
272 suffix + kEndOfDomain);
273}
274
275URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
276 const std::string& str) {
277 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
278}
279
280URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
281 const std::string& str) {
282 return CreateCondition(URLMatcherCondition::HOST_EQUALS,
283 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
284}
285
286URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
287 const std::string& prefix) {
288 return CreateCondition(URLMatcherCondition::PATH_PREFIX,
289 kEndOfDomain + prefix);
290}
291
292URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
293 const std::string& suffix) {
[email protected]6d8e5e42012-07-17 15:54:45294 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
[email protected]fb5bcc02012-02-17 14:05:42295}
296
297URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
298 const std::string& str) {
299 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
300}
301
302URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
303 const std::string& str) {
304 return CreateCondition(URLMatcherCondition::PATH_EQUALS,
305 kEndOfDomain + str + kEndOfPath);
306}
307
308URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
309 const std::string& prefix) {
[email protected]ea6249b2012-12-06 18:45:20310 std::string pattern;
311 if (!prefix.empty() && prefix[0] == '?')
312 pattern = kEndOfPath + prefix;
313 else
314 pattern = kEndOfPath + ('?' + prefix);
315
316 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
[email protected]fb5bcc02012-02-17 14:05:42317}
318
319URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
320 const std::string& suffix) {
[email protected]ea6249b2012-12-06 18:45:20321 if (!suffix.empty() && suffix[0] == '?') {
322 return CreateQueryEqualsCondition(suffix);
323 } else {
324 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
325 suffix + kEndOfURL);
326 }
[email protected]fb5bcc02012-02-17 14:05:42327}
328
329URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
330 const std::string& str) {
[email protected]ea6249b2012-12-06 18:45:20331 if (!str.empty() && str[0] == '?')
332 return CreateQueryPrefixCondition(str);
333 else
334 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
[email protected]fb5bcc02012-02-17 14:05:42335}
336
337URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
338 const std::string& str) {
[email protected]ea6249b2012-12-06 18:45:20339 std::string pattern;
340 if (!str.empty() && str[0] == '?')
341 pattern = kEndOfPath + str + kEndOfURL;
342 else
343 pattern = kEndOfPath + ('?' + str) + kEndOfURL;
344
345 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
[email protected]fb5bcc02012-02-17 14:05:42346}
347
348URLMatcherCondition
349 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
350 const std::string& host_suffix,
351 const std::string& path_prefix) {
352 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
353 host_suffix + kEndOfDomain + path_prefix);
354}
355
[email protected]6d8e5e42012-07-17 15:54:45356URLMatcherCondition
357URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
358 const std::string& host,
359 const std::string& path_prefix) {
360 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
361 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
362 path_prefix);
363}
364
[email protected]fb5bcc02012-02-17 14:05:42365std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
[email protected]801ae922013-01-22 20:45:26366 const GURL& url) const {
[email protected]c640fd7c2012-08-17 08:19:25367 GURL::Replacements replacements;
368 replacements.ClearPassword();
369 replacements.ClearUsername();
370 replacements.ClearRef();
371 // Clear port if it is implicit from scheme.
372 if (url.has_port()) {
373 const std::string& port = url.scheme();
374 if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
375 url.EffectiveIntPort()) {
376 replacements.ClearPort();
377 }
378 }
379 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
380 kEndOfURL;
[email protected]fb5bcc02012-02-17 14:05:42381}
382
[email protected]5bcf3b72012-09-14 00:20:28383std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
[email protected]801ae922013-01-22 20:45:26384 const GURL& url) const {
[email protected]5bcf3b72012-09-14 00:20:28385 GURL::Replacements replacements;
386 replacements.ClearPassword();
387 replacements.ClearUsername();
388 replacements.ClearRef();
389 // Clear port if it is implicit from scheme.
390 if (url.has_port()) {
391 const std::string& port = url.scheme();
392 if (url_canon::DefaultPortForScheme(port.c_str(), port.size()) ==
393 url.EffectiveIntPort()) {
394 replacements.ClearPort();
395 }
396 }
397 return url.ReplaceComponents(replacements).spec();
398}
399
[email protected]fb5bcc02012-02-17 14:05:42400URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
401 const std::string& prefix) {
402 return CreateCondition(URLMatcherCondition::URL_PREFIX,
[email protected]c640fd7c2012-08-17 08:19:25403 kBeginningOfURL + prefix);
[email protected]fb5bcc02012-02-17 14:05:42404}
405
406URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
407 const std::string& suffix) {
408 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
409}
410
411URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
412 const std::string& str) {
413 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
414}
415
416URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
417 const std::string& str) {
[email protected]6d8e5e42012-07-17 15:54:45418 return CreateCondition(URLMatcherCondition::URL_EQUALS,
[email protected]c640fd7c2012-08-17 08:19:25419 kBeginningOfURL + str + kEndOfURL);
[email protected]fb5bcc02012-02-17 14:05:42420}
421
[email protected]5bcf3b72012-09-14 00:20:28422URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
423 const std::string& regex) {
424 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
425}
426
[email protected]fb5bcc02012-02-17 14:05:42427void URLMatcherConditionFactory::ForgetUnusedPatterns(
[email protected]5bcf3b72012-09-14 00:20:28428 const std::set<StringPattern::ID>& used_patterns) {
429 PatternSingletons::iterator i = substring_pattern_singletons_.begin();
430 while (i != substring_pattern_singletons_.end()) {
[email protected]2fb51d92012-02-17 15:05:47431 if (used_patterns.find((*i)->id()) != used_patterns.end()) {
[email protected]fb5bcc02012-02-17 14:05:42432 ++i;
[email protected]2fb51d92012-02-17 15:05:47433 } else {
434 delete *i;
[email protected]5bcf3b72012-09-14 00:20:28435 substring_pattern_singletons_.erase(i++);
436 }
437 }
438 i = regex_pattern_singletons_.begin();
439 while (i != regex_pattern_singletons_.end()) {
440 if (used_patterns.find((*i)->id()) != used_patterns.end()) {
441 ++i;
442 } else {
443 delete *i;
444 regex_pattern_singletons_.erase(i++);
[email protected]2fb51d92012-02-17 15:05:47445 }
[email protected]fb5bcc02012-02-17 14:05:42446 }
447}
448
[email protected]357c4db2012-03-29 07:51:57449bool URLMatcherConditionFactory::IsEmpty() const {
[email protected]5bcf3b72012-09-14 00:20:28450 return substring_pattern_singletons_.empty() &&
451 regex_pattern_singletons_.empty();
[email protected]357c4db2012-03-29 07:51:57452}
453
[email protected]fb5bcc02012-02-17 14:05:42454URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
455 URLMatcherCondition::Criterion criterion,
456 const std::string& pattern) {
[email protected]5bcf3b72012-09-14 00:20:28457 StringPattern search_pattern(pattern, 0);
458 PatternSingletons* pattern_singletons =
459 IsRegexCriterion(criterion) ? &regex_pattern_singletons_
460 : &substring_pattern_singletons_;
461
[email protected]fb5bcc02012-02-17 14:05:42462 PatternSingletons::const_iterator iter =
[email protected]5bcf3b72012-09-14 00:20:28463 pattern_singletons->find(&search_pattern);
464
465 if (iter != pattern_singletons->end()) {
[email protected]fb5bcc02012-02-17 14:05:42466 return URLMatcherCondition(criterion, *iter);
467 } else {
[email protected]5bcf3b72012-09-14 00:20:28468 StringPattern* new_pattern =
469 new StringPattern(pattern, id_counter_++);
470 pattern_singletons->insert(new_pattern);
[email protected]fb5bcc02012-02-17 14:05:42471 return URLMatcherCondition(criterion, new_pattern);
472 }
473}
474
475std::string URLMatcherConditionFactory::CanonicalizeHostname(
476 const std::string& hostname) const {
477 if (!hostname.empty() && hostname[0] == '.')
478 return hostname;
479 else
480 return "." + hostname;
481}
482
[email protected]5bcf3b72012-09-14 00:20:28483bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
484 StringPattern* lhs,
485 StringPattern* rhs) const {
[email protected]fb5bcc02012-02-17 14:05:42486 if (lhs == NULL && rhs != NULL) return true;
487 if (lhs != NULL && rhs != NULL)
488 return lhs->pattern() < rhs->pattern();
489 // Either both are NULL or only rhs is NULL.
490 return false;
491}
492
493//
[email protected]faceb0f2012-04-12 17:07:19494// URLMatcherSchemeFilter
495//
496
497URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
498 : filters_(1) {
499 filters_.push_back(filter);
500}
501
502URLMatcherSchemeFilter::URLMatcherSchemeFilter(
503 const std::vector<std::string>& filters)
504 : filters_(filters) {}
505
506URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
507
508bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
509 return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
510 filters_.end();
511}
512
513//
[email protected]00520a52012-04-12 18:30:47514// URLMatcherPortFilter
515//
516
517URLMatcherPortFilter::URLMatcherPortFilter(
518 const std::vector<URLMatcherPortFilter::Range>& ranges)
519 : ranges_(ranges) {}
520
521URLMatcherPortFilter::~URLMatcherPortFilter() {}
522
523bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
524 int port = url.EffectiveIntPort();
525 for (std::vector<Range>::const_iterator i = ranges_.begin();
526 i != ranges_.end(); ++i) {
527 if (i->first <= port && port <= i->second)
528 return true;
529 }
530 return false;
531}
532
533// static
534URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
535 int to) {
536 return Range(from, to);
537}
538
539// static
540URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
541 return Range(port, port);
542}
543
544//
[email protected]fb5bcc02012-02-17 14:05:42545// URLMatcherConditionSet
546//
547
[email protected]fb5bcc02012-02-17 14:05:42548URLMatcherConditionSet::~URLMatcherConditionSet() {}
549
550URLMatcherConditionSet::URLMatcherConditionSet(
551 ID id,
552 const Conditions& conditions)
553 : id_(id),
554 conditions_(conditions) {}
555
[email protected]faceb0f2012-04-12 17:07:19556URLMatcherConditionSet::URLMatcherConditionSet(
557 ID id,
558 const Conditions& conditions,
[email protected]00520a52012-04-12 18:30:47559 scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
560 scoped_ptr<URLMatcherPortFilter> port_filter)
[email protected]faceb0f2012-04-12 17:07:19561 : id_(id),
562 conditions_(conditions),
[email protected]00520a52012-04-12 18:30:47563 scheme_filter_(scheme_filter.Pass()),
564 port_filter_(port_filter.Pass()) {}
[email protected]faceb0f2012-04-12 17:07:19565
[email protected]fb5bcc02012-02-17 14:05:42566bool URLMatcherConditionSet::IsMatch(
[email protected]5bcf3b72012-09-14 00:20:28567 const std::set<StringPattern::ID>& matching_patterns,
[email protected]fb5bcc02012-02-17 14:05:42568 const GURL& url) const {
569 for (Conditions::const_iterator i = conditions_.begin();
[email protected]3b001a02012-04-05 10:38:06570 i != conditions_.end(); ++i) {
[email protected]5bcf3b72012-09-14 00:20:28571 if (!i->IsMatch(matching_patterns, url))
[email protected]fb5bcc02012-02-17 14:05:42572 return false;
573 }
[email protected]faceb0f2012-04-12 17:07:19574 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
575 return false;
[email protected]00520a52012-04-12 18:30:47576 if (port_filter_.get() && !port_filter_->IsMatch(url))
577 return false;
[email protected]fb5bcc02012-02-17 14:05:42578 return true;
579}
580
[email protected]fb5bcc02012-02-17 14:05:42581//
582// URLMatcher
583//
584
585URLMatcher::URLMatcher() {}
586
587URLMatcher::~URLMatcher() {}
588
589void URLMatcher::AddConditionSets(
[email protected]3b001a02012-04-05 10:38:06590 const URLMatcherConditionSet::Vector& condition_sets) {
591 for (URLMatcherConditionSet::Vector::const_iterator i =
592 condition_sets.begin(); i != condition_sets.end(); ++i) {
593 DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
594 url_matcher_condition_sets_.end());
595 url_matcher_condition_sets_[(*i)->id()] = *i;
[email protected]fb5bcc02012-02-17 14:05:42596 }
597 UpdateInternalDatastructures();
598}
599
600void URLMatcher::RemoveConditionSets(
601 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
602 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
[email protected]3b001a02012-04-05 10:38:06603 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
[email protected]fb5bcc02012-02-17 14:05:42604 DCHECK(url_matcher_condition_sets_.find(*i) !=
605 url_matcher_condition_sets_.end());
606 url_matcher_condition_sets_.erase(*i);
607 }
608 UpdateInternalDatastructures();
609}
610
[email protected]d552b432012-03-29 07:42:32611void URLMatcher::ClearUnusedConditionSets() {
612 UpdateConditionFactory();
613}
614
[email protected]801ae922013-01-22 20:45:26615std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
616 const GURL& url) const {
[email protected]5bcf3b72012-09-14 00:20:28617 // Find all IDs of StringPatterns that match |url|.
[email protected]fb5bcc02012-02-17 14:05:42618 // See URLMatcherConditionFactory for the canonicalization of URLs and the
619 // distinction between full url searches and url component searches.
[email protected]5bcf3b72012-09-14 00:20:28620 std::set<StringPattern::ID> matches;
[email protected]fb5bcc02012-02-17 14:05:42621 full_url_matcher_.Match(
622 condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
623 url_component_matcher_.Match(
624 condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
[email protected]5bcf3b72012-09-14 00:20:28625 regex_set_matcher_.Match(
626 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
[email protected]fb5bcc02012-02-17 14:05:42627
628 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
629 // were fulfilled.
630 std::set<URLMatcherConditionSet::ID> result;
[email protected]5bcf3b72012-09-14 00:20:28631 for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
[email protected]3b001a02012-04-05 10:38:06632 i != matches.end(); ++i) {
[email protected]fb5bcc02012-02-17 14:05:42633 // For each URLMatcherConditionSet there is exactly one condition
634 // registered in substring_match_triggers_. This means that the following
635 // logic tests each URLMatcherConditionSet exactly once if it can be
636 // completely fulfilled.
[email protected]801ae922013-01-22 20:45:26637 StringPatternTriggers::const_iterator triggered_condition_sets_iter =
638 substring_match_triggers_.find(*i);
639 if (triggered_condition_sets_iter == substring_match_triggers_.end())
640 continue; // Not all substring matches are triggers for a condition set.
641 const std::set<URLMatcherConditionSet::ID>& condition_sets =
642 triggered_condition_sets_iter->second;
[email protected]fb5bcc02012-02-17 14:05:42643 for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
[email protected]3b001a02012-04-05 10:38:06644 condition_sets.begin(); j != condition_sets.end(); ++j) {
[email protected]801ae922013-01-22 20:45:26645 URLMatcherConditionSets::const_iterator condition_set_iter =
646 url_matcher_condition_sets_.find(*j);
647 DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
648 if (condition_set_iter->second->IsMatch(matches, url))
[email protected]fb5bcc02012-02-17 14:05:42649 result.insert(*j);
650 }
651 }
652
653 return result;
654}
655
[email protected]357c4db2012-03-29 07:51:57656bool URLMatcher::IsEmpty() const {
657 return condition_factory_.IsEmpty() &&
658 url_matcher_condition_sets_.empty() &&
659 substring_match_triggers_.empty() &&
660 full_url_matcher_.IsEmpty() &&
661 url_component_matcher_.IsEmpty() &&
662 registered_full_url_patterns_.empty() &&
663 registered_url_component_patterns_.empty();
664}
665
[email protected]fb5bcc02012-02-17 14:05:42666void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
667 // The purpose of |full_url_conditions| is just that we need to execute
668 // the same logic once for Full URL searches and once for URL Component
669 // searches (see URLMatcherConditionFactory).
670
671 // Determine which patterns need to be registered when this function
672 // terminates.
[email protected]5bcf3b72012-09-14 00:20:28673 std::set<const StringPattern*> new_patterns;
[email protected]fb5bcc02012-02-17 14:05:42674 for (URLMatcherConditionSets::const_iterator condition_set_iter =
675 url_matcher_condition_sets_.begin();
676 condition_set_iter != url_matcher_condition_sets_.end();
677 ++condition_set_iter) {
678 const URLMatcherConditionSet::Conditions& conditions =
[email protected]3b001a02012-04-05 10:38:06679 condition_set_iter->second->conditions();
[email protected]fb5bcc02012-02-17 14:05:42680 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
[email protected]3b001a02012-04-05 10:38:06681 conditions.begin(); condition_iter != conditions.end();
682 ++condition_iter) {
[email protected]5bcf3b72012-09-14 00:20:28683 // If we are called to process Full URL searches, ignore others, and
684 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
685 if (!condition_iter->IsRegexCondition() &&
686 full_url_conditions == condition_iter->IsFullURLCondition())
687 new_patterns.insert(condition_iter->string_pattern());
[email protected]fb5bcc02012-02-17 14:05:42688 }
689 }
690
691 // This is the set of patterns that were registered before this function
692 // is called.
[email protected]5bcf3b72012-09-14 00:20:28693 std::set<const StringPattern*>& registered_patterns =
[email protected]fb5bcc02012-02-17 14:05:42694 full_url_conditions ? registered_full_url_patterns_
695 : registered_url_component_patterns_;
696
697 // Add all patterns that are in new_patterns but not in registered_patterns.
[email protected]5bcf3b72012-09-14 00:20:28698 std::vector<const StringPattern*> patterns_to_register;
[email protected]fb5bcc02012-02-17 14:05:42699 std::set_difference(
700 new_patterns.begin(), new_patterns.end(),
701 registered_patterns.begin(), registered_patterns.end(),
702 std::back_inserter(patterns_to_register));
703
704 // Remove all patterns that are in registered_patterns but not in
705 // new_patterns.
[email protected]5bcf3b72012-09-14 00:20:28706 std::vector<const StringPattern*> patterns_to_unregister;
[email protected]fb5bcc02012-02-17 14:05:42707 std::set_difference(
708 registered_patterns.begin(), registered_patterns.end(),
709 new_patterns.begin(), new_patterns.end(),
710 std::back_inserter(patterns_to_unregister));
711
712 // Update the SubstringSetMatcher.
713 SubstringSetMatcher& url_matcher =
714 full_url_conditions ? full_url_matcher_ : url_component_matcher_;
715 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
716 patterns_to_unregister);
717
718 // Update the set of registered_patterns for the next time this function
719 // is being called.
720 registered_patterns.swap(new_patterns);
721}
722
[email protected]5bcf3b72012-09-14 00:20:28723void URLMatcher::UpdateRegexSetMatcher() {
724 std::vector<const StringPattern*> new_patterns;
725
[email protected]fb5bcc02012-02-17 14:05:42726 for (URLMatcherConditionSets::const_iterator condition_set_iter =
727 url_matcher_condition_sets_.begin();
728 condition_set_iter != url_matcher_condition_sets_.end();
729 ++condition_set_iter) {
730 const URLMatcherConditionSet::Conditions& conditions =
[email protected]3b001a02012-04-05 10:38:06731 condition_set_iter->second->conditions();
[email protected]fb5bcc02012-02-17 14:05:42732 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
[email protected]3b001a02012-04-05 10:38:06733 conditions.begin(); condition_iter != conditions.end();
734 ++condition_iter) {
[email protected]5bcf3b72012-09-14 00:20:28735 if (condition_iter->IsRegexCondition())
736 new_patterns.push_back(condition_iter->string_pattern());
737 }
738 }
739
740 // Start over from scratch. We can't really do better than this, since the
741 // FilteredRE2 backend doesn't support incremental updates.
742 regex_set_matcher_.ClearPatterns();
743 regex_set_matcher_.AddPatterns(new_patterns);
744}
745
746void URLMatcher::UpdateTriggers() {
747 // Count substring pattern frequencies.
748 std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
749 for (URLMatcherConditionSets::const_iterator condition_set_iter =
750 url_matcher_condition_sets_.begin();
751 condition_set_iter != url_matcher_condition_sets_.end();
752 ++condition_set_iter) {
753 const URLMatcherConditionSet::Conditions& conditions =
754 condition_set_iter->second->conditions();
755 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
756 conditions.begin(); condition_iter != conditions.end();
757 ++condition_iter) {
758 const StringPattern* pattern = condition_iter->string_pattern();
[email protected]fb5bcc02012-02-17 14:05:42759 substring_pattern_frequencies[pattern->id()]++;
760 }
761 }
762
763 // Update trigger conditions: Determine for each URLMatcherConditionSet which
[email protected]5bcf3b72012-09-14 00:20:28764 // URLMatcherCondition contains a StringPattern that occurs least
[email protected]fb5bcc02012-02-17 14:05:42765 // frequently in this URLMatcher. We assume that this condition is very
766 // specific and occurs rarely in URLs. If a match occurs for this
767 // URLMatcherCondition, we want to test all other URLMatcherCondition in the
768 // respective URLMatcherConditionSet as well to see whether the entire
769 // URLMatcherConditionSet is considered matching.
770 substring_match_triggers_.clear();
771 for (URLMatcherConditionSets::const_iterator condition_set_iter =
772 url_matcher_condition_sets_.begin();
773 condition_set_iter != url_matcher_condition_sets_.end();
774 ++condition_set_iter) {
775 const URLMatcherConditionSet::Conditions& conditions =
[email protected]3b001a02012-04-05 10:38:06776 condition_set_iter->second->conditions();
[email protected]fb5bcc02012-02-17 14:05:42777 if (conditions.empty())
778 continue;
779 URLMatcherConditionSet::Conditions::const_iterator condition_iter =
780 conditions.begin();
[email protected]5bcf3b72012-09-14 00:20:28781 StringPattern::ID trigger = condition_iter->string_pattern()->id();
[email protected]fb5bcc02012-02-17 14:05:42782 // We skip the first element in the following loop.
783 ++condition_iter;
784 for (; condition_iter != conditions.end(); ++condition_iter) {
[email protected]5bcf3b72012-09-14 00:20:28785 StringPattern::ID current_id =
786 condition_iter->string_pattern()->id();
[email protected]fb5bcc02012-02-17 14:05:42787 if (substring_pattern_frequencies[trigger] >
788 substring_pattern_frequencies[current_id]) {
789 trigger = current_id;
790 }
791 }
[email protected]3b001a02012-04-05 10:38:06792 substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
[email protected]fb5bcc02012-02-17 14:05:42793 }
794}
795
796void URLMatcher::UpdateConditionFactory() {
[email protected]5bcf3b72012-09-14 00:20:28797 std::set<StringPattern::ID> used_patterns;
[email protected]fb5bcc02012-02-17 14:05:42798 for (URLMatcherConditionSets::const_iterator condition_set_iter =
799 url_matcher_condition_sets_.begin();
800 condition_set_iter != url_matcher_condition_sets_.end();
801 ++condition_set_iter) {
802 const URLMatcherConditionSet::Conditions& conditions =
[email protected]3b001a02012-04-05 10:38:06803 condition_set_iter->second->conditions();
[email protected]fb5bcc02012-02-17 14:05:42804 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
[email protected]3b001a02012-04-05 10:38:06805 conditions.begin(); condition_iter != conditions.end();
806 ++condition_iter) {
[email protected]5bcf3b72012-09-14 00:20:28807 used_patterns.insert(condition_iter->string_pattern()->id());
[email protected]fb5bcc02012-02-17 14:05:42808 }
809 }
810 condition_factory_.ForgetUnusedPatterns(used_patterns);
811}
812
813void URLMatcher::UpdateInternalDatastructures() {
814 UpdateSubstringSetMatcher(false);
815 UpdateSubstringSetMatcher(true);
[email protected]5bcf3b72012-09-14 00:20:28816 UpdateRegexSetMatcher();
[email protected]fb5bcc02012-02-17 14:05:42817 UpdateTriggers();
818 UpdateConditionFactory();
819}
820
821} // namespace extensions