blob: 5cdf4a423cc9fa157b1390aefea45368ccdb93ba [file] [log] [blame]
[email protected]716c0162013-12-13 20:36:531// Copyright 2013 The Chromium Authors. All rights reserved.
[email protected]fb5bcc02012-02-17 14:05:422// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
[email protected]716c0162013-12-13 20:36:535#include "components/url_matcher/url_matcher.h"
[email protected]fb5bcc02012-02-17 14:05:426
7#include <algorithm>
[email protected]9947d0e2012-02-23 22:36:538#include <iterator>
[email protected]fb5bcc02012-02-17 14:05:429
10#include "base/logging.h"
[email protected]79fe2272013-07-13 20:01:4011#include "url/gurl.h"
12#include "url/url_canon.h"
[email protected]fb5bcc02012-02-17 14:05:4213
[email protected]716c0162013-12-13 20:36:5314namespace url_matcher {
[email protected]fb5bcc02012-02-17 14:05:4215
16// This set of classes implement a mapping of URL Component Patterns, such as
[email protected]5bcf3b72012-09-14 00:20:2817// host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns
18// for use in substring comparisons.
[email protected]fb5bcc02012-02-17 14:05:4219//
20// The idea of this mapping is to reduce the problem of comparing many
21// URL Component Patterns against one URL to the problem of searching many
22// substrings in one string:
23//
//  ----------------------                    -----------------
//  | URL Query operator | ----translate----> | StringPattern |
//  ----------------------                    -----------------
//                                                   ^
//                                                   |
//                                                compare
//                                                   |
//                                                   v
//  ----------------------                    -----------------
//  | URL to compare     |                    |               |
//  | to all URL Query   | ----translate----> | String        |
//  | operators          |                    |               |
//  ----------------------                    -----------------
[email protected]fb5bcc02012-02-17 14:05:4237//
38// The reason for this problem reduction is that there are efficient algorithms
39// for searching many substrings in one string (see Aho-Corasick algorithm).
40//
[email protected]5bcf3b72012-09-14 00:20:2841// Additionally, some of the same pieces are reused to implement regular
42// expression comparisons. The FilteredRE2 implementation for matching many
43// regular expressions against one string uses prefiltering, in which a set
44// of substrings (derived from the regexes) are first searched for, to reduce
45// the number of regular expressions to test; the prefiltering step also
46// uses Aho-Corasick.
47//
[email protected]fb5bcc02012-02-17 14:05:4248// Case 1: {host,path,query}_{prefix,suffix,equals} searches.
49// ==========================================================
50//
51// For searches in this class, we normalize URLs as follows:
52//
// Step 1:
// Remove scheme, port and fragment from URL:
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
//    www.example.com/index.html?search=foo
//
// We remove the scheme and port number because they can be checked later
// in a secondary filter step. We remove the fragment (the #... part) because
// this is not guaranteed to be ASCII-7 encoded.
61//
// Step 2:
// Translate URL to String and add the following position markers:
// - BU = Beginning of URL
// - ED = End of Domain
// - EP = End of Path
// - QD = Query component Delimiter
// - EU = End of URL
// Furthermore, the hostname is canonicalized to start and end with a ".",
// and the query (without its leading "?") is wrapped in QD markers, with
// every "&" separator replaced by QD.
//
// Position markers are represented as characters >127, which are therefore
// guaranteed not to be part of the ASCII-7 encoded URL character set.
//
// -> www.example.com/index.html?search=foo becomes
//    BU .www.example.com. ED /index.html EP QD search=foo QD EU
//
// -> www.example.com/index.html becomes
//    BU .www.example.com. ED /index.html EP EU
78//
79// Step 3:
80// Translate URL Component Patterns as follows:
81//
// host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
// -> host_prefix("www.example") = BU .www.example
//
// host_suffix(suffix) = add_missing_dot_suffix(suffix) ED
// -> host_suffix("example.com") = example.com. ED
// -> host_suffix(".example.com") = .example.com. ED
//
// host_equals(domain) =
//     BU add_missing_dot_prefix(add_missing_dot_suffix(domain)) ED
// -> host_equals("www.example.com") = BU .www.example.com. ED
91//
92// Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
93//
[email protected]5bcf3b72012-09-14 00:20:2894// With this, we can search the StringPatterns in the normalized URL.
[email protected]fb5bcc02012-02-17 14:05:4295//
96//
97// Case 2: url_{prefix,suffix,equals,contains} searches.
98// =====================================================
99//
[email protected]c640fd7c2012-08-17 08:19:25100// Step 1: as above, except that
101// - the scheme is not removed
102// - the port is not removed if it is specified and does not match the default
103// port for the given scheme.
[email protected]fb5bcc02012-02-17 14:05:42104//
105// Step 2:
106// Translate URL to String and add the following position markers:
107// - BU = Beginning of URL
108// - EU = End of URL
[email protected]fb5bcc02012-02-17 14:05:42109//
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
//    BU http://www.example.com:8080/index.html?search=foo EU
// -> http://www.example.com:80/index.html?search=foo#first_match becomes
//    BU http://www.example.com/index.html?search=foo EU
//
// url_prefix(prefix) = BU prefix
// -> url_prefix("http://www.example") = BU http://www.example
[email protected]fb5bcc02012-02-17 14:05:42117//
118// url_contains(substring) = substring
119// -> url_contains("index") = index
120//
121//
122// Case 3: {host,path,query}_contains searches.
123// ============================================
124//
125// These kinds of searches are not supported directly but can be derived
126// by a combination of a url_contains() query followed by an explicit test:
127//
128// host_contains(str) = url_contains(str) followed by test whether str occurs
[email protected]c640fd7c2012-08-17 08:19:25129// in host component of original URL.
[email protected]fb5bcc02012-02-17 14:05:42130// -> host_contains("example.co") = example.co
131// followed by gurl.host().find("example.co");
132//
133// [similarly for path_contains and query_contains].
[email protected]5bcf3b72012-09-14 00:20:28134//
135//
136// Regular expression matching (url_matches searches)
137// ==================================================
138//
139// This class also supports matching regular expressions (RE2 syntax)
140// against full URLs, which are transformed as in case 2.
[email protected]fb5bcc02012-02-17 14:05:42141
[email protected]5bcf3b72012-09-14 00:20:28142namespace {
143
144bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) {
145 return criterion == URLMatcherCondition::URL_MATCHES;
146}
147
[email protected]2280dc82013-04-11 20:04:01148bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) {
149 return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES;
150}
151
[email protected]5bcf3b72012-09-14 00:20:28152} // namespace
[email protected]fb5bcc02012-02-17 14:05:42153
154//
155// URLMatcherCondition
156//
157
158URLMatcherCondition::URLMatcherCondition()
159 : criterion_(HOST_PREFIX),
[email protected]5bcf3b72012-09-14 00:20:28160 string_pattern_(NULL) {}
[email protected]fb5bcc02012-02-17 14:05:42161
162URLMatcherCondition::~URLMatcherCondition() {}
163
164URLMatcherCondition::URLMatcherCondition(
165 Criterion criterion,
[email protected]5bcf3b72012-09-14 00:20:28166 const StringPattern* string_pattern)
[email protected]fb5bcc02012-02-17 14:05:42167 : criterion_(criterion),
[email protected]5bcf3b72012-09-14 00:20:28168 string_pattern_(string_pattern) {}
[email protected]fb5bcc02012-02-17 14:05:42169
170URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
171 : criterion_(rhs.criterion_),
[email protected]5bcf3b72012-09-14 00:20:28172 string_pattern_(rhs.string_pattern_) {}
[email protected]fb5bcc02012-02-17 14:05:42173
174URLMatcherCondition& URLMatcherCondition::operator=(
175 const URLMatcherCondition& rhs) {
176 criterion_ = rhs.criterion_;
[email protected]5bcf3b72012-09-14 00:20:28177 string_pattern_ = rhs.string_pattern_;
[email protected]fb5bcc02012-02-17 14:05:42178 return *this;
179}
180
181bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
182 if (criterion_ < rhs.criterion_) return true;
183 if (criterion_ > rhs.criterion_) return false;
[email protected]5bcf3b72012-09-14 00:20:28184 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
185 return *string_pattern_ < *rhs.string_pattern_;
186 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true;
187 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
[email protected]fb5bcc02012-02-17 14:05:42188 // or both are NULL.
189 return false;
190}
191
192bool URLMatcherCondition::IsFullURLCondition() const {
193 // For these criteria the SubstringMatcher needs to be executed on the
[email protected]c640fd7c2012-08-17 08:19:25194 // GURL that is canonicalized with
[email protected]fb5bcc02012-02-17 14:05:42195 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
196 switch (criterion_) {
197 case HOST_CONTAINS:
198 case PATH_CONTAINS:
199 case QUERY_CONTAINS:
200 case URL_PREFIX:
201 case URL_SUFFIX:
202 case URL_CONTAINS:
203 case URL_EQUALS:
204 return true;
205 default:
206 break;
207 }
208 return false;
209}
210
[email protected]5bcf3b72012-09-14 00:20:28211bool URLMatcherCondition::IsRegexCondition() const {
212 return IsRegexCriterion(criterion_);
213}
214
[email protected]2280dc82013-04-11 20:04:01215bool URLMatcherCondition::IsOriginAndPathRegexCondition() const {
216 return IsOriginAndPathRegexCriterion(criterion_);
217}
218
[email protected]fb5bcc02012-02-17 14:05:42219bool URLMatcherCondition::IsMatch(
[email protected]5bcf3b72012-09-14 00:20:28220 const std::set<StringPattern::ID>& matching_patterns,
[email protected]fb5bcc02012-02-17 14:05:42221 const GURL& url) const {
[email protected]5bcf3b72012-09-14 00:20:28222 DCHECK(string_pattern_);
223 if (!ContainsKey(matching_patterns, string_pattern_->id()))
[email protected]fb5bcc02012-02-17 14:05:42224 return false;
225 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
226 // a substring match on the raw URL. In case of a match, we need to verify
227 // that the match was found in the correct component of the URL.
228 switch (criterion_) {
229 case HOST_CONTAINS:
[email protected]5bcf3b72012-09-14 00:20:28230 return url.host().find(string_pattern_->pattern()) !=
[email protected]fb5bcc02012-02-17 14:05:42231 std::string::npos;
232 case PATH_CONTAINS:
[email protected]5bcf3b72012-09-14 00:20:28233 return url.path().find(string_pattern_->pattern()) !=
[email protected]fb5bcc02012-02-17 14:05:42234 std::string::npos;
235 case QUERY_CONTAINS:
[email protected]5bcf3b72012-09-14 00:20:28236 return url.query().find(string_pattern_->pattern()) !=
[email protected]fb5bcc02012-02-17 14:05:42237 std::string::npos;
238 default:
239 break;
240 }
241 return true;
242}
243
244//
245// URLMatcherConditionFactory
246//
247
namespace {

// Position markers. Each is a one-character, NUL-terminated string whose
// character lies outside 7-bit ASCII, so it is not contained in the 7-bit
// ASCII used in GURLs.
const char kBeginningOfURL[] = {static_cast<char>(-1), '\0'};
const char kEndOfDomain[] = {static_cast<char>(-2), '\0'};
const char kEndOfPath[] = {static_cast<char>(-3), '\0'};
const char kQueryComponentDelimiter[] = {static_cast<char>(-4), '\0'};
const char kEndOfURL[] = {static_cast<char>(-5), '\0'};

// The delimiter for query parameters.
const char kQuerySeparator = '&';

}  // namespace
259
260URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
261
262URLMatcherConditionFactory::~URLMatcherConditionFactory() {
[email protected]5bcf3b72012-09-14 00:20:28263 STLDeleteElements(&substring_pattern_singletons_);
264 STLDeleteElements(&regex_pattern_singletons_);
[email protected]2280dc82013-04-11 20:04:01265 STLDeleteElements(&origin_and_path_regex_pattern_singletons_);
[email protected]fb5bcc02012-02-17 14:05:42266}
267
268std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
[email protected]801ae922013-01-22 20:45:26269 const GURL& url) const {
[email protected]fb5bcc02012-02-17 14:05:42270 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
[email protected]007b3f82013-04-09 08:46:45271 url.path() + kEndOfPath +
[email protected]c967c002014-04-11 13:45:02272 (url.has_query() ? CanonicalizeQuery(url.query(), true, true)
273 : std::string()) +
274 kEndOfURL;
[email protected]fb5bcc02012-02-17 14:05:42275}
276
277URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
278 const std::string& prefix) {
279 return CreateCondition(URLMatcherCondition::HOST_PREFIX,
mnisslerd9cdcd872015-05-29 13:57:41280 kBeginningOfURL + CanonicalizeHostPrefix(prefix));
[email protected]fb5bcc02012-02-17 14:05:42281}
282
283URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
284 const std::string& suffix) {
285 return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
mnisslerd9cdcd872015-05-29 13:57:41286 CanonicalizeHostSuffix(suffix) + kEndOfDomain);
[email protected]fb5bcc02012-02-17 14:05:42287}
288
289URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
290 const std::string& str) {
291 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
292}
293
294URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
295 const std::string& str) {
296 return CreateCondition(URLMatcherCondition::HOST_EQUALS,
297 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
298}
299
300URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
301 const std::string& prefix) {
302 return CreateCondition(URLMatcherCondition::PATH_PREFIX,
303 kEndOfDomain + prefix);
304}
305
306URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
307 const std::string& suffix) {
[email protected]6d8e5e42012-07-17 15:54:45308 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath);
[email protected]fb5bcc02012-02-17 14:05:42309}
310
311URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
312 const std::string& str) {
313 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
314}
315
316URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
317 const std::string& str) {
318 return CreateCondition(URLMatcherCondition::PATH_EQUALS,
319 kEndOfDomain + str + kEndOfPath);
320}
321
322URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
323 const std::string& prefix) {
[email protected]ea6249b2012-12-06 18:45:20324 std::string pattern;
325 if (!prefix.empty() && prefix[0] == '?')
[email protected]c967c002014-04-11 13:45:02326 pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false);
[email protected]ea6249b2012-12-06 18:45:20327 else
[email protected]c967c002014-04-11 13:45:02328 pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false);
[email protected]ea6249b2012-12-06 18:45:20329
330 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern);
[email protected]fb5bcc02012-02-17 14:05:42331}
332
333URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
334 const std::string& suffix) {
[email protected]ea6249b2012-12-06 18:45:20335 if (!suffix.empty() && suffix[0] == '?') {
336 return CreateQueryEqualsCondition(suffix);
337 } else {
338 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX,
[email protected]c967c002014-04-11 13:45:02339 CanonicalizeQuery(suffix, false, true) + kEndOfURL);
[email protected]ea6249b2012-12-06 18:45:20340 }
[email protected]fb5bcc02012-02-17 14:05:42341}
342
343URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
344 const std::string& str) {
[email protected]ea6249b2012-12-06 18:45:20345 if (!str.empty() && str[0] == '?')
346 return CreateQueryPrefixCondition(str);
347 else
348 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
[email protected]fb5bcc02012-02-17 14:05:42349}
350
351URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
352 const std::string& str) {
[email protected]ea6249b2012-12-06 18:45:20353 std::string pattern;
354 if (!str.empty() && str[0] == '?')
[email protected]c967c002014-04-11 13:45:02355 pattern =
356 kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL;
[email protected]ea6249b2012-12-06 18:45:20357 else
[email protected]c967c002014-04-11 13:45:02358 pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL;
[email protected]ea6249b2012-12-06 18:45:20359
360 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern);
[email protected]fb5bcc02012-02-17 14:05:42361}
362
363URLMatcherCondition
364 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
365 const std::string& host_suffix,
366 const std::string& path_prefix) {
367 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
mnisslerd9cdcd872015-05-29 13:57:41368 CanonicalizeHostSuffix(host_suffix) + kEndOfDomain + path_prefix);
[email protected]fb5bcc02012-02-17 14:05:42369}
370
[email protected]6d8e5e42012-07-17 15:54:45371URLMatcherCondition
372URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition(
373 const std::string& host,
374 const std::string& path_prefix) {
375 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX,
376 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain +
377 path_prefix);
378}
379
[email protected]fb5bcc02012-02-17 14:05:42380std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
[email protected]801ae922013-01-22 20:45:26381 const GURL& url) const {
[email protected]c640fd7c2012-08-17 08:19:25382 GURL::Replacements replacements;
383 replacements.ClearPassword();
384 replacements.ClearUsername();
385 replacements.ClearRef();
386 // Clear port if it is implicit from scheme.
387 if (url.has_port()) {
388 const std::string& port = url.scheme();
[email protected]04307e02014-05-01 18:01:49389 if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
390 url.EffectiveIntPort()) {
[email protected]c640fd7c2012-08-17 08:19:25391 replacements.ClearPort();
392 }
393 }
394 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() +
395 kEndOfURL;
[email protected]fb5bcc02012-02-17 14:05:42396}
397
[email protected]2280dc82013-04-11 20:04:01398static std::string CanonicalizeURLForRegexSearchesHelper(
399 const GURL& url,
400 bool clear_query) {
[email protected]5bcf3b72012-09-14 00:20:28401 GURL::Replacements replacements;
402 replacements.ClearPassword();
403 replacements.ClearUsername();
404 replacements.ClearRef();
[email protected]2280dc82013-04-11 20:04:01405 if (clear_query)
406 replacements.ClearQuery();
[email protected]5bcf3b72012-09-14 00:20:28407 // Clear port if it is implicit from scheme.
408 if (url.has_port()) {
409 const std::string& port = url.scheme();
[email protected]04307e02014-05-01 18:01:49410 if (url::DefaultPortForScheme(port.c_str(), port.size()) ==
411 url.EffectiveIntPort()) {
[email protected]5bcf3b72012-09-14 00:20:28412 replacements.ClearPort();
413 }
414 }
415 return url.ReplaceComponents(replacements).spec();
416}
417
[email protected]2280dc82013-04-11 20:04:01418std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches(
419 const GURL& url) const {
420 return CanonicalizeURLForRegexSearchesHelper(url, false);
421}
422
423std::string
424URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches(
425 const GURL& url) const {
426 return CanonicalizeURLForRegexSearchesHelper(url, true);
427}
428
[email protected]fb5bcc02012-02-17 14:05:42429URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
430 const std::string& prefix) {
431 return CreateCondition(URLMatcherCondition::URL_PREFIX,
[email protected]c640fd7c2012-08-17 08:19:25432 kBeginningOfURL + prefix);
[email protected]fb5bcc02012-02-17 14:05:42433}
434
435URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
436 const std::string& suffix) {
437 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
438}
439
440URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
441 const std::string& str) {
442 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
443}
444
445URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
446 const std::string& str) {
[email protected]6d8e5e42012-07-17 15:54:45447 return CreateCondition(URLMatcherCondition::URL_EQUALS,
[email protected]c640fd7c2012-08-17 08:19:25448 kBeginningOfURL + str + kEndOfURL);
[email protected]fb5bcc02012-02-17 14:05:42449}
450
[email protected]5bcf3b72012-09-14 00:20:28451URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition(
452 const std::string& regex) {
453 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex);
454}
455
[email protected]2280dc82013-04-11 20:04:01456URLMatcherCondition
457URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition(
458 const std::string& regex) {
459 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex);
460}
461
[email protected]fb5bcc02012-02-17 14:05:42462void URLMatcherConditionFactory::ForgetUnusedPatterns(
[email protected]5bcf3b72012-09-14 00:20:28463 const std::set<StringPattern::ID>& used_patterns) {
464 PatternSingletons::iterator i = substring_pattern_singletons_.begin();
465 while (i != substring_pattern_singletons_.end()) {
[email protected]2280dc82013-04-11 20:04:01466 if (ContainsKey(used_patterns, (*i)->id())) {
[email protected]fb5bcc02012-02-17 14:05:42467 ++i;
[email protected]2fb51d92012-02-17 15:05:47468 } else {
469 delete *i;
[email protected]5bcf3b72012-09-14 00:20:28470 substring_pattern_singletons_.erase(i++);
471 }
472 }
473 i = regex_pattern_singletons_.begin();
474 while (i != regex_pattern_singletons_.end()) {
[email protected]2280dc82013-04-11 20:04:01475 if (ContainsKey(used_patterns, (*i)->id())) {
[email protected]5bcf3b72012-09-14 00:20:28476 ++i;
477 } else {
478 delete *i;
479 regex_pattern_singletons_.erase(i++);
[email protected]2fb51d92012-02-17 15:05:47480 }
[email protected]fb5bcc02012-02-17 14:05:42481 }
[email protected]2280dc82013-04-11 20:04:01482 i = origin_and_path_regex_pattern_singletons_.begin();
483 while (i != origin_and_path_regex_pattern_singletons_.end()) {
484 if (ContainsKey(used_patterns, (*i)->id())) {
485 ++i;
486 } else {
487 delete *i;
488 origin_and_path_regex_pattern_singletons_.erase(i++);
489 }
490 }
[email protected]fb5bcc02012-02-17 14:05:42491}
492
[email protected]357c4db2012-03-29 07:51:57493bool URLMatcherConditionFactory::IsEmpty() const {
[email protected]5bcf3b72012-09-14 00:20:28494 return substring_pattern_singletons_.empty() &&
[email protected]2280dc82013-04-11 20:04:01495 regex_pattern_singletons_.empty() &&
496 origin_and_path_regex_pattern_singletons_.empty();
[email protected]357c4db2012-03-29 07:51:57497}
498
[email protected]fb5bcc02012-02-17 14:05:42499URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
500 URLMatcherCondition::Criterion criterion,
501 const std::string& pattern) {
[email protected]5bcf3b72012-09-14 00:20:28502 StringPattern search_pattern(pattern, 0);
[email protected]2280dc82013-04-11 20:04:01503 PatternSingletons* pattern_singletons = NULL;
504 if (IsRegexCriterion(criterion))
505 pattern_singletons = &regex_pattern_singletons_;
506 else if (IsOriginAndPathRegexCriterion(criterion))
507 pattern_singletons = &origin_and_path_regex_pattern_singletons_;
508 else
509 pattern_singletons = &substring_pattern_singletons_;
[email protected]5bcf3b72012-09-14 00:20:28510
[email protected]fb5bcc02012-02-17 14:05:42511 PatternSingletons::const_iterator iter =
[email protected]5bcf3b72012-09-14 00:20:28512 pattern_singletons->find(&search_pattern);
513
514 if (iter != pattern_singletons->end()) {
[email protected]fb5bcc02012-02-17 14:05:42515 return URLMatcherCondition(criterion, *iter);
516 } else {
[email protected]5bcf3b72012-09-14 00:20:28517 StringPattern* new_pattern =
518 new StringPattern(pattern, id_counter_++);
519 pattern_singletons->insert(new_pattern);
[email protected]fb5bcc02012-02-17 14:05:42520 return URLMatcherCondition(criterion, new_pattern);
521 }
522}
523
mnisslerd9cdcd872015-05-29 13:57:41524std::string URLMatcherConditionFactory::CanonicalizeHostSuffix(
525 const std::string& suffix) const {
526 if (!suffix.empty() && suffix[suffix.size() - 1] == '.')
527 return suffix;
528 else
529 return suffix + ".";
530}
531
532std::string URLMatcherConditionFactory::CanonicalizeHostPrefix(
533 const std::string& prefix) const {
534 if (!prefix.empty() && prefix[0] == '.')
535 return prefix;
536 else
537 return "." + prefix;
538}
539
[email protected]fb5bcc02012-02-17 14:05:42540std::string URLMatcherConditionFactory::CanonicalizeHostname(
541 const std::string& hostname) const {
mnisslerd9cdcd872015-05-29 13:57:41542 return CanonicalizeHostPrefix(CanonicalizeHostSuffix(hostname));
[email protected]fb5bcc02012-02-17 14:05:42543}
544
[email protected]c967c002014-04-11 13:45:02545// This function prepares the query string by replacing query separator with a
546// magic value (|kQueryComponentDelimiter|). When the boolean
547// |prepend_beginning_of_query_component| is true the function prepends the
548// query with the same magic. This is done to locate the start of a key value
549// pair in the query string. The parameter |query| is passed by value
550// intentionally, since it is locally modified.
551std::string URLMatcherConditionFactory::CanonicalizeQuery(
552 std::string query,
553 bool prepend_beginning_of_query_component,
554 bool append_end_of_query_component) const {
555 for (std::string::iterator it = query.begin(); it != query.end(); ++it) {
556 if (*it == kQuerySeparator)
557 *it = kQueryComponentDelimiter[0];
558 }
559 if (prepend_beginning_of_query_component)
560 query = kQueryComponentDelimiter + query;
561 if (append_end_of_query_component)
562 query += kQueryComponentDelimiter;
563 return query;
564}
565
[email protected]5bcf3b72012-09-14 00:20:28566bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()(
567 StringPattern* lhs,
568 StringPattern* rhs) const {
[email protected]fb5bcc02012-02-17 14:05:42569 if (lhs == NULL && rhs != NULL) return true;
570 if (lhs != NULL && rhs != NULL)
571 return lhs->pattern() < rhs->pattern();
572 // Either both are NULL or only rhs is NULL.
573 return false;
574}
575
576//
[email protected]c967c002014-04-11 13:45:02577// URLQueryElementMatcherCondition
578//
579
580URLQueryElementMatcherCondition::URLQueryElementMatcherCondition(
581 const std::string& key,
582 const std::string& value,
583 QueryValueMatchType query_value_match_type,
584 QueryElementType query_element_type,
585 Type match_type,
586 URLMatcherConditionFactory* factory) {
587 match_type_ = match_type;
588
589 if (query_element_type == ELEMENT_TYPE_KEY_VALUE) {
590 key_ = kQueryComponentDelimiter + key + "=";
591 value_ = value;
592 } else {
593 key_ = kQueryComponentDelimiter + key;
594 value_ = std::string();
595 }
596
597 if (query_value_match_type == QUERY_VALUE_MATCH_EXACT)
598 value_ += kQueryComponentDelimiter;
599
600 // If |value_| is empty no need to find the |key_| and verify if the value
601 // matches. Simply checking the presence of key is sufficient, which is done
602 // by MATCH_ANY
603 if (value_.empty())
604 match_type_ = MATCH_ANY;
605
606 URLMatcherCondition condition;
607 // If |match_type_| is MATCH_ANY, then we could simply look for the
608 // combination of |key_| + |value_|, which can be efficiently done by
609 // SubstringMatcher
610 if (match_type_ == MATCH_ANY)
611 condition = factory->CreateQueryContainsCondition(key_ + value_);
612 else
613 condition = factory->CreateQueryContainsCondition(key_);
614 string_pattern_ = condition.string_pattern();
615
616 key_length_ = key_.length();
617 value_length_ = value_.length();
618}
619
620URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {}
621
622bool URLQueryElementMatcherCondition::operator<(
623 const URLQueryElementMatcherCondition& rhs) const {
624 if (match_type_ != rhs.match_type_)
625 return match_type_ < rhs.match_type_;
626 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL)
627 return *string_pattern_ < *rhs.string_pattern_;
628 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL)
629 return true;
630 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL,
631 // or both are NULL.
632 return false;
633}
634
635bool URLQueryElementMatcherCondition::IsMatch(
636 const std::string& url_for_component_searches) const {
637 switch (match_type_) {
638 case MATCH_ANY: {
639 // For MATCH_ANY, no additional verification step is needed. We can trust
640 // the SubstringMatcher to do the verification.
641 return true;
642 }
643 case MATCH_ALL: {
644 size_t start = 0;
645 int found = 0;
646 size_t offset;
647 while ((offset = url_for_component_searches.find(key_, start)) !=
648 std::string::npos) {
649 if (url_for_component_searches.compare(
650 offset + key_length_, value_length_, value_) != 0) {
651 return false;
652 } else {
653 ++found;
654 }
655 start = offset + key_length_ + value_length_ - 1;
656 }
657 return !!found;
658 }
659 case MATCH_FIRST: {
660 size_t offset = url_for_component_searches.find(key_);
661 return url_for_component_searches.compare(
662 offset + key_length_, value_length_, value_) == 0;
663 }
664 case MATCH_LAST: {
665 size_t offset = url_for_component_searches.rfind(key_);
666 return url_for_component_searches.compare(
667 offset + key_length_, value_length_, value_) == 0;
668 }
669 }
670 NOTREACHED();
671 return false;
672}
673
674//
[email protected]faceb0f2012-04-12 17:07:19675// URLMatcherSchemeFilter
676//
677
678URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter)
679 : filters_(1) {
680 filters_.push_back(filter);
681}
682
683URLMatcherSchemeFilter::URLMatcherSchemeFilter(
684 const std::vector<std::string>& filters)
685 : filters_(filters) {}
686
687URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {}
688
689bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const {
690 return std::find(filters_.begin(), filters_.end(), url.scheme()) !=
691 filters_.end();
692}
693
694//
[email protected]00520a52012-04-12 18:30:47695// URLMatcherPortFilter
696//
697
698URLMatcherPortFilter::URLMatcherPortFilter(
699 const std::vector<URLMatcherPortFilter::Range>& ranges)
700 : ranges_(ranges) {}
701
702URLMatcherPortFilter::~URLMatcherPortFilter() {}
703
704bool URLMatcherPortFilter::IsMatch(const GURL& url) const {
705 int port = url.EffectiveIntPort();
706 for (std::vector<Range>::const_iterator i = ranges_.begin();
707 i != ranges_.end(); ++i) {
708 if (i->first <= port && port <= i->second)
709 return true;
710 }
711 return false;
712}
713
714// static
715URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from,
716 int to) {
717 return Range(from, to);
718}
719
720// static
721URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) {
722 return Range(port, port);
723}
724
725//
[email protected]fb5bcc02012-02-17 14:05:42726// URLMatcherConditionSet
727//
728
[email protected]fb5bcc02012-02-17 14:05:42729URLMatcherConditionSet::~URLMatcherConditionSet() {}
730
731URLMatcherConditionSet::URLMatcherConditionSet(
732 ID id,
733 const Conditions& conditions)
734 : id_(id),
735 conditions_(conditions) {}
736
[email protected]faceb0f2012-04-12 17:07:19737URLMatcherConditionSet::URLMatcherConditionSet(
738 ID id,
739 const Conditions& conditions,
[email protected]00520a52012-04-12 18:30:47740 scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
741 scoped_ptr<URLMatcherPortFilter> port_filter)
[email protected]faceb0f2012-04-12 17:07:19742 : id_(id),
743 conditions_(conditions),
[email protected]00520a52012-04-12 18:30:47744 scheme_filter_(scheme_filter.Pass()),
745 port_filter_(port_filter.Pass()) {}
[email protected]faceb0f2012-04-12 17:07:19746
[email protected]c967c002014-04-11 13:45:02747URLMatcherConditionSet::URLMatcherConditionSet(
748 ID id,
749 const Conditions& conditions,
750 const QueryConditions& query_conditions,
751 scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
752 scoped_ptr<URLMatcherPortFilter> port_filter)
753 : id_(id),
754 conditions_(conditions),
755 query_conditions_(query_conditions),
756 scheme_filter_(scheme_filter.Pass()),
757 port_filter_(port_filter.Pass()) {}
758
[email protected]fb5bcc02012-02-17 14:05:42759bool URLMatcherConditionSet::IsMatch(
[email protected]5bcf3b72012-09-14 00:20:28760 const std::set<StringPattern::ID>& matching_patterns,
[email protected]fb5bcc02012-02-17 14:05:42761 const GURL& url) const {
[email protected]c967c002014-04-11 13:45:02762 return IsMatch(matching_patterns, url, std::string());
763}
764
765bool URLMatcherConditionSet::IsMatch(
766 const std::set<StringPattern::ID>& matching_patterns,
767 const GURL& url,
768 const std::string& url_for_component_searches) const {
[email protected]fb5bcc02012-02-17 14:05:42769 for (Conditions::const_iterator i = conditions_.begin();
[email protected]3b001a02012-04-05 10:38:06770 i != conditions_.end(); ++i) {
[email protected]5bcf3b72012-09-14 00:20:28771 if (!i->IsMatch(matching_patterns, url))
[email protected]fb5bcc02012-02-17 14:05:42772 return false;
773 }
[email protected]faceb0f2012-04-12 17:07:19774 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url))
775 return false;
[email protected]00520a52012-04-12 18:30:47776 if (port_filter_.get() && !port_filter_->IsMatch(url))
777 return false;
[email protected]c967c002014-04-11 13:45:02778 if (query_conditions_.empty())
779 return true;
780 // The loop is duplicated below for performance reasons. If not all query
781 // elements are found, no need to verify match that is expected to take more
782 // cycles.
783 for (QueryConditions::const_iterator i = query_conditions_.begin();
784 i != query_conditions_.end();
785 ++i) {
786 if (!ContainsKey(matching_patterns, i->string_pattern()->id()))
787 return false;
788 }
789 for (QueryConditions::const_iterator i = query_conditions_.begin();
790 i != query_conditions_.end();
791 ++i) {
792 if (!i->IsMatch(url_for_component_searches))
793 return false;
794 }
[email protected]fb5bcc02012-02-17 14:05:42795 return true;
796}
797
[email protected]fb5bcc02012-02-17 14:05:42798//
799// URLMatcher
800//
801
802URLMatcher::URLMatcher() {}
803
804URLMatcher::~URLMatcher() {}
805
806void URLMatcher::AddConditionSets(
[email protected]3b001a02012-04-05 10:38:06807 const URLMatcherConditionSet::Vector& condition_sets) {
808 for (URLMatcherConditionSet::Vector::const_iterator i =
809 condition_sets.begin(); i != condition_sets.end(); ++i) {
810 DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
811 url_matcher_condition_sets_.end());
812 url_matcher_condition_sets_[(*i)->id()] = *i;
[email protected]fb5bcc02012-02-17 14:05:42813 }
814 UpdateInternalDatastructures();
815}
816
817void URLMatcher::RemoveConditionSets(
818 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
819 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
[email protected]3b001a02012-04-05 10:38:06820 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
[email protected]fb5bcc02012-02-17 14:05:42821 DCHECK(url_matcher_condition_sets_.find(*i) !=
822 url_matcher_condition_sets_.end());
823 url_matcher_condition_sets_.erase(*i);
824 }
825 UpdateInternalDatastructures();
826}
827
[email protected]d552b432012-03-29 07:42:32828void URLMatcher::ClearUnusedConditionSets() {
829 UpdateConditionFactory();
830}
831
[email protected]801ae922013-01-22 20:45:26832std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(
833 const GURL& url) const {
[email protected]5bcf3b72012-09-14 00:20:28834 // Find all IDs of StringPatterns that match |url|.
[email protected]fb5bcc02012-02-17 14:05:42835 // See URLMatcherConditionFactory for the canonicalization of URLs and the
836 // distinction between full url searches and url component searches.
[email protected]5bcf3b72012-09-14 00:20:28837 std::set<StringPattern::ID> matches;
[email protected]c967c002014-04-11 13:45:02838 std::string url_for_component_searches;
839
[email protected]2280dc82013-04-11 20:04:01840 if (!full_url_matcher_.IsEmpty()) {
841 full_url_matcher_.Match(
842 condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
843 }
844 if (!url_component_matcher_.IsEmpty()) {
[email protected]c967c002014-04-11 13:45:02845 url_for_component_searches =
846 condition_factory_.CanonicalizeURLForComponentSearches(url);
847 url_component_matcher_.Match(url_for_component_searches, &matches);
[email protected]2280dc82013-04-11 20:04:01848 }
849 if (!regex_set_matcher_.IsEmpty()) {
850 regex_set_matcher_.Match(
851 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches);
852 }
853 if (!origin_and_path_regex_set_matcher_.IsEmpty()) {
854 origin_and_path_regex_set_matcher_.Match(
855 condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url),
856 &matches);
857 }
[email protected]fb5bcc02012-02-17 14:05:42858
859 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
860 // were fulfilled.
861 std::set<URLMatcherConditionSet::ID> result;
[email protected]5bcf3b72012-09-14 00:20:28862 for (std::set<StringPattern::ID>::const_iterator i = matches.begin();
[email protected]3b001a02012-04-05 10:38:06863 i != matches.end(); ++i) {
[email protected]fb5bcc02012-02-17 14:05:42864 // For each URLMatcherConditionSet there is exactly one condition
865 // registered in substring_match_triggers_. This means that the following
866 // logic tests each URLMatcherConditionSet exactly once if it can be
867 // completely fulfilled.
[email protected]801ae922013-01-22 20:45:26868 StringPatternTriggers::const_iterator triggered_condition_sets_iter =
869 substring_match_triggers_.find(*i);
870 if (triggered_condition_sets_iter == substring_match_triggers_.end())
871 continue; // Not all substring matches are triggers for a condition set.
872 const std::set<URLMatcherConditionSet::ID>& condition_sets =
873 triggered_condition_sets_iter->second;
[email protected]fb5bcc02012-02-17 14:05:42874 for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
[email protected]3b001a02012-04-05 10:38:06875 condition_sets.begin(); j != condition_sets.end(); ++j) {
[email protected]801ae922013-01-22 20:45:26876 URLMatcherConditionSets::const_iterator condition_set_iter =
877 url_matcher_condition_sets_.find(*j);
878 DCHECK(condition_set_iter != url_matcher_condition_sets_.end());
[email protected]c967c002014-04-11 13:45:02879 if (condition_set_iter->second->IsMatch(
880 matches, url, url_for_component_searches))
[email protected]fb5bcc02012-02-17 14:05:42881 result.insert(*j);
882 }
883 }
884
885 return result;
886}
887
[email protected]357c4db2012-03-29 07:51:57888bool URLMatcher::IsEmpty() const {
889 return condition_factory_.IsEmpty() &&
890 url_matcher_condition_sets_.empty() &&
891 substring_match_triggers_.empty() &&
892 full_url_matcher_.IsEmpty() &&
893 url_component_matcher_.IsEmpty() &&
[email protected]2280dc82013-04-11 20:04:01894 regex_set_matcher_.IsEmpty() &&
895 origin_and_path_regex_set_matcher_.IsEmpty() &&
[email protected]357c4db2012-03-29 07:51:57896 registered_full_url_patterns_.empty() &&
897 registered_url_component_patterns_.empty();
898}
899
[email protected]fb5bcc02012-02-17 14:05:42900void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
901 // The purpose of |full_url_conditions| is just that we need to execute
902 // the same logic once for Full URL searches and once for URL Component
903 // searches (see URLMatcherConditionFactory).
904
905 // Determine which patterns need to be registered when this function
906 // terminates.
[email protected]5bcf3b72012-09-14 00:20:28907 std::set<const StringPattern*> new_patterns;
[email protected]fb5bcc02012-02-17 14:05:42908 for (URLMatcherConditionSets::const_iterator condition_set_iter =
909 url_matcher_condition_sets_.begin();
910 condition_set_iter != url_matcher_condition_sets_.end();
911 ++condition_set_iter) {
912 const URLMatcherConditionSet::Conditions& conditions =
[email protected]3b001a02012-04-05 10:38:06913 condition_set_iter->second->conditions();
[email protected]fb5bcc02012-02-17 14:05:42914 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
[email protected]3b001a02012-04-05 10:38:06915 conditions.begin(); condition_iter != conditions.end();
916 ++condition_iter) {
[email protected]5bcf3b72012-09-14 00:20:28917 // If we are called to process Full URL searches, ignore others, and
918 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.)
919 if (!condition_iter->IsRegexCondition() &&
[email protected]2280dc82013-04-11 20:04:01920 !condition_iter->IsOriginAndPathRegexCondition() &&
[email protected]5bcf3b72012-09-14 00:20:28921 full_url_conditions == condition_iter->IsFullURLCondition())
922 new_patterns.insert(condition_iter->string_pattern());
[email protected]fb5bcc02012-02-17 14:05:42923 }
[email protected]c967c002014-04-11 13:45:02924
925 if (full_url_conditions)
926 continue;
927
928 const URLMatcherConditionSet::QueryConditions& query_conditions =
929 condition_set_iter->second->query_conditions();
930 for (URLMatcherConditionSet::QueryConditions::const_iterator
931 query_condition_iter = query_conditions.begin();
932 query_condition_iter != query_conditions.end();
933 ++query_condition_iter) {
934 new_patterns.insert(query_condition_iter->string_pattern());
935 }
[email protected]fb5bcc02012-02-17 14:05:42936 }
937
938 // This is the set of patterns that were registered before this function
939 // is called.
[email protected]5bcf3b72012-09-14 00:20:28940 std::set<const StringPattern*>& registered_patterns =
[email protected]fb5bcc02012-02-17 14:05:42941 full_url_conditions ? registered_full_url_patterns_
942 : registered_url_component_patterns_;
943
944 // Add all patterns that are in new_patterns but not in registered_patterns.
[email protected]16bf7ba72013-08-23 11:52:54945 std::vector<const StringPattern*> patterns_to_register =
946 base::STLSetDifference<std::vector<const StringPattern*> >(
947 new_patterns, registered_patterns);
[email protected]fb5bcc02012-02-17 14:05:42948
949 // Remove all patterns that are in registered_patterns but not in
950 // new_patterns.
[email protected]16bf7ba72013-08-23 11:52:54951 std::vector<const StringPattern*> patterns_to_unregister =
952 base::STLSetDifference<std::vector<const StringPattern*> >(
953 registered_patterns, new_patterns);
[email protected]fb5bcc02012-02-17 14:05:42954
955 // Update the SubstringSetMatcher.
956 SubstringSetMatcher& url_matcher =
957 full_url_conditions ? full_url_matcher_ : url_component_matcher_;
958 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
959 patterns_to_unregister);
960
961 // Update the set of registered_patterns for the next time this function
962 // is being called.
963 registered_patterns.swap(new_patterns);
964}
965
[email protected]5bcf3b72012-09-14 00:20:28966void URLMatcher::UpdateRegexSetMatcher() {
967 std::vector<const StringPattern*> new_patterns;
[email protected]2280dc82013-04-11 20:04:01968 std::vector<const StringPattern*> new_origin_and_path_patterns;
[email protected]5bcf3b72012-09-14 00:20:28969
[email protected]fb5bcc02012-02-17 14:05:42970 for (URLMatcherConditionSets::const_iterator condition_set_iter =
971 url_matcher_condition_sets_.begin();
972 condition_set_iter != url_matcher_condition_sets_.end();
973 ++condition_set_iter) {
974 const URLMatcherConditionSet::Conditions& conditions =
[email protected]3b001a02012-04-05 10:38:06975 condition_set_iter->second->conditions();
[email protected]fb5bcc02012-02-17 14:05:42976 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
[email protected]3b001a02012-04-05 10:38:06977 conditions.begin(); condition_iter != conditions.end();
978 ++condition_iter) {
[email protected]2280dc82013-04-11 20:04:01979 if (condition_iter->IsRegexCondition()) {
[email protected]5bcf3b72012-09-14 00:20:28980 new_patterns.push_back(condition_iter->string_pattern());
[email protected]2280dc82013-04-11 20:04:01981 } else if (condition_iter->IsOriginAndPathRegexCondition()) {
982 new_origin_and_path_patterns.push_back(
983 condition_iter->string_pattern());
984 }
[email protected]5bcf3b72012-09-14 00:20:28985 }
986 }
987
988 // Start over from scratch. We can't really do better than this, since the
989 // FilteredRE2 backend doesn't support incremental updates.
990 regex_set_matcher_.ClearPatterns();
991 regex_set_matcher_.AddPatterns(new_patterns);
[email protected]2280dc82013-04-11 20:04:01992 origin_and_path_regex_set_matcher_.ClearPatterns();
993 origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns);
[email protected]5bcf3b72012-09-14 00:20:28994}
995
996void URLMatcher::UpdateTriggers() {
997 // Count substring pattern frequencies.
998 std::map<StringPattern::ID, size_t> substring_pattern_frequencies;
999 for (URLMatcherConditionSets::const_iterator condition_set_iter =
1000 url_matcher_condition_sets_.begin();
1001 condition_set_iter != url_matcher_condition_sets_.end();
1002 ++condition_set_iter) {
1003 const URLMatcherConditionSet::Conditions& conditions =
1004 condition_set_iter->second->conditions();
1005 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1006 conditions.begin(); condition_iter != conditions.end();
1007 ++condition_iter) {
1008 const StringPattern* pattern = condition_iter->string_pattern();
[email protected]fb5bcc02012-02-17 14:05:421009 substring_pattern_frequencies[pattern->id()]++;
1010 }
[email protected]c967c002014-04-11 13:45:021011
1012 const URLMatcherConditionSet::QueryConditions& query_conditions =
1013 condition_set_iter->second->query_conditions();
1014 for (URLMatcherConditionSet::QueryConditions::const_iterator
1015 query_condition_iter = query_conditions.begin();
1016 query_condition_iter != query_conditions.end();
1017 ++query_condition_iter) {
1018 const StringPattern* pattern = query_condition_iter->string_pattern();
1019 substring_pattern_frequencies[pattern->id()]++;
1020 }
[email protected]fb5bcc02012-02-17 14:05:421021 }
1022
1023 // Update trigger conditions: Determine for each URLMatcherConditionSet which
[email protected]5bcf3b72012-09-14 00:20:281024 // URLMatcherCondition contains a StringPattern that occurs least
[email protected]fb5bcc02012-02-17 14:05:421025 // frequently in this URLMatcher. We assume that this condition is very
1026 // specific and occurs rarely in URLs. If a match occurs for this
1027 // URLMatcherCondition, we want to test all other URLMatcherCondition in the
1028 // respective URLMatcherConditionSet as well to see whether the entire
1029 // URLMatcherConditionSet is considered matching.
1030 substring_match_triggers_.clear();
1031 for (URLMatcherConditionSets::const_iterator condition_set_iter =
1032 url_matcher_condition_sets_.begin();
1033 condition_set_iter != url_matcher_condition_sets_.end();
1034 ++condition_set_iter) {
1035 const URLMatcherConditionSet::Conditions& conditions =
[email protected]3b001a02012-04-05 10:38:061036 condition_set_iter->second->conditions();
[email protected]fb5bcc02012-02-17 14:05:421037 if (conditions.empty())
1038 continue;
1039 URLMatcherConditionSet::Conditions::const_iterator condition_iter =
1040 conditions.begin();
[email protected]5bcf3b72012-09-14 00:20:281041 StringPattern::ID trigger = condition_iter->string_pattern()->id();
[email protected]fb5bcc02012-02-17 14:05:421042 // We skip the first element in the following loop.
1043 ++condition_iter;
1044 for (; condition_iter != conditions.end(); ++condition_iter) {
[email protected]5bcf3b72012-09-14 00:20:281045 StringPattern::ID current_id =
1046 condition_iter->string_pattern()->id();
[email protected]fb5bcc02012-02-17 14:05:421047 if (substring_pattern_frequencies[trigger] >
1048 substring_pattern_frequencies[current_id]) {
1049 trigger = current_id;
1050 }
1051 }
[email protected]c967c002014-04-11 13:45:021052
1053 const URLMatcherConditionSet::QueryConditions& query_conditions =
1054 condition_set_iter->second->query_conditions();
1055 for (URLMatcherConditionSet::QueryConditions::const_iterator
1056 query_condition_iter = query_conditions.begin();
1057 query_condition_iter != query_conditions.end();
1058 ++query_condition_iter) {
1059 StringPattern::ID current_id =
1060 query_condition_iter->string_pattern()->id();
1061 if (substring_pattern_frequencies[trigger] >
1062 substring_pattern_frequencies[current_id]) {
1063 trigger = current_id;
1064 }
1065 }
1066
[email protected]3b001a02012-04-05 10:38:061067 substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
[email protected]fb5bcc02012-02-17 14:05:421068 }
1069}
1070
1071void URLMatcher::UpdateConditionFactory() {
[email protected]5bcf3b72012-09-14 00:20:281072 std::set<StringPattern::ID> used_patterns;
[email protected]fb5bcc02012-02-17 14:05:421073 for (URLMatcherConditionSets::const_iterator condition_set_iter =
1074 url_matcher_condition_sets_.begin();
1075 condition_set_iter != url_matcher_condition_sets_.end();
1076 ++condition_set_iter) {
1077 const URLMatcherConditionSet::Conditions& conditions =
[email protected]3b001a02012-04-05 10:38:061078 condition_set_iter->second->conditions();
[email protected]fb5bcc02012-02-17 14:05:421079 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
[email protected]3b001a02012-04-05 10:38:061080 conditions.begin(); condition_iter != conditions.end();
1081 ++condition_iter) {
[email protected]5bcf3b72012-09-14 00:20:281082 used_patterns.insert(condition_iter->string_pattern()->id());
[email protected]fb5bcc02012-02-17 14:05:421083 }
[email protected]c967c002014-04-11 13:45:021084 const URLMatcherConditionSet::QueryConditions& query_conditions =
1085 condition_set_iter->second->query_conditions();
1086 for (URLMatcherConditionSet::QueryConditions::const_iterator
1087 query_condition_iter = query_conditions.begin();
1088 query_condition_iter != query_conditions.end();
1089 ++query_condition_iter) {
1090 used_patterns.insert(query_condition_iter->string_pattern()->id());
1091 }
[email protected]fb5bcc02012-02-17 14:05:421092 }
1093 condition_factory_.ForgetUnusedPatterns(used_patterns);
1094}
1095
1096void URLMatcher::UpdateInternalDatastructures() {
1097 UpdateSubstringSetMatcher(false);
1098 UpdateSubstringSetMatcher(true);
[email protected]5bcf3b72012-09-14 00:20:281099 UpdateRegexSetMatcher();
[email protected]fb5bcc02012-02-17 14:05:421100 UpdateTriggers();
1101 UpdateConditionFactory();
1102}
1103
[email protected]716c0162013-12-13 20:36:531104} // namespace url_matcher