blob: d92a4a347859d0d0f0d9ab2ecfab242f2d6d178a [file] [log] [blame]
[email protected]fb5bcc02012-02-17 14:05:421// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/extensions/api/declarative/url_matcher.h"
6
7#include <algorithm>
8
9#include "base/logging.h"
10#include "googleurl/src/gurl.h"
11
12namespace extensions {
13
// This set of classes implements a mapping of URL Component Patterns, such as
// host_prefix, host_suffix, host_equals, ..., etc., to SubstringPatterns.
16//
17// The idea of this mapping is to reduce the problem of comparing many
18// URL Component Patterns against one URL to the problem of searching many
19// substrings in one string:
20//
// ----------------------                    --------------------
// | URL Query operator | ----translate----> | SubstringPattern |
// ----------------------                    --------------------
//                                                    ^
//                                                    |
//                                                 compare
//                                                    |
//                                                    v
// ----------------------                    --------------------
// | URL to compare     |                    |                  |
// | to all URL Query   | ----translate----> | String           |
// | operators          |                    |                  |
// ----------------------                    --------------------
34//
35// The reason for this problem reduction is that there are efficient algorithms
36// for searching many substrings in one string (see Aho-Corasick algorithm).
37//
38// Case 1: {host,path,query}_{prefix,suffix,equals} searches.
39// ==========================================================
40//
41// For searches in this class, we normalize URLs as follows:
42//
43// Step 1:
44// Remove scheme, port and segment from URL:
// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
//    www.example.com/index.html?search=foo
47//
48// We remove the scheme and port number because they can be checked later
49// in a secondary filter step. We remove the segment (the #... part) because
50// this is not guaranteed to be ASCII-7 encoded.
51//
52// Step 2:
53// Translate URL to String and add the following position markers:
54// - BU = Beginning of URL
55// - ED = End of Domain
56// - EP = End of Path
57// - EU = End of URL
58// Furthermore, the hostname is canonicalized to start with a ".".
59//
60// Position markers are represented as characters >127, which are therefore
61// guaranteed not to be part of the ASCII-7 encoded URL character set.
62//
63// -> www.example.com/index.html?search=foo becomes
64// BU .www.example.com ED /index.html EP ?search=foo EU
65//
66// -> www.example.com/index.html becomes
67// BU .www.example.com ED /index.html EP EU
68//
69// Step 3:
70// Translate URL Component Patterns as follows:
71//
72// host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
73// -> host_prefix("www.example") = BU .www.example
74//
75// host_suffix(suffix) = suffix ED
76// -> host_suffix("example.com") = example.com ED
77// -> host_suffix(".example.com") = .example.com ED
78//
79// host_equals(domain) = BU add_missing_dot_prefix(domain) ED
80// -> host_equals("www.example.com") = BU .www.example.com ED
81//
// Path and query patterns ({path, query}_{prefix, suffix, equals}) are
// translated similarly.
83//
84// With this, we can search the SubstringPatterns in the normalized URL.
85//
86//
87// Case 2: url_{prefix,suffix,equals,contains} searches.
88// =====================================================
89//
90// Step 1: as above
91//
92// Step 2:
93// Translate URL to String and add the following position markers:
94// - BU = Beginning of URL
95// - EU = End of URL
96// Furthermore, the hostname is canonicalized to start with a ".".
97//
98// -> www.example.com/index.html?search=foo becomes
99// BU .www.example.com/index.html?search=foo EU
100//
101// url_prefix(prefix) = BU add_missing_dot_prefix(prefix)
102// -> url_prefix("www.example") = BU .www.example
103//
104// url_contains(substring) = substring
105// -> url_contains("index") = index
106//
107//
108// Case 3: {host,path,query}_contains searches.
109// ============================================
110//
111// These kinds of searches are not supported directly but can be derived
112// by a combination of a url_contains() query followed by an explicit test:
113//
// host_contains(str) = url_contains(str) followed by test whether str occurs
//   in host component of original URL.
116// -> host_contains("example.co") = example.co
117// followed by gurl.host().find("example.co");
118//
119// [similarly for path_contains and query_contains].
120
121
122//
123// URLMatcherCondition
124//
125
126URLMatcherCondition::URLMatcherCondition()
127 : criterion_(HOST_PREFIX),
128 substring_pattern_(NULL) {}
129
130URLMatcherCondition::~URLMatcherCondition() {}
131
132URLMatcherCondition::URLMatcherCondition(
133 Criterion criterion,
134 const SubstringPattern* substring_pattern)
135 : criterion_(criterion),
136 substring_pattern_(substring_pattern) {}
137
138URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs)
139 : criterion_(rhs.criterion_),
140 substring_pattern_(rhs.substring_pattern_) {}
141
142URLMatcherCondition& URLMatcherCondition::operator=(
143 const URLMatcherCondition& rhs) {
144 criterion_ = rhs.criterion_;
145 substring_pattern_ = rhs.substring_pattern_;
146 return *this;
147}
148
149bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const {
150 if (criterion_ < rhs.criterion_) return true;
151 if (criterion_ > rhs.criterion_) return false;
152 if (substring_pattern_ != NULL && rhs.substring_pattern_ != NULL)
153 return *substring_pattern_ < *rhs.substring_pattern_;
154 if (substring_pattern_ == NULL && rhs.substring_pattern_ != NULL) return true;
155 // Either substring_pattern_ != NULL && rhs.substring_pattern_ == NULL,
156 // or both are NULL.
157 return false;
158}
159
160bool URLMatcherCondition::IsFullURLCondition() const {
161 // For these criteria the SubstringMatcher needs to be executed on the
162 // GURL that is canonlizaliced with
163 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches.
164 switch (criterion_) {
165 case HOST_CONTAINS:
166 case PATH_CONTAINS:
167 case QUERY_CONTAINS:
168 case URL_PREFIX:
169 case URL_SUFFIX:
170 case URL_CONTAINS:
171 case URL_EQUALS:
172 return true;
173 default:
174 break;
175 }
176 return false;
177}
178
179bool URLMatcherCondition::IsMatch(
180 const std::set<SubstringPattern::ID>& matching_substring_patterns,
181 const GURL& url) const {
182 DCHECK(substring_pattern_);
183 if (matching_substring_patterns.find(substring_pattern_->id()) ==
184 matching_substring_patterns.end())
185 return false;
186 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
187 // a substring match on the raw URL. In case of a match, we need to verify
188 // that the match was found in the correct component of the URL.
189 switch (criterion_) {
190 case HOST_CONTAINS:
191 return url.host().find(substring_pattern_->pattern()) !=
192 std::string::npos;
193 case PATH_CONTAINS:
194 return url.path().find(substring_pattern_->pattern()) !=
195 std::string::npos;
196 case QUERY_CONTAINS:
197 return url.query().find(substring_pattern_->pattern()) !=
198 std::string::npos;
199 default:
200 break;
201 }
202 return true;
203}
204
205//
206// URLMatcherConditionFactory
207//
208
namespace {
// Position markers inserted into canonicalized URLs and search patterns.
// Each marker is a one-character, NUL-terminated string whose character has
// the high bit set (bytes 0xFF, 0xFE, 0xFD, 0xFC), i.e. a symbol that is not
// contained in the 7-bit ASCII used in GURLs.
//
// The explicit static_cast keeps the initializers well-formed on platforms
// where plain char is unsigned: there, a bare negative constant in a braced
// initializer would be a narrowing conversion.
const char kBeginningOfURL[] = {static_cast<char>(-1), 0};
const char kEndOfDomain[] = {static_cast<char>(-2), 0};
const char kEndOfPath[] = {static_cast<char>(-3), 0};
const char kEndOfURL[] = {static_cast<char>(-4), 0};
}  // namespace
216
217URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
218
219URLMatcherConditionFactory::~URLMatcherConditionFactory() {
220 STLDeleteElements(&pattern_singletons_);
221}
222
223std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches(
224 const GURL& url) {
225 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
226 url.path() + kEndOfPath + (url.has_query() ? "?" + url.query() : "") +
227 kEndOfURL;
228}
229
230URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition(
231 const std::string& prefix) {
232 return CreateCondition(URLMatcherCondition::HOST_PREFIX,
233 kBeginningOfURL + CanonicalizeHostname(prefix));
234}
235
236URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition(
237 const std::string& suffix) {
238 return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
239 suffix + kEndOfDomain);
240}
241
242URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition(
243 const std::string& str) {
244 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
245}
246
247URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition(
248 const std::string& str) {
249 return CreateCondition(URLMatcherCondition::HOST_EQUALS,
250 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
251}
252
253URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition(
254 const std::string& prefix) {
255 return CreateCondition(URLMatcherCondition::PATH_PREFIX,
256 kEndOfDomain + prefix);
257}
258
259URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition(
260 const std::string& suffix) {
261 return CreateCondition(URLMatcherCondition::HOST_SUFFIX, suffix + kEndOfPath);
262}
263
264URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition(
265 const std::string& str) {
266 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
267}
268
269URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition(
270 const std::string& str) {
271 return CreateCondition(URLMatcherCondition::PATH_EQUALS,
272 kEndOfDomain + str + kEndOfPath);
273}
274
275URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition(
276 const std::string& prefix) {
277 return CreateCondition(URLMatcherCondition::QUERY_PREFIX,
278 kEndOfPath + prefix);
279}
280
281URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition(
282 const std::string& suffix) {
283 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, suffix + kEndOfURL);
284}
285
286URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition(
287 const std::string& str) {
288 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
289}
290
291URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition(
292 const std::string& str) {
293 return CreateCondition(URLMatcherCondition::QUERY_EQUALS,
294 kEndOfPath + str + kEndOfURL);
295}
296
297URLMatcherCondition
298 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
299 const std::string& host_suffix,
300 const std::string& path_prefix) {
301 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
302 host_suffix + kEndOfDomain + path_prefix);
303}
304
305std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches(
306 const GURL& url) {
307 return kBeginningOfURL + CanonicalizeHostname(url.host()) + url.path() +
308 (url.has_query() ? "?" + url.query() : "") + kEndOfURL;
309}
310
311URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition(
312 const std::string& prefix) {
313 return CreateCondition(URLMatcherCondition::URL_PREFIX,
314 kBeginningOfURL + CanonicalizeHostname(prefix));
315}
316
317URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition(
318 const std::string& suffix) {
319 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
320}
321
322URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition(
323 const std::string& str) {
324 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
325}
326
327URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition(
328 const std::string& str) {
329 return CreateCondition(URLMatcherCondition::QUERY_EQUALS,
330 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfURL);
331}
332
333void URLMatcherConditionFactory::ForgetUnusedPatterns(
334 const std::set<SubstringPattern::ID>& used_patterns) {
335 PatternSingletons::iterator i = pattern_singletons_.begin();
336 while (i != pattern_singletons_.end()) {
[email protected]2fb51d92012-02-17 15:05:47337 if (used_patterns.find((*i)->id()) != used_patterns.end()) {
[email protected]fb5bcc02012-02-17 14:05:42338 ++i;
[email protected]2fb51d92012-02-17 15:05:47339 } else {
340 delete *i;
[email protected]fb5bcc02012-02-17 14:05:42341 pattern_singletons_.erase(i++);
[email protected]2fb51d92012-02-17 15:05:47342 }
[email protected]fb5bcc02012-02-17 14:05:42343 }
344}
345
346URLMatcherCondition URLMatcherConditionFactory::CreateCondition(
347 URLMatcherCondition::Criterion criterion,
348 const std::string& pattern) {
349 SubstringPattern search_pattern(pattern, 0);
350 PatternSingletons::const_iterator iter =
351 pattern_singletons_.find(&search_pattern);
352 if (iter != pattern_singletons_.end()) {
353 return URLMatcherCondition(criterion, *iter);
354 } else {
355 SubstringPattern* new_pattern =
356 new SubstringPattern(pattern, id_counter_++);
357 pattern_singletons_.insert(new_pattern);
358 return URLMatcherCondition(criterion, new_pattern);
359 }
360}
361
362std::string URLMatcherConditionFactory::CanonicalizeHostname(
363 const std::string& hostname) const {
364 if (!hostname.empty() && hostname[0] == '.')
365 return hostname;
366 else
367 return "." + hostname;
368}
369
370bool URLMatcherConditionFactory::SubstringPatternPointerCompare::operator()(
371 SubstringPattern* lhs,
372 SubstringPattern* rhs) const {
373 if (lhs == NULL && rhs != NULL) return true;
374 if (lhs != NULL && rhs != NULL)
375 return lhs->pattern() < rhs->pattern();
376 // Either both are NULL or only rhs is NULL.
377 return false;
378}
379
380//
381// URLMatcherConditionSet
382//
383
384URLMatcherConditionSet::URLMatcherConditionSet() : id_(-1) {}
385
386URLMatcherConditionSet::~URLMatcherConditionSet() {}
387
388URLMatcherConditionSet::URLMatcherConditionSet(
389 ID id,
390 const Conditions& conditions)
391 : id_(id),
392 conditions_(conditions) {}
393
394URLMatcherConditionSet::URLMatcherConditionSet(
395 const URLMatcherConditionSet& rhs)
396 : id_(rhs.id_), conditions_(rhs.conditions_) {}
397
398URLMatcherConditionSet& URLMatcherConditionSet::operator=(
399 const URLMatcherConditionSet& rhs) {
400 id_ = rhs.id_;
401 conditions_ = rhs.conditions_;
402 return *this;
403}
404
405bool URLMatcherConditionSet::IsMatch(
406 const std::set<SubstringPattern::ID>& matching_substring_patterns,
407 const GURL& url) const {
408 for (Conditions::const_iterator i = conditions_.begin();
409 i != conditions_.end(); ++i) {
410 if (!i->IsMatch(matching_substring_patterns, url))
411 return false;
412 }
413 return true;
414}
415
416
417//
418// URLMatcher
419//
420
421URLMatcher::URLMatcher() {}
422
423URLMatcher::~URLMatcher() {}
424
425void URLMatcher::AddConditionSets(
426 const std::vector<URLMatcherConditionSet>& condition_sets) {
427 for (std::vector<URLMatcherConditionSet>::const_iterator i =
428 condition_sets.begin(); i != condition_sets.end(); ++i) {
429 DCHECK(url_matcher_condition_sets_.find(i->id()) ==
430 url_matcher_condition_sets_.end());
431 url_matcher_condition_sets_[i->id()] = *i;
432 }
433 UpdateInternalDatastructures();
434}
435
436void URLMatcher::RemoveConditionSets(
437 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) {
438 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
439 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) {
440 DCHECK(url_matcher_condition_sets_.find(*i) !=
441 url_matcher_condition_sets_.end());
442 url_matcher_condition_sets_.erase(*i);
443 }
444 UpdateInternalDatastructures();
445}
446
447std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL(const GURL& url) {
448 // Find all IDs of SubstringPatterns that match |url|.
449 // See URLMatcherConditionFactory for the canonicalization of URLs and the
450 // distinction between full url searches and url component searches.
451 std::set<SubstringPattern::ID> matches;
452 full_url_matcher_.Match(
453 condition_factory_.CanonicalizeURLForFullSearches(url), &matches);
454 url_component_matcher_.Match(
455 condition_factory_.CanonicalizeURLForComponentSearches(url), &matches);
456
457 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions
458 // were fulfilled.
459 std::set<URLMatcherConditionSet::ID> result;
460 for (std::set<SubstringPattern::ID>::const_iterator i = matches.begin();
461 i != matches.end(); ++i) {
462 // For each URLMatcherConditionSet there is exactly one condition
463 // registered in substring_match_triggers_. This means that the following
464 // logic tests each URLMatcherConditionSet exactly once if it can be
465 // completely fulfilled.
466 std::set<URLMatcherConditionSet::ID>& condition_sets =
467 substring_match_triggers_[*i];
468 for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
469 condition_sets.begin(); j != condition_sets.end(); ++j) {
470 if (url_matcher_condition_sets_[*j].IsMatch(matches, url))
471 result.insert(*j);
472 }
473 }
474
475 return result;
476}
477
478void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
479 // The purpose of |full_url_conditions| is just that we need to execute
480 // the same logic once for Full URL searches and once for URL Component
481 // searches (see URLMatcherConditionFactory).
482
483 // Determine which patterns need to be registered when this function
484 // terminates.
485 std::set<const SubstringPattern*> new_patterns;
486 for (URLMatcherConditionSets::const_iterator condition_set_iter =
487 url_matcher_condition_sets_.begin();
488 condition_set_iter != url_matcher_condition_sets_.end();
489 ++condition_set_iter) {
490 const URLMatcherConditionSet::Conditions& conditions =
491 condition_set_iter->second.conditions();
492 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
493 conditions.begin(); condition_iter != conditions.end();
494 ++condition_iter) {
495 // If we are called to process Full URL searches, ignore all others,
496 // and vice versa.
497 if (full_url_conditions == condition_iter->IsFullURLCondition())
498 new_patterns.insert(condition_iter->substring_pattern());
499 }
500 }
501
502 // This is the set of patterns that were registered before this function
503 // is called.
504 std::set<const SubstringPattern*>& registered_patterns =
505 full_url_conditions ? registered_full_url_patterns_
506 : registered_url_component_patterns_;
507
508 // Add all patterns that are in new_patterns but not in registered_patterns.
509 std::vector<const SubstringPattern*> patterns_to_register;
510 std::set_difference(
511 new_patterns.begin(), new_patterns.end(),
512 registered_patterns.begin(), registered_patterns.end(),
513 std::back_inserter(patterns_to_register));
514
515 // Remove all patterns that are in registered_patterns but not in
516 // new_patterns.
517 std::vector<const SubstringPattern*> patterns_to_unregister;
518 std::set_difference(
519 registered_patterns.begin(), registered_patterns.end(),
520 new_patterns.begin(), new_patterns.end(),
521 std::back_inserter(patterns_to_unregister));
522
523 // Update the SubstringSetMatcher.
524 SubstringSetMatcher& url_matcher =
525 full_url_conditions ? full_url_matcher_ : url_component_matcher_;
526 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
527 patterns_to_unregister);
528
529 // Update the set of registered_patterns for the next time this function
530 // is being called.
531 registered_patterns.swap(new_patterns);
532}
533
534void URLMatcher::UpdateTriggers() {
535 // Count substring pattern frequencies.
536 std::map<SubstringPattern::ID, size_t> substring_pattern_frequencies;
537 for (URLMatcherConditionSets::const_iterator condition_set_iter =
538 url_matcher_condition_sets_.begin();
539 condition_set_iter != url_matcher_condition_sets_.end();
540 ++condition_set_iter) {
541 const URLMatcherConditionSet::Conditions& conditions =
542 condition_set_iter->second.conditions();
543 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
544 conditions.begin(); condition_iter != conditions.end();
545 ++condition_iter) {
546 const SubstringPattern* pattern = condition_iter->substring_pattern();
547 substring_pattern_frequencies[pattern->id()]++;
548 }
549 }
550
551 // Update trigger conditions: Determine for each URLMatcherConditionSet which
552 // URLMatcherCondition contains a SubstringPattern that occurs least
553 // frequently in this URLMatcher. We assume that this condition is very
554 // specific and occurs rarely in URLs. If a match occurs for this
555 // URLMatcherCondition, we want to test all other URLMatcherCondition in the
556 // respective URLMatcherConditionSet as well to see whether the entire
557 // URLMatcherConditionSet is considered matching.
558 substring_match_triggers_.clear();
559 for (URLMatcherConditionSets::const_iterator condition_set_iter =
560 url_matcher_condition_sets_.begin();
561 condition_set_iter != url_matcher_condition_sets_.end();
562 ++condition_set_iter) {
563 const URLMatcherConditionSet::Conditions& conditions =
564 condition_set_iter->second.conditions();
565 if (conditions.empty())
566 continue;
567 URLMatcherConditionSet::Conditions::const_iterator condition_iter =
568 conditions.begin();
569 SubstringPattern::ID trigger = condition_iter->substring_pattern()->id();
570 // We skip the first element in the following loop.
571 ++condition_iter;
572 for (; condition_iter != conditions.end(); ++condition_iter) {
573 SubstringPattern::ID current_id =
574 condition_iter->substring_pattern()->id();
575 if (substring_pattern_frequencies[trigger] >
576 substring_pattern_frequencies[current_id]) {
577 trigger = current_id;
578 }
579 }
580 substring_match_triggers_[trigger].insert(condition_set_iter->second.id());
581 }
582}
583
584void URLMatcher::UpdateConditionFactory() {
585 std::set<SubstringPattern::ID> used_patterns;
586 for (URLMatcherConditionSets::const_iterator condition_set_iter =
587 url_matcher_condition_sets_.begin();
588 condition_set_iter != url_matcher_condition_sets_.end();
589 ++condition_set_iter) {
590 const URLMatcherConditionSet::Conditions& conditions =
591 condition_set_iter->second.conditions();
592 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
593 conditions.begin(); condition_iter != conditions.end();
594 ++condition_iter) {
595 used_patterns.insert(condition_iter->substring_pattern()->id());
596 }
597 }
598 condition_factory_.ForgetUnusedPatterns(used_patterns);
599}
600
601void URLMatcher::UpdateInternalDatastructures() {
602 UpdateSubstringSetMatcher(false);
603 UpdateSubstringSetMatcher(true);
604 UpdateTriggers();
605 UpdateConditionFactory();
606}
607
608} // namespace extensions