| // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_ |
| #define COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_ |
| |
| #include <stddef.h> |
| |
| #include <set> |
| #include <string> |
| |
| #include "base/files/file_path.h" |
| #include "base/gtest_prod_util.h" |
| #include "base/memory/ref_counted.h" |
| #include "components/history/core/browser/history_service.h" |
| #include "components/omnibox/browser/in_memory_url_index_cache.pb.h" |
| #include "components/omnibox/browser/in_memory_url_index_types.h" |
| #include "components/omnibox/browser/scored_history_match.h" |
| |
| class HistoryQuickProviderTest; |
| class TemplateURLService; |
| |
| namespace bookmarks { |
| class BookmarkModel; |
| } |
| |
| namespace in_memory_url_index { |
| class InMemoryURLIndexCacheItem; |
| } |
| |
| namespace history { |
| class HistoryDatabase; |
| class InMemoryURLIndex; |
| class RefCountedBool; |
| } |
| |
| // Current version of the cache file. |
| static const int kCurrentCacheFileVersion = 5; |
| |
| // A structure private to InMemoryURLIndex describing its internal data and |
| // providing for restoring, rebuilding and updating that internal data. As |
| // this class is for exclusive use by the InMemoryURLIndex class there should |
| // be no calls from any other class. |
| // |
| // All public member functions are called on the main thread unless otherwise |
| // annotated. |
| class URLIndexPrivateData |
| : public base::RefCountedThreadSafe<URLIndexPrivateData> { |
| public: |
| URLIndexPrivateData(); |
| |
| // Given a base::string16 in |term_string|, scans the history index and |
| // returns a vector with all scored, matching history items. The |
| // |term_string| is broken down into individual terms (words), each of which |
| // must occur in the candidate history item's URL or page title for the item |
| // to qualify; however, the terms do not necessarily have to be adjacent. We |
| // also allow breaking |term_string| at |cursor_position| (if |
| // set). Once we have a set of candidates, they are filtered to ensure |
| // that all |term_string| terms, as separated by whitespace and the |
| // cursor (if set), occur within the candidate's URL or page title. |
| // Scores are then calculated on no more than |kItemsToScoreLimit| |
| // candidates, as the scoring of such a large number of candidates may |
| // cause perceptible typing response delays in the omnibox. This is |
| // likely to occur for short omnibox terms such as 'h' and 'w' which |
| // will be found in nearly all history candidates. Results are sorted by |
| // descending score. The full results set (i.e. beyond the |
| // |kItemsToScoreLimit| limit) will be retained and used for subsequent calls |
| // to this function. In total, |max_matches| of items will be returned in the |
| // |ScoredHistoryMatches| vector. |
| ScoredHistoryMatches HistoryItemsForTerms( |
| base::string16 term_string, |
| size_t cursor_position, |
| size_t max_matches, |
| bookmarks::BookmarkModel* bookmark_model, |
| TemplateURLService* template_url_service); |
| |
| // Adds the history item in |row| to the index if it does not already already |
| // exist and it meets the minimum 'quick' criteria. If the row already exists |
| // in the index then the index will be updated if the row still meets the |
| // criteria, otherwise the row will be removed from the index. Returns true |
| // if the index was actually updated. |scheme_whitelist| is used to filter |
| // non-qualifying schemes. |history_service| is used to schedule an update to |
| // the recent visits component of this URL's entry in the index. |
| bool UpdateURL(history::HistoryService* history_service, |
| const history::URLRow& row, |
| const std::set<std::string>& scheme_whitelist, |
| base::CancelableTaskTracker* tracker); |
| |
| // Updates the entry for |url_id| in the index, replacing its |
| // recent visits information with |recent_visits|. If |url_id| |
| // is not in the index, does nothing. |
| void UpdateRecentVisits(history::URLID url_id, |
| const history::VisitVector& recent_visits); |
| |
| // Using |history_service| schedules an update (using the historyDB |
| // thread) for the recent visits information for |url_id|. Unless |
| // something unexpectedly goes wrong, UdpateRecentVisits() should |
| // eventually be called from a callback. |
| void ScheduleUpdateRecentVisits(history::HistoryService* history_service, |
| history::URLID url_id, |
| base::CancelableTaskTracker* tracker); |
| |
| // Deletes index data for the history item with the given |url|. |
| // The item may not have actually been indexed, which is the case if it did |
| // not previously meet minimum 'quick' criteria. Returns true if the index |
| // was actually updated. |
| bool DeleteURL(const GURL& url); |
| |
| // Constructs a new object by restoring its contents from the cache file |
| // at |path|. Returns the new URLIndexPrivateData which on success will |
| // contain the restored data but upon failure will be empty. |
| // This function should be run on the the file thread. |
| static scoped_refptr<URLIndexPrivateData> RestoreFromFile( |
| const base::FilePath& path); |
| |
| // Constructs a new object by rebuilding its contents from the history |
| // database in |history_db|. Returns the new URLIndexPrivateData which on |
| // success will contain the rebuilt data but upon failure will be empty. |
| static scoped_refptr<URLIndexPrivateData> RebuildFromHistory( |
| history::HistoryDatabase* history_db, |
| const std::set<std::string>& scheme_whitelist); |
| |
| // Writes |private_data| as a cache file to |file_path| and returns success. |
| static bool WritePrivateDataToCacheFileTask( |
| scoped_refptr<URLIndexPrivateData> private_data, |
| const base::FilePath& file_path); |
| |
| // Creates a copy of ourself. |
| scoped_refptr<URLIndexPrivateData> Duplicate() const; |
| |
| // Returns true if there is no data in the index. |
| bool Empty() const; |
| |
| // Initializes all index data members in preparation for restoring the index |
| // from the cache or a complete rebuild from the history database. |
| void Clear(); |
| |
| private: |
| friend class base::RefCountedThreadSafe<URLIndexPrivateData>; |
| ~URLIndexPrivateData(); |
| |
| friend class AddHistoryMatch; |
| friend class ::HistoryQuickProviderTest; |
| friend class InMemoryURLIndexTest; |
| FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, AddHistoryMatch); |
| FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, CacheSaveRestore); |
| FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, HugeResultSet); |
| FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, ReadVisitsFromHistory); |
| FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, RebuildFromHistoryIfCacheOld); |
| FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, Scoring); |
| FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TitleSearch); |
| FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, TypedCharacterCaching); |
| FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, WhitelistedURLs); |
| FRIEND_TEST_ALL_PREFIXES(LimitedInMemoryURLIndexTest, Initialization); |
| |
| // Support caching of term results so that we can optimize searches which |
| // build upon a previous search. Each entry in this map represents one |
| // search term from the most recent search. For example, if the user had |
| // typed "google blog trans" and then typed an additional 'l' (at the end, |
| // of course) then there would be four items in the cache: 'blog', 'google', |
| // 'trans', and 'transl'. All would be marked as being in use except for the |
| // 'trans' item; its cached data would have been used when optimizing the |
| // construction of the search results candidates for 'transl' but then would |
| // no longer needed. |
| // |
| // Items stored in the search term cache. If a search term exactly matches one |
| // in the cache then we can quickly supply the proper |history_id_set_| (and |
| // marking the cache item as being |used_|. If we find a prefix for a search |
| // term in the cache (which is very likely to occur as the user types each |
| // term into the omnibox) then we can short-circuit the index search for those |
| // characters in the prefix by returning the |word_id_set|. In that case we do |
| // not mark the item as being |used_|. |
| struct SearchTermCacheItem { |
| SearchTermCacheItem(const WordIDSet& word_id_set, |
| const HistoryIDSet& history_id_set); |
| // Creates a cache item for a term which has no results. |
| SearchTermCacheItem(); |
| SearchTermCacheItem(const SearchTermCacheItem& other); |
| |
| ~SearchTermCacheItem(); |
| |
| WordIDSet word_id_set_; |
| HistoryIDSet history_id_set_; |
| bool used_; // True if this item has been used for the current term search. |
| }; |
| typedef std::map<base::string16, SearchTermCacheItem> SearchTermCacheMap; |
| |
| // A helper class which performs the final filter on each candidate |
| // history URL match, inserting accepted matches into |scored_matches_|. |
| class AddHistoryMatch { |
| public: |
| AddHistoryMatch(bookmarks::BookmarkModel* bookmark_model, |
| TemplateURLService* template_url_service, |
| const URLIndexPrivateData& private_data, |
| const base::string16& lower_string, |
| const String16Vector& lower_terms, |
| const base::Time now); |
| AddHistoryMatch(const AddHistoryMatch& other); |
| ~AddHistoryMatch(); |
| |
| void operator()(const HistoryID history_id); |
| |
| ScoredHistoryMatches ScoredMatches() const { return scored_matches_; } |
| |
| private: |
| friend class InMemoryURLIndexTest; |
| FRIEND_TEST_ALL_PREFIXES(InMemoryURLIndexTest, AddHistoryMatch); |
| bookmarks::BookmarkModel* bookmark_model_; |
| TemplateURLService* template_url_service_; |
| const URLIndexPrivateData& private_data_; |
| ScoredHistoryMatches scored_matches_; |
| const base::string16& lower_string_; |
| const String16Vector& lower_terms_; |
| WordStarts lower_terms_to_word_starts_offsets_; |
| const base::Time now_; |
| }; |
| |
| // A helper predicate class used to filter excess history items when the |
| // candidate results set is too large. |
| class HistoryItemFactorGreater { |
| public: |
| explicit HistoryItemFactorGreater(const HistoryInfoMap& history_info_map); |
| ~HistoryItemFactorGreater(); |
| |
| bool operator()(const HistoryID h1, const HistoryID h2); |
| |
| private: |
| const HistoryInfoMap& history_info_map_; |
| }; |
| |
| // URL History indexing support functions. |
| |
| // Composes a set of history item IDs by intersecting the set for each word |
| // in |unsorted_words|. |
| HistoryIDSet HistoryIDSetFromWords(const String16Vector& unsorted_words); |
| |
| // Helper function to HistoryIDSetFromWords which composes a set of history |
| // ids for the given term given in |term|. |
| HistoryIDSet HistoryIDsForTerm(const base::string16& term); |
| |
| // Given a set of Char16s, finds words containing those characters. |
| WordIDSet WordIDSetForTermChars(const Char16Set& term_chars); |
| |
| // Indexes one URL history item as described by |row|. Returns true if the |
| // row was actually indexed. |scheme_whitelist| is used to filter |
| // non-qualifying schemes. If |history_db| is not NULL then this function |
| // uses the history database synchronously to get the URL's recent visits |
| // information. This mode should/ only be used on the historyDB thread. |
| // If |history_db| is NULL, then this function uses |history_service| to |
| // schedule a task on the historyDB thread to fetch and update the recent |
| // visits information. |
| bool IndexRow(history::HistoryDatabase* history_db, |
| history::HistoryService* history_service, |
| const history::URLRow& row, |
| const std::set<std::string>& scheme_whitelist, |
| base::CancelableTaskTracker* tracker); |
| |
| // Parses and indexes the words in the URL and page title of |row| and |
| // calculate the word starts in each, saving the starts in |word_starts|. |
| void AddRowWordsToIndex(const history::URLRow& row, |
| RowWordStarts* word_starts); |
| |
| // Given a single word in |uni_word|, adds a reference for the containing |
| // history item identified by |history_id| to the index. |
| void AddWordToIndex(const base::string16& uni_word, HistoryID history_id); |
| |
| // Creates a new entry in the word/history map for |word_id| and add |
| // |history_id| as the initial element of the word's set. |
| void AddWordHistory(const base::string16& uni_word, HistoryID history_id); |
| |
| // Updates an existing entry in the word/history index by adding the |
| // |history_id| to set for |word_id| in the word_id_history_map_. |
| void UpdateWordHistory(WordID word_id, HistoryID history_id); |
| |
| // Adds |word_id| to |history_id|'s entry in the history/word map, |
| // creating a new entry if one does not already exist. |
| void AddToHistoryIDWordMap(HistoryID history_id, WordID word_id); |
| |
| // Removes |row| and all associated words and characters from the index. |
| void RemoveRowFromIndex(const history::URLRow& row); |
| |
| // Removes all words and characters associated with |row| from the index. |
| void RemoveRowWordsFromIndex(const history::URLRow& row); |
| |
| // Clears |used_| for each item in the search term cache. |
| void ResetSearchTermCache(); |
| |
| // Caches the index private data and writes the cache file to the profile |
| // directory. Called by WritePrivateDataToCacheFileTask. |
| bool SaveToFile(const base::FilePath& file_path); |
| |
| // Encode a data structure into the protobuf |cache|. |
| void SavePrivateData( |
| in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; |
| void SaveWordList( |
| in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; |
| void SaveWordMap(in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; |
| void SaveCharWordMap( |
| in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; |
| void SaveWordIDHistoryMap( |
| in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; |
| void SaveHistoryInfoMap( |
| in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; |
| void SaveWordStartsMap( |
| in_memory_url_index::InMemoryURLIndexCacheItem* cache) const; |
| |
| // Decode a data structure from the protobuf |cache|. Return false if there |
| // is any kind of failure. |
| bool RestorePrivateData( |
| const in_memory_url_index::InMemoryURLIndexCacheItem& cache); |
| bool RestoreWordList( |
| const in_memory_url_index::InMemoryURLIndexCacheItem& cache); |
| bool RestoreWordMap( |
| const in_memory_url_index::InMemoryURLIndexCacheItem& cache); |
| bool RestoreCharWordMap( |
| const in_memory_url_index::InMemoryURLIndexCacheItem& cache); |
| bool RestoreWordIDHistoryMap( |
| const in_memory_url_index::InMemoryURLIndexCacheItem& cache); |
| bool RestoreHistoryInfoMap( |
| const in_memory_url_index::InMemoryURLIndexCacheItem& cache); |
| bool RestoreWordStartsMap( |
| const in_memory_url_index::InMemoryURLIndexCacheItem& cache); |
| |
| // Determines if |gurl| has a whitelisted scheme and returns true if so. |
| static bool URLSchemeIsWhitelisted(const GURL& gurl, |
| const std::set<std::string>& whitelist); |
| |
| // Cache of search terms. |
| SearchTermCacheMap search_term_cache_; |
| |
| // Start of data members that are cached ------------------------------------- |
| |
| // The version of the cache file most recently used to restore this instance |
| // of the private data. If the private data was rebuilt from the history |
| // database this will be 0. |
| int restored_cache_version_; |
| |
| // The last time the data was rebuilt from the history database. |
| base::Time last_time_rebuilt_from_history_; |
| |
| // A list of all of indexed words. The index of a word in this list is the |
| // ID of the word in the word_map_. It reduces the memory overhead by |
| // replacing a potentially long and repeated string with a simple index. |
| String16Vector word_list_; |
| |
| // A list of available words slots in |word_list_|. An available word slot |
| // is the index of a unused word in word_list_ vector, also referred to as |
| // a WordID. As URL visits are added or modified new words may be added to |
| // the index, in which case any available words are used, if any, and then |
| // words are added to the end of the word_list_. When URL visits are |
| // modified or deleted old words may be removed from the index, in which |
| // case the slots for those words are added to available_words_ for resuse |
| // by future URL updates. |
| WordIDSet available_words_; |
| |
| // A one-to-one mapping from the a word string to its slot number (i.e. |
| // WordID) in the |word_list_|. |
| WordMap word_map_; |
| |
| // A one-to-many mapping from a single character to all WordIDs of words |
| // containing that character. |
| CharWordIDMap char_word_map_; |
| |
| // A one-to-many mapping from a WordID to all HistoryIDs (the row_id as |
| // used in the history database) of history items in which the word occurs. |
| WordIDHistoryMap word_id_history_map_; |
| |
| // A one-to-many mapping from a HistoryID to all WordIDs of words that occur |
| // in the URL and/or page title of the history item referenced by that |
| // HistoryID. |
| HistoryIDWordMap history_id_word_map_; |
| |
| // A one-to-one mapping from HistoryID to the history item data governing |
| // index inclusion and relevance scoring. |
| HistoryInfoMap history_info_map_; |
| |
| // A one-to-one mapping from HistoryID to the word starts detected in each |
| // item's URL and page title. |
| WordStartsMap word_starts_map_; |
| |
| // End of data members that are cached --------------------------------------- |
| |
| // For unit testing only. Specifies the version of the cache file to be saved. |
| // Used only for testing upgrading of an older version of the cache upon |
| // restore. |
| int saved_cache_version_; |
| |
| // Used for unit testing only. Records the number of candidate history items |
| // at three stages in the index searching process. |
| size_t pre_filter_item_count_; // After word index is queried. |
| size_t post_filter_item_count_; // After trimming large result set. |
| size_t post_scoring_item_count_; // After performing final filter/scoring. |
| }; |
| |
| #endif // COMPONENTS_OMNIBOX_BROWSER_URL_INDEX_PRIVATE_DATA_H_ |