blob: cb96e163c51dce60c81cbb5dfcbb68158dedba85 [file] [log] [blame]
license.botbf09a502008-08-24 00:55:551// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
initial.commit09911bf2008-07-26 23:55:294
5#include "chrome/browser/history/snippet.h"
6
7#include <algorithm>
8
9#include "base/logging.h"
10#include "base/scoped_ptr.h"
11#include "base/string_util.h"
[email protected]9fd9092f2010-03-08 23:28:4112#include "base/utf_string_conversions.h"
initial.commit09911bf2008-07-26 23:55:2913#include "unicode/brkiter.h"
14#include "unicode/utext.h"
15#include "unicode/utf8.h"
16
17namespace {
18
[email protected]c29962f22008-12-03 00:47:5819bool PairFirstLessThan(const Snippet::MatchPosition& a,
20 const Snippet::MatchPosition& b) {
initial.commit09911bf2008-07-26 23:55:2921 return a.first < b.first;
22}
23
24// Combines all pairs after offset in match_positions that are contained
25// or touch the pair at offset.
26void CoalescePositionsFrom(size_t offset,
27 Snippet::MatchPositions* match_positions) {
28 DCHECK(offset < match_positions->size());
[email protected]c29962f22008-12-03 00:47:5829 Snippet::MatchPosition& pair((*match_positions)[offset]);
initial.commit09911bf2008-07-26 23:55:2930 ++offset;
31 while (offset < match_positions->size() &&
32 pair.second >= (*match_positions)[offset].first) {
33 pair.second = std::max(pair.second, (*match_positions)[offset].second);
34 match_positions->erase(match_positions->begin() + offset);
35 }
36}
37
38// Makes sure there is a pair in match_positions that contains the specified
39// range. This keeps the pairs ordered in match_positions by first, and makes
40// sure none of the pairs in match_positions touch each other.
[email protected]c29962f22008-12-03 00:47:5841void AddMatch(size_t start,
42 size_t end,
43 Snippet::MatchPositions* match_positions) {
44 DCHECK(start < end);
45 DCHECK(match_positions);
46 Snippet::MatchPosition pair(start, end);
initial.commit09911bf2008-07-26 23:55:2947 if (match_positions->empty()) {
48 match_positions->push_back(pair);
49 return;
50 }
51 // There's at least one match. Find the position of the new match,
52 // potentially extending pairs around it.
53 Snippet::MatchPositions::iterator i =
54 std::lower_bound(match_positions->begin(), match_positions->end(),
55 pair, &PairFirstLessThan);
56 if (i != match_positions->end() && i->first == start) {
57 // Match not at the end and there is already a pair with the same
58 // start.
59 if (end > i->second) {
60 // New pair extends beyond existing pair. Extend existing pair and
61 // coalesce matches after it.
62 i->second = end;
63 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
[email protected]9fd9092f2010-03-08 23:28:4164 } // else case, new pair completely contained in existing pair, nothing
65 // to do.
initial.commit09911bf2008-07-26 23:55:2966 } else if (i == match_positions->begin()) {
67 // Match at the beginning and the first pair doesn't have the same
68 // start. Insert new pair and coalesce matches after it.
69 match_positions->insert(i, pair);
70 CoalescePositionsFrom(0, match_positions);
71 } else {
72 // Not at the beginning (but may be at the end).
73 --i;
74 if (start <= i->second && end > i->second) {
75 // Previous element contains match. Extend it and coalesce.
76 i->second = end;
77 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
78 } else if (end > i->second) {
79 // Region doesn't touch previous element. See if region touches current
80 // element.
81 ++i;
82 if (i == match_positions->end() || end < i->first) {
83 match_positions->insert(i, pair);
84 } else {
85 i->first = start;
86 i->second = end;
87 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
88 }
89 }
90 }
91}
92
[email protected]e53668962010-06-23 15:35:2593// Converts an index in a utf8 string into the index in the corresponding utf16
94// string and returns the utf16 index. This is intended to be called in a loop
initial.commit09911bf2008-07-26 23:55:2995// iterating through a utf8 string.
96//
97// utf8_string: the utf8 string.
98// utf8_length: length of the utf8 string.
99// offset: the utf8 offset to convert.
100// utf8_pos: current offset in the utf8 string. This is modified and on return
101// matches offset.
102// wide_pos: current index in the wide string. This is the same as the return
103// value.
[email protected]e53668962010-06-23 15:35:25104size_t AdvanceAndReturnUTF16Pos(const char* utf8_string,
105 int32_t utf8_length,
106 int32_t offset,
107 int32_t* utf8_pos,
108 size_t* utf16_pos) {
initial.commit09911bf2008-07-26 23:55:29109 DCHECK(offset >= *utf8_pos && offset <= utf8_length);
110
111 UChar32 wide_char;
112 while (*utf8_pos < offset) {
113 U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
[email protected]e53668962010-06-23 15:35:25114 *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2;
initial.commit09911bf2008-07-26 23:55:29115 }
[email protected]e53668962010-06-23 15:35:25116 return *utf16_pos;
initial.commit09911bf2008-07-26 23:55:29117}
118
119// Given a character break iterator over a UTF-8 string, set the iterator
120// position to |*utf8_pos| and move by |count| characters. |count| can
121// be either positive or negative.
[email protected]b5b2385a2009-08-18 05:12:29122void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
initial.commit09911bf2008-07-26 23:55:29123 // Ignore the return value. A side effect of the current position
124 // being set at or following |*utf8_pos| is exploited here.
125 // It's simpler than calling following(n) and then previous().
126 // isBoundary() is not very fast, but should be good enough for the
127 // snippet generation. If not, revisit the way we scan in ComputeSnippet.
128 bi->isBoundary(*utf8_pos);
129 bi->next(count);
[email protected]c29962f22008-12-03 00:47:58130 *utf8_pos = static_cast<size_t>(bi->current());
initial.commit09911bf2008-07-26 23:55:29131}
132
// Number of graphemes (not bytes) of surrounding context shown on each side
// of a hit.
const int kSnippetContext = 50;
136
137// Returns true if next match falls within a snippet window
138// from the previous match. The window size is counted in terms
139// of graphemes rather than bytes in UTF-8.
[email protected]b5b2385a2009-08-18 05:12:29140bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
[email protected]c29962f22008-12-03 00:47:58141 size_t previous_match_end,
142 size_t next_match_start) {
initial.commit09911bf2008-07-26 23:55:29143 // If it's within a window in terms of bytes, it's certain
144 // that it's within a window in terms of graphemes as well.
145 if (next_match_start < previous_match_end + kSnippetContext)
146 return true;
147 bi->isBoundary(previous_match_end);
148 // An alternative to this is to call |bi->next()| at most
149 // kSnippetContext times, compare |bi->current()| with |next_match_start|
150 // after each call and return early if possible. There are other
151 // heuristics to speed things up if necessary, but it's not likely that
152 // we need to bother.
153 bi->next(kSnippetContext);
[email protected]65d55d82009-07-28 21:15:56154 int64 current = bi->current();
155 return (next_match_start < static_cast<uint64>(current) ||
[email protected]b5b2385a2009-08-18 05:12:29156 current == icu::BreakIterator::DONE);
initial.commit09911bf2008-07-26 23:55:29157}
158
159} // namespace
160
161// static
162void Snippet::ExtractMatchPositions(const std::string& offsets_str,
163 const std::string& column_num,
164 MatchPositions* match_positions) {
165 DCHECK(match_positions);
166 if (offsets_str.empty())
167 return;
168 std::vector<std::string> offsets;
169 SplitString(offsets_str, ' ', &offsets);
170 // SQLite offsets are sets of four integers:
171 // column, query term, match offset, match length
172 // Matches within a string are marked by (start, end) pairs.
173 for (size_t i = 0; i < offsets.size() - 3; i += 4) {
174 if (offsets[i] != column_num)
175 continue;
[email protected]c29962f22008-12-03 00:47:58176 const size_t start = atoi(offsets[i + 2].c_str());
177 const size_t end = start + atoi(offsets[i + 3].c_str());
[email protected]135b1652009-08-11 21:43:11178 // Switch to DCHECK after debugging http://crbug.com/15261.
179 CHECK(end >= start);
initial.commit09911bf2008-07-26 23:55:29180 AddMatch(start, end, match_positions);
181 }
182}
183
184// static
185void Snippet::ConvertMatchPositionsToWide(
186 const std::string& utf8_string,
187 Snippet::MatchPositions* match_positions) {
188 DCHECK(match_positions);
[email protected]c29962f22008-12-03 00:47:58189 int32_t utf8_pos = 0;
[email protected]e53668962010-06-23 15:35:25190 size_t utf16_pos = 0;
initial.commit09911bf2008-07-26 23:55:29191 const char* utf8_cstring = utf8_string.c_str();
[email protected]c29962f22008-12-03 00:47:58192 const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
initial.commit09911bf2008-07-26 23:55:29193 for (Snippet::MatchPositions::iterator i = match_positions->begin();
194 i != match_positions->end(); ++i) {
[email protected]e53668962010-06-23 15:35:25195 i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
196 i->first, &utf8_pos, &utf16_pos);
197 i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
198 i->second, &utf8_pos, &utf16_pos);
initial.commit09911bf2008-07-26 23:55:29199 }
200}
201
202void Snippet::ComputeSnippet(const MatchPositions& match_positions,
203 const std::string& document) {
204 // The length of snippets we try to produce.
205 // We can generate longer snippets but stop once we cross kSnippetMaxLength.
206 const size_t kSnippetMaxLength = 200;
[email protected]e53668962010-06-23 15:35:25207 const string16 kEllipsis = ASCIIToUTF16(" ... ");
initial.commit09911bf2008-07-26 23:55:29208
initial.commit09911bf2008-07-26 23:55:29209 UText* document_utext = NULL;
210 UErrorCode status = U_ZERO_ERROR;
211 document_utext = utext_openUTF8(document_utext, document.data(),
[email protected]c29962f22008-12-03 00:47:58212 document.size(), &status);
initial.commit09911bf2008-07-26 23:55:29213 // Locale does not matter because there's no per-locale customization
214 // for character iterator.
[email protected]b5b2385a2009-08-18 05:12:29215 scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance(
216 icu::Locale::getDefault(), status));
initial.commit09911bf2008-07-26 23:55:29217 bi->setText(document_utext, status);
218 DCHECK(U_SUCCESS(status));
219
220 // We build the snippet by iterating through the matches and then grabbing
221 // context around each match. If matches are near enough each other (within
222 // kSnippetContext), we skip the "..." between them.
[email protected]e53668962010-06-23 15:35:25223 string16 snippet;
[email protected]c29962f22008-12-03 00:47:58224 size_t start = 0;
initial.commit09911bf2008-07-26 23:55:29225 for (size_t i = 0; i < match_positions.size(); ++i) {
226 // Some shorter names for the current match.
[email protected]c29962f22008-12-03 00:47:58227 const size_t match_start = match_positions[i].first;
228 const size_t match_end = match_positions[i].second;
initial.commit09911bf2008-07-26 23:55:29229
[email protected]135b1652009-08-11 21:43:11230 // Switch to DCHECK after debugging http://crbug.com/15261.
231 CHECK(match_end > match_start);
232 CHECK(match_end <= document.size());
233
initial.commit09911bf2008-07-26 23:55:29234 // Add the context, if any, to show before the match.
[email protected]c29962f22008-12-03 00:47:58235 size_t context_start = match_start;
initial.commit09911bf2008-07-26 23:55:29236 MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
237 start = std::max(start, context_start);
238 if (start < match_start) {
239 if (start > 0)
240 snippet += kEllipsis;
[email protected]135b1652009-08-11 21:43:11241 // Switch to DCHECK after debugging http://crbug.com/15261.
242 CHECK(start < document.size());
[email protected]e53668962010-06-23 15:35:25243 snippet += UTF8ToUTF16(document.substr(start, match_start - start));
initial.commit09911bf2008-07-26 23:55:29244 }
245
246 // Add the match.
[email protected]c29962f22008-12-03 00:47:58247 const size_t first = snippet.size();
[email protected]e53668962010-06-23 15:35:25248 snippet += UTF8ToUTF16(document.substr(match_start,
initial.commit09911bf2008-07-26 23:55:29249 match_end - match_start));
[email protected]c29962f22008-12-03 00:47:58250 matches_.push_back(std::make_pair(first, snippet.size()));
initial.commit09911bf2008-07-26 23:55:29251
252 // Compute the context, if any, to show after the match.
[email protected]c29962f22008-12-03 00:47:58253 size_t end;
initial.commit09911bf2008-07-26 23:55:29254 // Check if the next match falls within our snippet window.
255 if (i + 1 < match_positions.size() &&
256 IsNextMatchWithinSnippetWindow(bi.get(), match_end,
[email protected]c29962f22008-12-03 00:47:58257 match_positions[i + 1].first)) {
initial.commit09911bf2008-07-26 23:55:29258 // Yes, it's within the window. Make the end context extend just up
259 // to the next match.
260 end = match_positions[i + 1].first;
[email protected]135b1652009-08-11 21:43:11261 // Switch to DCHECK after debugging http://crbug.com/15261.
262 CHECK(end >= match_end);
263 CHECK(end <= document.size());
[email protected]e53668962010-06-23 15:35:25264 snippet += UTF8ToUTF16(document.substr(match_end, end - match_end));
initial.commit09911bf2008-07-26 23:55:29265 } else {
266 // No, there's either no next match or the next match is too far away.
267 end = match_end;
268 MoveByNGraphemes(bi.get(), kSnippetContext, &end);
[email protected]135b1652009-08-11 21:43:11269 // Switch to DCHECK after debugging http://crbug.com/15261.
270 CHECK(end >= match_end);
271 CHECK(end <= document.size());
[email protected]e53668962010-06-23 15:35:25272 snippet += UTF8ToUTF16(document.substr(match_end, end - match_end));
[email protected]c29962f22008-12-03 00:47:58273 if (end < document.size())
initial.commit09911bf2008-07-26 23:55:29274 snippet += kEllipsis;
275 }
276 start = end;
277
278 // Stop here if we have enough snippet computed.
279 if (snippet.size() >= kSnippetMaxLength)
280 break;
281 }
282
283 utext_close(document_utext);
284 swap(text_, snippet);
285}