// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

5#include "chrome/browser/history/snippet.h"
6
7#include <algorithm>
8
9#include "base/logging.h"
10#include "base/scoped_ptr.h"
11#include "base/string_util.h"
12#include "unicode/brkiter.h"
13#include "unicode/utext.h"
14#include "unicode/utf8.h"
15
16namespace {
17
[email protected]c29962f22008-12-03 00:47:5818bool PairFirstLessThan(const Snippet::MatchPosition& a,
19 const Snippet::MatchPosition& b) {
initial.commit09911bf2008-07-26 23:55:2920 return a.first < b.first;
21}
22
23// Combines all pairs after offset in match_positions that are contained
24// or touch the pair at offset.
25void CoalescePositionsFrom(size_t offset,
26 Snippet::MatchPositions* match_positions) {
27 DCHECK(offset < match_positions->size());
[email protected]c29962f22008-12-03 00:47:5828 Snippet::MatchPosition& pair((*match_positions)[offset]);
initial.commit09911bf2008-07-26 23:55:2929 ++offset;
30 while (offset < match_positions->size() &&
31 pair.second >= (*match_positions)[offset].first) {
32 pair.second = std::max(pair.second, (*match_positions)[offset].second);
33 match_positions->erase(match_positions->begin() + offset);
34 }
35}
36
37// Makes sure there is a pair in match_positions that contains the specified
38// range. This keeps the pairs ordered in match_positions by first, and makes
39// sure none of the pairs in match_positions touch each other.
[email protected]c29962f22008-12-03 00:47:5840void AddMatch(size_t start,
41 size_t end,
42 Snippet::MatchPositions* match_positions) {
43 DCHECK(start < end);
44 DCHECK(match_positions);
45 Snippet::MatchPosition pair(start, end);
initial.commit09911bf2008-07-26 23:55:2946 if (match_positions->empty()) {
47 match_positions->push_back(pair);
48 return;
49 }
50 // There's at least one match. Find the position of the new match,
51 // potentially extending pairs around it.
52 Snippet::MatchPositions::iterator i =
53 std::lower_bound(match_positions->begin(), match_positions->end(),
54 pair, &PairFirstLessThan);
55 if (i != match_positions->end() && i->first == start) {
56 // Match not at the end and there is already a pair with the same
57 // start.
58 if (end > i->second) {
59 // New pair extends beyond existing pair. Extend existing pair and
60 // coalesce matches after it.
61 i->second = end;
62 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
63 } // else case, new pair completely contained in existing pair, nothing
64 // to do.
65 } else if (i == match_positions->begin()) {
66 // Match at the beginning and the first pair doesn't have the same
67 // start. Insert new pair and coalesce matches after it.
68 match_positions->insert(i, pair);
69 CoalescePositionsFrom(0, match_positions);
70 } else {
71 // Not at the beginning (but may be at the end).
72 --i;
73 if (start <= i->second && end > i->second) {
74 // Previous element contains match. Extend it and coalesce.
75 i->second = end;
76 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
77 } else if (end > i->second) {
78 // Region doesn't touch previous element. See if region touches current
79 // element.
80 ++i;
81 if (i == match_positions->end() || end < i->first) {
82 match_positions->insert(i, pair);
83 } else {
84 i->first = start;
85 i->second = end;
86 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
87 }
88 }
89 }
90}
91
92// Converts an index in a utf8 string into the index in the corresponding wide
93// string and returns the wide index. This is intended to be called in a loop
94// iterating through a utf8 string.
95//
96// utf8_string: the utf8 string.
97// utf8_length: length of the utf8 string.
98// offset: the utf8 offset to convert.
99// utf8_pos: current offset in the utf8 string. This is modified and on return
100// matches offset.
101// wide_pos: current index in the wide string. This is the same as the return
102// value.
[email protected]c29962f22008-12-03 00:47:58103size_t AdvanceAndReturnWidePos(const char* utf8_string,
104 int32_t utf8_length,
105 int32_t offset,
106 int32_t* utf8_pos,
107 size_t* wide_pos) {
initial.commit09911bf2008-07-26 23:55:29108 DCHECK(offset >= *utf8_pos && offset <= utf8_length);
109
110 UChar32 wide_char;
111 while (*utf8_pos < offset) {
112 U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
113 *wide_pos += (wide_char <= 0xFFFF) ? 1 : 2;
114 }
115 return *wide_pos;
116}
117
118// Given a character break iterator over a UTF-8 string, set the iterator
119// position to |*utf8_pos| and move by |count| characters. |count| can
120// be either positive or negative.
[email protected]b5b2385a2009-08-18 05:12:29121void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
initial.commit09911bf2008-07-26 23:55:29122 // Ignore the return value. A side effect of the current position
123 // being set at or following |*utf8_pos| is exploited here.
124 // It's simpler than calling following(n) and then previous().
125 // isBoundary() is not very fast, but should be good enough for the
126 // snippet generation. If not, revisit the way we scan in ComputeSnippet.
127 bi->isBoundary(*utf8_pos);
128 bi->next(count);
[email protected]c29962f22008-12-03 00:47:58129 *utf8_pos = static_cast<size_t>(bi->current());
initial.commit09911bf2008-07-26 23:55:29130}
131
// How much surrounding text to show on each side of a hit. The unit is
// graphemes, not bytes.
const int kSnippetContext = 50;
135
136// Returns true if next match falls within a snippet window
137// from the previous match. The window size is counted in terms
138// of graphemes rather than bytes in UTF-8.
[email protected]b5b2385a2009-08-18 05:12:29139bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
[email protected]c29962f22008-12-03 00:47:58140 size_t previous_match_end,
141 size_t next_match_start) {
initial.commit09911bf2008-07-26 23:55:29142 // If it's within a window in terms of bytes, it's certain
143 // that it's within a window in terms of graphemes as well.
144 if (next_match_start < previous_match_end + kSnippetContext)
145 return true;
146 bi->isBoundary(previous_match_end);
147 // An alternative to this is to call |bi->next()| at most
148 // kSnippetContext times, compare |bi->current()| with |next_match_start|
149 // after each call and return early if possible. There are other
150 // heuristics to speed things up if necessary, but it's not likely that
151 // we need to bother.
152 bi->next(kSnippetContext);
[email protected]65d55d82009-07-28 21:15:56153 int64 current = bi->current();
154 return (next_match_start < static_cast<uint64>(current) ||
[email protected]b5b2385a2009-08-18 05:12:29155 current == icu::BreakIterator::DONE);
initial.commit09911bf2008-07-26 23:55:29156}
157
158} // namespace
159
160// static
161void Snippet::ExtractMatchPositions(const std::string& offsets_str,
162 const std::string& column_num,
163 MatchPositions* match_positions) {
164 DCHECK(match_positions);
165 if (offsets_str.empty())
166 return;
167 std::vector<std::string> offsets;
168 SplitString(offsets_str, ' ', &offsets);
169 // SQLite offsets are sets of four integers:
170 // column, query term, match offset, match length
171 // Matches within a string are marked by (start, end) pairs.
172 for (size_t i = 0; i < offsets.size() - 3; i += 4) {
173 if (offsets[i] != column_num)
174 continue;
[email protected]c29962f22008-12-03 00:47:58175 const size_t start = atoi(offsets[i + 2].c_str());
176 const size_t end = start + atoi(offsets[i + 3].c_str());
[email protected]135b1652009-08-11 21:43:11177 // Switch to DCHECK after debugging http://crbug.com/15261.
178 CHECK(end >= start);
initial.commit09911bf2008-07-26 23:55:29179 AddMatch(start, end, match_positions);
180 }
181}
182
183// static
184void Snippet::ConvertMatchPositionsToWide(
185 const std::string& utf8_string,
186 Snippet::MatchPositions* match_positions) {
187 DCHECK(match_positions);
[email protected]c29962f22008-12-03 00:47:58188 int32_t utf8_pos = 0;
189 size_t wide_pos = 0;
initial.commit09911bf2008-07-26 23:55:29190 const char* utf8_cstring = utf8_string.c_str();
[email protected]c29962f22008-12-03 00:47:58191 const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
initial.commit09911bf2008-07-26 23:55:29192 for (Snippet::MatchPositions::iterator i = match_positions->begin();
193 i != match_positions->end(); ++i) {
194 i->first = AdvanceAndReturnWidePos(utf8_cstring, utf8_length,
195 i->first, &utf8_pos, &wide_pos);
[email protected]c29962f22008-12-03 00:47:58196 i->second = AdvanceAndReturnWidePos(utf8_cstring, utf8_length,
197 i->second, &utf8_pos, &wide_pos);
initial.commit09911bf2008-07-26 23:55:29198 }
199}
200
201void Snippet::ComputeSnippet(const MatchPositions& match_positions,
202 const std::string& document) {
203 // The length of snippets we try to produce.
204 // We can generate longer snippets but stop once we cross kSnippetMaxLength.
205 const size_t kSnippetMaxLength = 200;
initial.commit09911bf2008-07-26 23:55:29206 const std::wstring kEllipsis = L" ... ";
207
initial.commit09911bf2008-07-26 23:55:29208 UText* document_utext = NULL;
209 UErrorCode status = U_ZERO_ERROR;
210 document_utext = utext_openUTF8(document_utext, document.data(),
[email protected]c29962f22008-12-03 00:47:58211 document.size(), &status);
initial.commit09911bf2008-07-26 23:55:29212 // Locale does not matter because there's no per-locale customization
213 // for character iterator.
[email protected]b5b2385a2009-08-18 05:12:29214 scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance(
215 icu::Locale::getDefault(), status));
initial.commit09911bf2008-07-26 23:55:29216 bi->setText(document_utext, status);
217 DCHECK(U_SUCCESS(status));
218
219 // We build the snippet by iterating through the matches and then grabbing
220 // context around each match. If matches are near enough each other (within
221 // kSnippetContext), we skip the "..." between them.
222 std::wstring snippet;
[email protected]c29962f22008-12-03 00:47:58223 size_t start = 0;
initial.commit09911bf2008-07-26 23:55:29224 for (size_t i = 0; i < match_positions.size(); ++i) {
225 // Some shorter names for the current match.
[email protected]c29962f22008-12-03 00:47:58226 const size_t match_start = match_positions[i].first;
227 const size_t match_end = match_positions[i].second;
initial.commit09911bf2008-07-26 23:55:29228
[email protected]135b1652009-08-11 21:43:11229 // Switch to DCHECK after debugging http://crbug.com/15261.
230 CHECK(match_end > match_start);
231 CHECK(match_end <= document.size());
232
initial.commit09911bf2008-07-26 23:55:29233 // Add the context, if any, to show before the match.
[email protected]c29962f22008-12-03 00:47:58234 size_t context_start = match_start;
initial.commit09911bf2008-07-26 23:55:29235 MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
236 start = std::max(start, context_start);
237 if (start < match_start) {
238 if (start > 0)
239 snippet += kEllipsis;
[email protected]135b1652009-08-11 21:43:11240 // Switch to DCHECK after debugging http://crbug.com/15261.
241 CHECK(start < document.size());
initial.commit09911bf2008-07-26 23:55:29242 snippet += UTF8ToWide(document.substr(start, match_start - start));
243 }
244
245 // Add the match.
[email protected]c29962f22008-12-03 00:47:58246 const size_t first = snippet.size();
initial.commit09911bf2008-07-26 23:55:29247 snippet += UTF8ToWide(document.substr(match_start,
248 match_end - match_start));
[email protected]c29962f22008-12-03 00:47:58249 matches_.push_back(std::make_pair(first, snippet.size()));
initial.commit09911bf2008-07-26 23:55:29250
251 // Compute the context, if any, to show after the match.
[email protected]c29962f22008-12-03 00:47:58252 size_t end;
initial.commit09911bf2008-07-26 23:55:29253 // Check if the next match falls within our snippet window.
254 if (i + 1 < match_positions.size() &&
255 IsNextMatchWithinSnippetWindow(bi.get(), match_end,
[email protected]c29962f22008-12-03 00:47:58256 match_positions[i + 1].first)) {
initial.commit09911bf2008-07-26 23:55:29257 // Yes, it's within the window. Make the end context extend just up
258 // to the next match.
259 end = match_positions[i + 1].first;
[email protected]135b1652009-08-11 21:43:11260 // Switch to DCHECK after debugging http://crbug.com/15261.
261 CHECK(end >= match_end);
262 CHECK(end <= document.size());
initial.commit09911bf2008-07-26 23:55:29263 snippet += UTF8ToWide(document.substr(match_end, end - match_end));
264 } else {
265 // No, there's either no next match or the next match is too far away.
266 end = match_end;
267 MoveByNGraphemes(bi.get(), kSnippetContext, &end);
[email protected]135b1652009-08-11 21:43:11268 // Switch to DCHECK after debugging http://crbug.com/15261.
269 CHECK(end >= match_end);
270 CHECK(end <= document.size());
initial.commit09911bf2008-07-26 23:55:29271 snippet += UTF8ToWide(document.substr(match_end, end - match_end));
[email protected]c29962f22008-12-03 00:47:58272 if (end < document.size())
initial.commit09911bf2008-07-26 23:55:29273 snippet += kEllipsis;
274 }
275 start = end;
276
277 // Stop here if we have enough snippet computed.
278 if (snippet.size() >= kSnippetMaxLength)
279 break;
280 }
281
282 utext_close(document_utext);
283 swap(text_, snippet);
284}