blob: d72209e5a41fc5f917df61972405caced12eddf2 [file] [log] [blame]
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
initial.commit09911bf2008-07-26 23:55:294
5#include "chrome/browser/history/snippet.h"
6
7#include <algorithm>
8
9#include "base/logging.h"
10#include "base/scoped_ptr.h"
11#include "base/string_util.h"
12#include "unicode/brkiter.h"
13#include "unicode/utext.h"
14#include "unicode/utf8.h"
15
16namespace {
17
// Ordering predicate for match ranges: compares two (start, end) pairs by
// their start offset only. The end offset (|second|) never participates.
bool PairFirstLessThan(const std::pair<int,int>& a,
                       const std::pair<int,int>& b) {
  return b.first > a.first;
}
22
23// Combines all pairs after offset in match_positions that are contained
24// or touch the pair at offset.
25void CoalescePositionsFrom(size_t offset,
26 Snippet::MatchPositions* match_positions) {
27 DCHECK(offset < match_positions->size());
28 std::pair<int,int>& pair((*match_positions)[offset]);
29 ++offset;
30 while (offset < match_positions->size() &&
31 pair.second >= (*match_positions)[offset].first) {
32 pair.second = std::max(pair.second, (*match_positions)[offset].second);
33 match_positions->erase(match_positions->begin() + offset);
34 }
35}
36
37// Makes sure there is a pair in match_positions that contains the specified
38// range. This keeps the pairs ordered in match_positions by first, and makes
39// sure none of the pairs in match_positions touch each other.
40void AddMatch(int start, int end, Snippet::MatchPositions* match_positions) {
41 DCHECK(start < end && match_positions);
42 std::pair<int,int> pair(start, end);
43 if (match_positions->empty()) {
44 match_positions->push_back(pair);
45 return;
46 }
47 // There's at least one match. Find the position of the new match,
48 // potentially extending pairs around it.
49 Snippet::MatchPositions::iterator i =
50 std::lower_bound(match_positions->begin(), match_positions->end(),
51 pair, &PairFirstLessThan);
52 if (i != match_positions->end() && i->first == start) {
53 // Match not at the end and there is already a pair with the same
54 // start.
55 if (end > i->second) {
56 // New pair extends beyond existing pair. Extend existing pair and
57 // coalesce matches after it.
58 i->second = end;
59 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
60 } // else case, new pair completely contained in existing pair, nothing
61 // to do.
62 } else if (i == match_positions->begin()) {
63 // Match at the beginning and the first pair doesn't have the same
64 // start. Insert new pair and coalesce matches after it.
65 match_positions->insert(i, pair);
66 CoalescePositionsFrom(0, match_positions);
67 } else {
68 // Not at the beginning (but may be at the end).
69 --i;
70 if (start <= i->second && end > i->second) {
71 // Previous element contains match. Extend it and coalesce.
72 i->second = end;
73 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
74 } else if (end > i->second) {
75 // Region doesn't touch previous element. See if region touches current
76 // element.
77 ++i;
78 if (i == match_positions->end() || end < i->first) {
79 match_positions->insert(i, pair);
80 } else {
81 i->first = start;
82 i->second = end;
83 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
84 }
85 }
86 }
87}
88
89// Converts an index in a utf8 string into the index in the corresponding wide
90// string and returns the wide index. This is intended to be called in a loop
91// iterating through a utf8 string.
92//
93// utf8_string: the utf8 string.
94// utf8_length: length of the utf8 string.
95// offset: the utf8 offset to convert.
96// utf8_pos: current offset in the utf8 string. This is modified and on return
97// matches offset.
98// wide_pos: current index in the wide string. This is the same as the return
99// value.
100int AdvanceAndReturnWidePos(const char* utf8_string,
101 int utf8_length,
102 int offset,
103 int* utf8_pos,
104 int* wide_pos) {
105 DCHECK(offset >= *utf8_pos && offset <= utf8_length);
106
107 UChar32 wide_char;
108 while (*utf8_pos < offset) {
109 U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
110 *wide_pos += (wide_char <= 0xFFFF) ? 1 : 2;
111 }
112 return *wide_pos;
113}
114
115// Given a character break iterator over a UTF-8 string, set the iterator
116// position to |*utf8_pos| and move by |count| characters. |count| can
117// be either positive or negative.
118void MoveByNGraphemes(BreakIterator* bi, int count, int* utf8_pos) {
119 // Ignore the return value. A side effect of the current position
120 // being set at or following |*utf8_pos| is exploited here.
121 // It's simpler than calling following(n) and then previous().
122 // isBoundary() is not very fast, but should be good enough for the
123 // snippet generation. If not, revisit the way we scan in ComputeSnippet.
124 bi->isBoundary(*utf8_pos);
125 bi->next(count);
126 *utf8_pos = static_cast<int>(bi->current());
127}
128
129// The amount of context to include for a given hit. Note that it's counted
130// in terms of graphemes rather than bytes.
131const int kSnippetContext = 50;
132
133// Returns true if next match falls within a snippet window
134// from the previous match. The window size is counted in terms
135// of graphemes rather than bytes in UTF-8.
136bool IsNextMatchWithinSnippetWindow(BreakIterator* bi,
137 int previous_match_end,
138 int next_match_start) {
139 // If it's within a window in terms of bytes, it's certain
140 // that it's within a window in terms of graphemes as well.
141 if (next_match_start < previous_match_end + kSnippetContext)
142 return true;
143 bi->isBoundary(previous_match_end);
144 // An alternative to this is to call |bi->next()| at most
145 // kSnippetContext times, compare |bi->current()| with |next_match_start|
146 // after each call and return early if possible. There are other
147 // heuristics to speed things up if necessary, but it's not likely that
148 // we need to bother.
149 bi->next(kSnippetContext);
150 int64_t current = bi->current();
151 return (next_match_start < current || current == BreakIterator::DONE);
152}
153
154} // namespace
155
156// static
157void Snippet::ExtractMatchPositions(const std::string& offsets_str,
158 const std::string& column_num,
159 MatchPositions* match_positions) {
160 DCHECK(match_positions);
161 if (offsets_str.empty())
162 return;
163 std::vector<std::string> offsets;
164 SplitString(offsets_str, ' ', &offsets);
165 // SQLite offsets are sets of four integers:
166 // column, query term, match offset, match length
167 // Matches within a string are marked by (start, end) pairs.
168 for (size_t i = 0; i < offsets.size() - 3; i += 4) {
169 if (offsets[i] != column_num)
170 continue;
171 const int start = atoi(offsets[i+2].c_str());
172 const int end = start + atoi(offsets[i+3].c_str());
173 AddMatch(start, end, match_positions);
174 }
175}
176
177// static
178void Snippet::ConvertMatchPositionsToWide(
179 const std::string& utf8_string,
180 Snippet::MatchPositions* match_positions) {
181 DCHECK(match_positions);
182 int utf8_pos = 0;
183 int wide_pos = 0;
184 const char* utf8_cstring = utf8_string.c_str();
185 const int utf8_length = static_cast<int>(utf8_string.size());
186 for (Snippet::MatchPositions::iterator i = match_positions->begin();
187 i != match_positions->end(); ++i) {
188 i->first = AdvanceAndReturnWidePos(utf8_cstring, utf8_length,
189 i->first, &utf8_pos, &wide_pos);
190 i->second =
191 AdvanceAndReturnWidePos(utf8_cstring, utf8_length, i->second, &utf8_pos,
192 &wide_pos);
193 }
194}
195
196void Snippet::ComputeSnippet(const MatchPositions& match_positions,
197 const std::string& document) {
198 // The length of snippets we try to produce.
199 // We can generate longer snippets but stop once we cross kSnippetMaxLength.
200 const size_t kSnippetMaxLength = 200;
201
202
203 const std::wstring kEllipsis = L" ... ";
204
205 // Grab the size as an int to cut down on casts later.
206 const int document_size = static_cast<int>(document.size());
207
208 UText* document_utext = NULL;
209 UErrorCode status = U_ZERO_ERROR;
210 document_utext = utext_openUTF8(document_utext, document.data(),
211 document_size, &status);
212 // Locale does not matter because there's no per-locale customization
213 // for character iterator.
214 scoped_ptr<BreakIterator> bi(
215 BreakIterator::createCharacterInstance(Locale::getDefault(), status));
216 bi->setText(document_utext, status);
217 DCHECK(U_SUCCESS(status));
218
219 // We build the snippet by iterating through the matches and then grabbing
220 // context around each match. If matches are near enough each other (within
221 // kSnippetContext), we skip the "..." between them.
222 std::wstring snippet;
223 int start = 0;
224 for (size_t i = 0; i < match_positions.size(); ++i) {
225 // Some shorter names for the current match.
226 const int match_start = match_positions[i].first;
227 const int match_end = match_positions[i].second;
228
229 // Add the context, if any, to show before the match.
230 int context_start = match_start;
231 MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
232 start = std::max(start, context_start);
233 if (start < match_start) {
234 if (start > 0)
235 snippet += kEllipsis;
236 snippet += UTF8ToWide(document.substr(start, match_start - start));
237 }
238
239 // Add the match.
240 matches_.push_back(std::make_pair(static_cast<int>(snippet.size()), 0));
241 snippet += UTF8ToWide(document.substr(match_start,
242 match_end - match_start));
243 matches_.back().second = static_cast<int>(snippet.size());
244
245 // Compute the context, if any, to show after the match.
246 int end;
247 // Check if the next match falls within our snippet window.
248 if (i + 1 < match_positions.size() &&
249 IsNextMatchWithinSnippetWindow(bi.get(), match_end,
250 match_positions[i + 1].first)) {
251 // Yes, it's within the window. Make the end context extend just up
252 // to the next match.
253 end = match_positions[i + 1].first;
254 snippet += UTF8ToWide(document.substr(match_end, end - match_end));
255 } else {
256 // No, there's either no next match or the next match is too far away.
257 end = match_end;
258 MoveByNGraphemes(bi.get(), kSnippetContext, &end);
259 snippet += UTF8ToWide(document.substr(match_end, end - match_end));
260 if (end < document_size)
261 snippet += kEllipsis;
262 }
263 start = end;
264
265 // Stop here if we have enough snippet computed.
266 if (snippet.size() >= kSnippetMaxLength)
267 break;
268 }
269
270 utext_close(document_utext);
271 swap(text_, snippet);
272}
license.botbf09a502008-08-24 00:55:55273