blob: 5079e3462fa51b19c688d1ad472d6dc5ec417217 [file] [log] [blame]
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
initial.commit09911bf2008-07-26 23:55:294
[email protected]acf9f272014-04-15 23:04:005#include "components/query_parser/snippet.h"
initial.commit09911bf2008-07-26 23:55:296
avif57136c12015-12-25 23:27:457#include <stdint.h>
8
initial.commit09911bf2008-07-26 23:55:299#include <algorithm>
dcheng82beb4f2016-04-26 00:35:0210#include <memory>
initial.commit09911bf2008-07-26 23:55:2911
12#include "base/logging.h"
[email protected]1988e1c2013-02-28 20:27:4213#include "base/strings/string_split.h"
[email protected]d8830562013-06-10 22:01:5414#include "base/strings/string_util.h"
[email protected]112158af2013-06-07 23:46:1815#include "base/strings/utf_string_conversions.h"
[email protected]8bbf6192013-07-18 11:14:0416#include "third_party/icu/source/common/unicode/brkiter.h"
17#include "third_party/icu/source/common/unicode/utext.h"
18#include "third_party/icu/source/common/unicode/utf8.h"
initial.commit09911bf2008-07-26 23:55:2919
[email protected]acf9f272014-04-15 23:04:0020namespace query_parser {
initial.commit09911bf2008-07-26 23:55:2921namespace {
22
[email protected]c29962f22008-12-03 00:47:5823bool PairFirstLessThan(const Snippet::MatchPosition& a,
24 const Snippet::MatchPosition& b) {
initial.commit09911bf2008-07-26 23:55:2925 return a.first < b.first;
26}
27
28// Combines all pairs after offset in match_positions that are contained
29// or touch the pair at offset.
30void CoalescePositionsFrom(size_t offset,
31 Snippet::MatchPositions* match_positions) {
32 DCHECK(offset < match_positions->size());
[email protected]c29962f22008-12-03 00:47:5833 Snippet::MatchPosition& pair((*match_positions)[offset]);
initial.commit09911bf2008-07-26 23:55:2934 ++offset;
35 while (offset < match_positions->size() &&
36 pair.second >= (*match_positions)[offset].first) {
37 pair.second = std::max(pair.second, (*match_positions)[offset].second);
38 match_positions->erase(match_positions->begin() + offset);
39 }
40}
41
42// Makes sure there is a pair in match_positions that contains the specified
43// range. This keeps the pairs ordered in match_positions by first, and makes
44// sure none of the pairs in match_positions touch each other.
[email protected]c29962f22008-12-03 00:47:5845void AddMatch(size_t start,
46 size_t end,
47 Snippet::MatchPositions* match_positions) {
48 DCHECK(start < end);
49 DCHECK(match_positions);
50 Snippet::MatchPosition pair(start, end);
initial.commit09911bf2008-07-26 23:55:2951 if (match_positions->empty()) {
52 match_positions->push_back(pair);
53 return;
54 }
55 // There's at least one match. Find the position of the new match,
56 // potentially extending pairs around it.
57 Snippet::MatchPositions::iterator i =
58 std::lower_bound(match_positions->begin(), match_positions->end(),
59 pair, &PairFirstLessThan);
60 if (i != match_positions->end() && i->first == start) {
61 // Match not at the end and there is already a pair with the same
62 // start.
63 if (end > i->second) {
64 // New pair extends beyond existing pair. Extend existing pair and
65 // coalesce matches after it.
66 i->second = end;
67 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
[email protected]9fd9092f2010-03-08 23:28:4168 } // else case, new pair completely contained in existing pair, nothing
69 // to do.
initial.commit09911bf2008-07-26 23:55:2970 } else if (i == match_positions->begin()) {
71 // Match at the beginning and the first pair doesn't have the same
72 // start. Insert new pair and coalesce matches after it.
73 match_positions->insert(i, pair);
74 CoalescePositionsFrom(0, match_positions);
75 } else {
76 // Not at the beginning (but may be at the end).
77 --i;
78 if (start <= i->second && end > i->second) {
79 // Previous element contains match. Extend it and coalesce.
80 i->second = end;
81 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
82 } else if (end > i->second) {
83 // Region doesn't touch previous element. See if region touches current
84 // element.
85 ++i;
86 if (i == match_positions->end() || end < i->first) {
87 match_positions->insert(i, pair);
88 } else {
89 i->first = start;
90 i->second = end;
91 CoalescePositionsFrom(i - match_positions->begin(), match_positions);
92 }
93 }
94 }
95}
96
[email protected]e53668962010-06-23 15:35:2597// Converts an index in a utf8 string into the index in the corresponding utf16
98// string and returns the utf16 index. This is intended to be called in a loop
initial.commit09911bf2008-07-26 23:55:2999// iterating through a utf8 string.
100//
101// utf8_string: the utf8 string.
102// utf8_length: length of the utf8 string.
103// offset: the utf8 offset to convert.
104// utf8_pos: current offset in the utf8 string. This is modified and on return
105// matches offset.
106// wide_pos: current index in the wide string. This is the same as the return
107// value.
[email protected]e53668962010-06-23 15:35:25108size_t AdvanceAndReturnUTF16Pos(const char* utf8_string,
109 int32_t utf8_length,
110 int32_t offset,
111 int32_t* utf8_pos,
112 size_t* utf16_pos) {
initial.commit09911bf2008-07-26 23:55:29113 DCHECK(offset >= *utf8_pos && offset <= utf8_length);
114
115 UChar32 wide_char;
116 while (*utf8_pos < offset) {
117 U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
[email protected]e53668962010-06-23 15:35:25118 *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2;
initial.commit09911bf2008-07-26 23:55:29119 }
[email protected]e53668962010-06-23 15:35:25120 return *utf16_pos;
initial.commit09911bf2008-07-26 23:55:29121}
122
123// Given a character break iterator over a UTF-8 string, set the iterator
124// position to |*utf8_pos| and move by |count| characters. |count| can
125// be either positive or negative.
[email protected]b5b2385a2009-08-18 05:12:29126void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
initial.commit09911bf2008-07-26 23:55:29127 // Ignore the return value. A side effect of the current position
128 // being set at or following |*utf8_pos| is exploited here.
129 // It's simpler than calling following(n) and then previous().
130 // isBoundary() is not very fast, but should be good enough for the
131 // snippet generation. If not, revisit the way we scan in ComputeSnippet.
[email protected]acf9f272014-04-15 23:04:00132 bi->isBoundary(static_cast<int32_t>(*utf8_pos));
initial.commit09911bf2008-07-26 23:55:29133 bi->next(count);
[email protected]c29962f22008-12-03 00:47:58134 *utf8_pos = static_cast<size_t>(bi->current());
initial.commit09911bf2008-07-26 23:55:29135}
136
// How much context to show on each side of a hit, measured in graphemes
// (not bytes).
constexpr int kSnippetContext = 50;
140
141// Returns true if next match falls within a snippet window
142// from the previous match. The window size is counted in terms
143// of graphemes rather than bytes in UTF-8.
[email protected]b5b2385a2009-08-18 05:12:29144bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
[email protected]c29962f22008-12-03 00:47:58145 size_t previous_match_end,
146 size_t next_match_start) {
initial.commit09911bf2008-07-26 23:55:29147 // If it's within a window in terms of bytes, it's certain
148 // that it's within a window in terms of graphemes as well.
149 if (next_match_start < previous_match_end + kSnippetContext)
150 return true;
[email protected]acf9f272014-04-15 23:04:00151 bi->isBoundary(static_cast<int32_t>(previous_match_end));
initial.commit09911bf2008-07-26 23:55:29152 // An alternative to this is to call |bi->next()| at most
153 // kSnippetContext times, compare |bi->current()| with |next_match_start|
154 // after each call and return early if possible. There are other
155 // heuristics to speed things up if necessary, but it's not likely that
156 // we need to bother.
157 bi->next(kSnippetContext);
avif57136c12015-12-25 23:27:45158 int64_t current = bi->current();
159 return (next_match_start < static_cast<uint64_t>(current) ||
[email protected]b5b2385a2009-08-18 05:12:29160 current == icu::BreakIterator::DONE);
initial.commit09911bf2008-07-26 23:55:29161}
162
163} // namespace
164
165// static
166void Snippet::ExtractMatchPositions(const std::string& offsets_str,
167 const std::string& column_num,
168 MatchPositions* match_positions) {
169 DCHECK(match_positions);
170 if (offsets_str.empty())
171 return;
brettw8be197d12015-07-23 23:23:31172 std::vector<std::string> offsets = base::SplitString(
173 offsets_str, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
initial.commit09911bf2008-07-26 23:55:29174 // SQLite offsets are sets of four integers:
175 // column, query term, match offset, match length
176 // Matches within a string are marked by (start, end) pairs.
177 for (size_t i = 0; i < offsets.size() - 3; i += 4) {
178 if (offsets[i] != column_num)
179 continue;
[email protected]c29962f22008-12-03 00:47:58180 const size_t start = atoi(offsets[i + 2].c_str());
181 const size_t end = start + atoi(offsets[i + 3].c_str());
[email protected]135b1652009-08-11 21:43:11182 // Switch to DCHECK after debugging http://crbug.com/15261.
183 CHECK(end >= start);
initial.commit09911bf2008-07-26 23:55:29184 AddMatch(start, end, match_positions);
185 }
186}
187
188// static
189void Snippet::ConvertMatchPositionsToWide(
190 const std::string& utf8_string,
191 Snippet::MatchPositions* match_positions) {
192 DCHECK(match_positions);
[email protected]c29962f22008-12-03 00:47:58193 int32_t utf8_pos = 0;
[email protected]e53668962010-06-23 15:35:25194 size_t utf16_pos = 0;
initial.commit09911bf2008-07-26 23:55:29195 const char* utf8_cstring = utf8_string.c_str();
[email protected]c29962f22008-12-03 00:47:58196 const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
initial.commit09911bf2008-07-26 23:55:29197 for (Snippet::MatchPositions::iterator i = match_positions->begin();
198 i != match_positions->end(); ++i) {
[email protected]e53668962010-06-23 15:35:25199 i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
[email protected]acf9f272014-04-15 23:04:00200 static_cast<int32_t>(i->first),
201 &utf8_pos, &utf16_pos);
[email protected]e53668962010-06-23 15:35:25202 i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
[email protected]acf9f272014-04-15 23:04:00203 static_cast<int32_t>(i->second),
204 &utf8_pos, &utf16_pos);
initial.commit09911bf2008-07-26 23:55:29205 }
206}
207
[email protected]20f0487a2010-09-30 20:06:30208Snippet::Snippet() {
209}
210
vmpstrb6449d512016-02-25 23:55:40211Snippet::Snippet(const Snippet& other) = default;
212
brettw57694b362017-03-31 17:23:40213// TODO(bug 706963) this should be implemented as "= default" when Android
214// toolchain is updated.
215Snippet::Snippet(Snippet&& other) noexcept
216 : text_(std::move(other.text_)), matches_(std::move(other.matches_)) {}
217
[email protected]20f0487a2010-09-30 20:06:30218Snippet::~Snippet() {
219}
220
brettw57694b362017-03-31 17:23:40221Snippet& Snippet::operator=(const Snippet&) = default;
222
initial.commit09911bf2008-07-26 23:55:29223void Snippet::ComputeSnippet(const MatchPositions& match_positions,
224 const std::string& document) {
225 // The length of snippets we try to produce.
226 // We can generate longer snippets but stop once we cross kSnippetMaxLength.
227 const size_t kSnippetMaxLength = 200;
[email protected]04338722013-12-24 23:18:05228 const base::string16 kEllipsis = base::ASCIIToUTF16(" ... ");
initial.commit09911bf2008-07-26 23:55:29229
Ivan Kotenkov75b1c3a2017-10-24 14:47:24230 UText* document_utext = nullptr;
initial.commit09911bf2008-07-26 23:55:29231 UErrorCode status = U_ZERO_ERROR;
232 document_utext = utext_openUTF8(document_utext, document.data(),
[email protected]c29962f22008-12-03 00:47:58233 document.size(), &status);
initial.commit09911bf2008-07-26 23:55:29234 // Locale does not matter because there's no per-locale customization
235 // for character iterator.
dcheng82beb4f2016-04-26 00:35:02236 std::unique_ptr<icu::BreakIterator> bi(
237 icu::BreakIterator::createCharacterInstance(icu::Locale::getDefault(),
238 status));
initial.commit09911bf2008-07-26 23:55:29239 bi->setText(document_utext, status);
240 DCHECK(U_SUCCESS(status));
241
242 // We build the snippet by iterating through the matches and then grabbing
243 // context around each match. If matches are near enough each other (within
244 // kSnippetContext), we skip the "..." between them.
[email protected]439f1e32013-12-09 20:09:09245 base::string16 snippet;
[email protected]c29962f22008-12-03 00:47:58246 size_t start = 0;
initial.commit09911bf2008-07-26 23:55:29247 for (size_t i = 0; i < match_positions.size(); ++i) {
248 // Some shorter names for the current match.
[email protected]c29962f22008-12-03 00:47:58249 const size_t match_start = match_positions[i].first;
250 const size_t match_end = match_positions[i].second;
initial.commit09911bf2008-07-26 23:55:29251
[email protected]135b1652009-08-11 21:43:11252 // Switch to DCHECK after debugging http://crbug.com/15261.
253 CHECK(match_end > match_start);
254 CHECK(match_end <= document.size());
255
initial.commit09911bf2008-07-26 23:55:29256 // Add the context, if any, to show before the match.
[email protected]c29962f22008-12-03 00:47:58257 size_t context_start = match_start;
initial.commit09911bf2008-07-26 23:55:29258 MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
259 start = std::max(start, context_start);
260 if (start < match_start) {
261 if (start > 0)
262 snippet += kEllipsis;
[email protected]135b1652009-08-11 21:43:11263 // Switch to DCHECK after debugging http://crbug.com/15261.
264 CHECK(start < document.size());
[email protected]04338722013-12-24 23:18:05265 snippet += base::UTF8ToUTF16(document.substr(start, match_start - start));
initial.commit09911bf2008-07-26 23:55:29266 }
267
268 // Add the match.
[email protected]c29962f22008-12-03 00:47:58269 const size_t first = snippet.size();
[email protected]04338722013-12-24 23:18:05270 snippet += base::UTF8ToUTF16(document.substr(match_start,
271 match_end - match_start));
[email protected]c29962f22008-12-03 00:47:58272 matches_.push_back(std::make_pair(first, snippet.size()));
initial.commit09911bf2008-07-26 23:55:29273
274 // Compute the context, if any, to show after the match.
[email protected]c29962f22008-12-03 00:47:58275 size_t end;
initial.commit09911bf2008-07-26 23:55:29276 // Check if the next match falls within our snippet window.
277 if (i + 1 < match_positions.size() &&
278 IsNextMatchWithinSnippetWindow(bi.get(), match_end,
[email protected]c29962f22008-12-03 00:47:58279 match_positions[i + 1].first)) {
initial.commit09911bf2008-07-26 23:55:29280 // Yes, it's within the window. Make the end context extend just up
281 // to the next match.
282 end = match_positions[i + 1].first;
[email protected]135b1652009-08-11 21:43:11283 // Switch to DCHECK after debugging http://crbug.com/15261.
284 CHECK(end >= match_end);
285 CHECK(end <= document.size());
[email protected]04338722013-12-24 23:18:05286 snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
initial.commit09911bf2008-07-26 23:55:29287 } else {
288 // No, there's either no next match or the next match is too far away.
289 end = match_end;
290 MoveByNGraphemes(bi.get(), kSnippetContext, &end);
[email protected]135b1652009-08-11 21:43:11291 // Switch to DCHECK after debugging http://crbug.com/15261.
292 CHECK(end >= match_end);
293 CHECK(end <= document.size());
[email protected]04338722013-12-24 23:18:05294 snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
[email protected]c29962f22008-12-03 00:47:58295 if (end < document.size())
initial.commit09911bf2008-07-26 23:55:29296 snippet += kEllipsis;
297 }
298 start = end;
299
300 // Stop here if we have enough snippet computed.
301 if (snippet.size() >= kSnippetMaxLength)
302 break;
303 }
304
305 utext_close(document_utext);
306 swap(text_, snippet);
307}
[email protected]20f0487a2010-09-30 20:06:30308
309void Snippet::Swap(Snippet* other) {
310 text_.swap(other->text_);
311 matches_.swap(other->matches_);
312}
[email protected]acf9f272014-04-15 23:04:00313
314} // namespace query_parser