[email protected] | acf9f27 | 2014-04-15 23:04:00 | [diff] [blame] | 1 | // Copyright 2014 The Chromium Authors. All rights reserved. |
license.bot | bf09a50 | 2008-08-24 00:55:55 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 4 | |
[email protected] | acf9f27 | 2014-04-15 23:04:00 | [diff] [blame] | 5 | #include "components/query_parser/snippet.h" |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 6 | |
avi | f57136c1 | 2015-12-25 23:27:45 | [diff] [blame] | 7 | #include <stdint.h> |
| 8 | |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 9 | #include <algorithm> |
dcheng | 82beb4f | 2016-04-26 00:35:02 | [diff] [blame] | 10 | #include <memory> |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 11 | |
| 12 | #include "base/logging.h" |
[email protected] | 1988e1c | 2013-02-28 20:27:42 | [diff] [blame] | 13 | #include "base/strings/string_split.h" |
[email protected] | d883056 | 2013-06-10 22:01:54 | [diff] [blame] | 14 | #include "base/strings/string_util.h" |
[email protected] | 112158af | 2013-06-07 23:46:18 | [diff] [blame] | 15 | #include "base/strings/utf_string_conversions.h" |
[email protected] | 8bbf619 | 2013-07-18 11:14:04 | [diff] [blame] | 16 | #include "third_party/icu/source/common/unicode/brkiter.h" |
| 17 | #include "third_party/icu/source/common/unicode/utext.h" |
| 18 | #include "third_party/icu/source/common/unicode/utf8.h" |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 19 | |
[email protected] | acf9f27 | 2014-04-15 23:04:00 | [diff] [blame] | 20 | namespace query_parser { |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 21 | namespace { |
| 22 | |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 23 | bool PairFirstLessThan(const Snippet::MatchPosition& a, |
| 24 | const Snippet::MatchPosition& b) { |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 25 | return a.first < b.first; |
| 26 | } |
| 27 | |
| 28 | // Combines all pairs after offset in match_positions that are contained |
| 29 | // or touch the pair at offset. |
| 30 | void CoalescePositionsFrom(size_t offset, |
| 31 | Snippet::MatchPositions* match_positions) { |
| 32 | DCHECK(offset < match_positions->size()); |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 33 | Snippet::MatchPosition& pair((*match_positions)[offset]); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 34 | ++offset; |
| 35 | while (offset < match_positions->size() && |
| 36 | pair.second >= (*match_positions)[offset].first) { |
| 37 | pair.second = std::max(pair.second, (*match_positions)[offset].second); |
| 38 | match_positions->erase(match_positions->begin() + offset); |
| 39 | } |
| 40 | } |
| 41 | |
| 42 | // Makes sure there is a pair in match_positions that contains the specified |
| 43 | // range. This keeps the pairs ordered in match_positions by first, and makes |
| 44 | // sure none of the pairs in match_positions touch each other. |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 45 | void AddMatch(size_t start, |
| 46 | size_t end, |
| 47 | Snippet::MatchPositions* match_positions) { |
| 48 | DCHECK(start < end); |
| 49 | DCHECK(match_positions); |
| 50 | Snippet::MatchPosition pair(start, end); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 51 | if (match_positions->empty()) { |
| 52 | match_positions->push_back(pair); |
| 53 | return; |
| 54 | } |
| 55 | // There's at least one match. Find the position of the new match, |
| 56 | // potentially extending pairs around it. |
| 57 | Snippet::MatchPositions::iterator i = |
| 58 | std::lower_bound(match_positions->begin(), match_positions->end(), |
| 59 | pair, &PairFirstLessThan); |
| 60 | if (i != match_positions->end() && i->first == start) { |
| 61 | // Match not at the end and there is already a pair with the same |
| 62 | // start. |
| 63 | if (end > i->second) { |
| 64 | // New pair extends beyond existing pair. Extend existing pair and |
| 65 | // coalesce matches after it. |
| 66 | i->second = end; |
| 67 | CoalescePositionsFrom(i - match_positions->begin(), match_positions); |
[email protected] | 9fd9092f | 2010-03-08 23:28:41 | [diff] [blame] | 68 | } // else case, new pair completely contained in existing pair, nothing |
| 69 | // to do. |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 70 | } else if (i == match_positions->begin()) { |
| 71 | // Match at the beginning and the first pair doesn't have the same |
| 72 | // start. Insert new pair and coalesce matches after it. |
| 73 | match_positions->insert(i, pair); |
| 74 | CoalescePositionsFrom(0, match_positions); |
| 75 | } else { |
| 76 | // Not at the beginning (but may be at the end). |
| 77 | --i; |
| 78 | if (start <= i->second && end > i->second) { |
| 79 | // Previous element contains match. Extend it and coalesce. |
| 80 | i->second = end; |
| 81 | CoalescePositionsFrom(i - match_positions->begin(), match_positions); |
| 82 | } else if (end > i->second) { |
| 83 | // Region doesn't touch previous element. See if region touches current |
| 84 | // element. |
| 85 | ++i; |
| 86 | if (i == match_positions->end() || end < i->first) { |
| 87 | match_positions->insert(i, pair); |
| 88 | } else { |
| 89 | i->first = start; |
| 90 | i->second = end; |
| 91 | CoalescePositionsFrom(i - match_positions->begin(), match_positions); |
| 92 | } |
| 93 | } |
| 94 | } |
| 95 | } |
| 96 | |
[email protected] | e5366896 | 2010-06-23 15:35:25 | [diff] [blame] | 97 | // Converts an index in a utf8 string into the index in the corresponding utf16 |
| 98 | // string and returns the utf16 index. This is intended to be called in a loop |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 99 | // iterating through a utf8 string. |
| 100 | // |
| 101 | // utf8_string: the utf8 string. |
| 102 | // utf8_length: length of the utf8 string. |
| 103 | // offset: the utf8 offset to convert. |
| 104 | // utf8_pos: current offset in the utf8 string. This is modified and on return |
| 105 | // matches offset. |
| 106 | // wide_pos: current index in the wide string. This is the same as the return |
| 107 | // value. |
[email protected] | e5366896 | 2010-06-23 15:35:25 | [diff] [blame] | 108 | size_t AdvanceAndReturnUTF16Pos(const char* utf8_string, |
| 109 | int32_t utf8_length, |
| 110 | int32_t offset, |
| 111 | int32_t* utf8_pos, |
| 112 | size_t* utf16_pos) { |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 113 | DCHECK(offset >= *utf8_pos && offset <= utf8_length); |
| 114 | |
| 115 | UChar32 wide_char; |
| 116 | while (*utf8_pos < offset) { |
| 117 | U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char); |
[email protected] | e5366896 | 2010-06-23 15:35:25 | [diff] [blame] | 118 | *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2; |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 119 | } |
[email protected] | e5366896 | 2010-06-23 15:35:25 | [diff] [blame] | 120 | return *utf16_pos; |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 121 | } |
| 122 | |
| 123 | // Given a character break iterator over a UTF-8 string, set the iterator |
| 124 | // position to |*utf8_pos| and move by |count| characters. |count| can |
| 125 | // be either positive or negative. |
[email protected] | b5b2385a | 2009-08-18 05:12:29 | [diff] [blame] | 126 | void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) { |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 127 | // Ignore the return value. A side effect of the current position |
| 128 | // being set at or following |*utf8_pos| is exploited here. |
| 129 | // It's simpler than calling following(n) and then previous(). |
| 130 | // isBoundary() is not very fast, but should be good enough for the |
| 131 | // snippet generation. If not, revisit the way we scan in ComputeSnippet. |
[email protected] | acf9f27 | 2014-04-15 23:04:00 | [diff] [blame] | 132 | bi->isBoundary(static_cast<int32_t>(*utf8_pos)); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 133 | bi->next(count); |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 134 | *utf8_pos = static_cast<size_t>(bi->current()); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 135 | } |
| 136 | |
// Number of graphemes (not bytes) of surrounding context to include on each
// side of a hit.
constexpr int kSnippetContext = 50;
| 140 | |
| 141 | // Returns true if next match falls within a snippet window |
| 142 | // from the previous match. The window size is counted in terms |
| 143 | // of graphemes rather than bytes in UTF-8. |
[email protected] | b5b2385a | 2009-08-18 05:12:29 | [diff] [blame] | 144 | bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi, |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 145 | size_t previous_match_end, |
| 146 | size_t next_match_start) { |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 147 | // If it's within a window in terms of bytes, it's certain |
| 148 | // that it's within a window in terms of graphemes as well. |
| 149 | if (next_match_start < previous_match_end + kSnippetContext) |
| 150 | return true; |
[email protected] | acf9f27 | 2014-04-15 23:04:00 | [diff] [blame] | 151 | bi->isBoundary(static_cast<int32_t>(previous_match_end)); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 152 | // An alternative to this is to call |bi->next()| at most |
| 153 | // kSnippetContext times, compare |bi->current()| with |next_match_start| |
| 154 | // after each call and return early if possible. There are other |
| 155 | // heuristics to speed things up if necessary, but it's not likely that |
| 156 | // we need to bother. |
| 157 | bi->next(kSnippetContext); |
avi | f57136c1 | 2015-12-25 23:27:45 | [diff] [blame] | 158 | int64_t current = bi->current(); |
| 159 | return (next_match_start < static_cast<uint64_t>(current) || |
[email protected] | b5b2385a | 2009-08-18 05:12:29 | [diff] [blame] | 160 | current == icu::BreakIterator::DONE); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 161 | } |
| 162 | |
| 163 | } // namespace |
| 164 | |
| 165 | // static |
| 166 | void Snippet::ExtractMatchPositions(const std::string& offsets_str, |
| 167 | const std::string& column_num, |
| 168 | MatchPositions* match_positions) { |
| 169 | DCHECK(match_positions); |
| 170 | if (offsets_str.empty()) |
| 171 | return; |
brettw | 8be197d1 | 2015-07-23 23:23:31 | [diff] [blame] | 172 | std::vector<std::string> offsets = base::SplitString( |
| 173 | offsets_str, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 174 | // SQLite offsets are sets of four integers: |
| 175 | // column, query term, match offset, match length |
| 176 | // Matches within a string are marked by (start, end) pairs. |
| 177 | for (size_t i = 0; i < offsets.size() - 3; i += 4) { |
| 178 | if (offsets[i] != column_num) |
| 179 | continue; |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 180 | const size_t start = atoi(offsets[i + 2].c_str()); |
| 181 | const size_t end = start + atoi(offsets[i + 3].c_str()); |
[email protected] | 135b165 | 2009-08-11 21:43:11 | [diff] [blame] | 182 | // Switch to DCHECK after debugging http://crbug.com/15261. |
| 183 | CHECK(end >= start); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 184 | AddMatch(start, end, match_positions); |
| 185 | } |
| 186 | } |
| 187 | |
| 188 | // static |
| 189 | void Snippet::ConvertMatchPositionsToWide( |
| 190 | const std::string& utf8_string, |
| 191 | Snippet::MatchPositions* match_positions) { |
| 192 | DCHECK(match_positions); |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 193 | int32_t utf8_pos = 0; |
[email protected] | e5366896 | 2010-06-23 15:35:25 | [diff] [blame] | 194 | size_t utf16_pos = 0; |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 195 | const char* utf8_cstring = utf8_string.c_str(); |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 196 | const int32_t utf8_length = static_cast<int32_t>(utf8_string.size()); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 197 | for (Snippet::MatchPositions::iterator i = match_positions->begin(); |
| 198 | i != match_positions->end(); ++i) { |
[email protected] | e5366896 | 2010-06-23 15:35:25 | [diff] [blame] | 199 | i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length, |
[email protected] | acf9f27 | 2014-04-15 23:04:00 | [diff] [blame] | 200 | static_cast<int32_t>(i->first), |
| 201 | &utf8_pos, &utf16_pos); |
[email protected] | e5366896 | 2010-06-23 15:35:25 | [diff] [blame] | 202 | i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length, |
[email protected] | acf9f27 | 2014-04-15 23:04:00 | [diff] [blame] | 203 | static_cast<int32_t>(i->second), |
| 204 | &utf8_pos, &utf16_pos); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 205 | } |
| 206 | } |
| 207 | |
[email protected] | 20f0487a | 2010-09-30 20:06:30 | [diff] [blame] | 208 | Snippet::Snippet() { |
| 209 | } |
| 210 | |
vmpstr | b6449d51 | 2016-02-25 23:55:40 | [diff] [blame] | 211 | Snippet::Snippet(const Snippet& other) = default; |
| 212 | |
brettw | 57694b36 | 2017-03-31 17:23:40 | [diff] [blame] | 213 | // TODO(bug 706963) this should be implemented as "= default" when Android |
| 214 | // toolchain is updated. |
| 215 | Snippet::Snippet(Snippet&& other) noexcept |
| 216 | : text_(std::move(other.text_)), matches_(std::move(other.matches_)) {} |
| 217 | |
[email protected] | 20f0487a | 2010-09-30 20:06:30 | [diff] [blame] | 218 | Snippet::~Snippet() { |
| 219 | } |
| 220 | |
brettw | 57694b36 | 2017-03-31 17:23:40 | [diff] [blame] | 221 | Snippet& Snippet::operator=(const Snippet&) = default; |
| 222 | |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 223 | void Snippet::ComputeSnippet(const MatchPositions& match_positions, |
| 224 | const std::string& document) { |
| 225 | // The length of snippets we try to produce. |
| 226 | // We can generate longer snippets but stop once we cross kSnippetMaxLength. |
| 227 | const size_t kSnippetMaxLength = 200; |
[email protected] | 0433872 | 2013-12-24 23:18:05 | [diff] [blame] | 228 | const base::string16 kEllipsis = base::ASCIIToUTF16(" ... "); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 229 | |
Ivan Kotenkov | 75b1c3a | 2017-10-24 14:47:24 | [diff] [blame] | 230 | UText* document_utext = nullptr; |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 231 | UErrorCode status = U_ZERO_ERROR; |
| 232 | document_utext = utext_openUTF8(document_utext, document.data(), |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 233 | document.size(), &status); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 234 | // Locale does not matter because there's no per-locale customization |
| 235 | // for character iterator. |
dcheng | 82beb4f | 2016-04-26 00:35:02 | [diff] [blame] | 236 | std::unique_ptr<icu::BreakIterator> bi( |
| 237 | icu::BreakIterator::createCharacterInstance(icu::Locale::getDefault(), |
| 238 | status)); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 239 | bi->setText(document_utext, status); |
| 240 | DCHECK(U_SUCCESS(status)); |
| 241 | |
| 242 | // We build the snippet by iterating through the matches and then grabbing |
| 243 | // context around each match. If matches are near enough each other (within |
| 244 | // kSnippetContext), we skip the "..." between them. |
[email protected] | 439f1e3 | 2013-12-09 20:09:09 | [diff] [blame] | 245 | base::string16 snippet; |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 246 | size_t start = 0; |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 247 | for (size_t i = 0; i < match_positions.size(); ++i) { |
| 248 | // Some shorter names for the current match. |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 249 | const size_t match_start = match_positions[i].first; |
| 250 | const size_t match_end = match_positions[i].second; |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 251 | |
[email protected] | 135b165 | 2009-08-11 21:43:11 | [diff] [blame] | 252 | // Switch to DCHECK after debugging http://crbug.com/15261. |
| 253 | CHECK(match_end > match_start); |
| 254 | CHECK(match_end <= document.size()); |
| 255 | |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 256 | // Add the context, if any, to show before the match. |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 257 | size_t context_start = match_start; |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 258 | MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start); |
| 259 | start = std::max(start, context_start); |
| 260 | if (start < match_start) { |
| 261 | if (start > 0) |
| 262 | snippet += kEllipsis; |
[email protected] | 135b165 | 2009-08-11 21:43:11 | [diff] [blame] | 263 | // Switch to DCHECK after debugging http://crbug.com/15261. |
| 264 | CHECK(start < document.size()); |
[email protected] | 0433872 | 2013-12-24 23:18:05 | [diff] [blame] | 265 | snippet += base::UTF8ToUTF16(document.substr(start, match_start - start)); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 266 | } |
| 267 | |
| 268 | // Add the match. |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 269 | const size_t first = snippet.size(); |
[email protected] | 0433872 | 2013-12-24 23:18:05 | [diff] [blame] | 270 | snippet += base::UTF8ToUTF16(document.substr(match_start, |
| 271 | match_end - match_start)); |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 272 | matches_.push_back(std::make_pair(first, snippet.size())); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 273 | |
| 274 | // Compute the context, if any, to show after the match. |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 275 | size_t end; |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 276 | // Check if the next match falls within our snippet window. |
| 277 | if (i + 1 < match_positions.size() && |
| 278 | IsNextMatchWithinSnippetWindow(bi.get(), match_end, |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 279 | match_positions[i + 1].first)) { |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 280 | // Yes, it's within the window. Make the end context extend just up |
| 281 | // to the next match. |
| 282 | end = match_positions[i + 1].first; |
[email protected] | 135b165 | 2009-08-11 21:43:11 | [diff] [blame] | 283 | // Switch to DCHECK after debugging http://crbug.com/15261. |
| 284 | CHECK(end >= match_end); |
| 285 | CHECK(end <= document.size()); |
[email protected] | 0433872 | 2013-12-24 23:18:05 | [diff] [blame] | 286 | snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end)); |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 287 | } else { |
| 288 | // No, there's either no next match or the next match is too far away. |
| 289 | end = match_end; |
| 290 | MoveByNGraphemes(bi.get(), kSnippetContext, &end); |
[email protected] | 135b165 | 2009-08-11 21:43:11 | [diff] [blame] | 291 | // Switch to DCHECK after debugging http://crbug.com/15261. |
| 292 | CHECK(end >= match_end); |
| 293 | CHECK(end <= document.size()); |
[email protected] | 0433872 | 2013-12-24 23:18:05 | [diff] [blame] | 294 | snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end)); |
[email protected] | c29962f2 | 2008-12-03 00:47:58 | [diff] [blame] | 295 | if (end < document.size()) |
initial.commit | 09911bf | 2008-07-26 23:55:29 | [diff] [blame] | 296 | snippet += kEllipsis; |
| 297 | } |
| 298 | start = end; |
| 299 | |
| 300 | // Stop here if we have enough snippet computed. |
| 301 | if (snippet.size() >= kSnippetMaxLength) |
| 302 | break; |
| 303 | } |
| 304 | |
| 305 | utext_close(document_utext); |
| 306 | swap(text_, snippet); |
| 307 | } |
[email protected] | 20f0487a | 2010-09-30 20:06:30 | [diff] [blame] | 308 | |
| 309 | void Snippet::Swap(Snippet* other) { |
| 310 | text_.swap(other->text_); |
| 311 | matches_.swap(other->matches_); |
| 312 | } |
[email protected] | acf9f27 | 2014-04-15 23:04:00 | [diff] [blame] | 313 | |
| 314 | } // namespace query_parser |