Blame - chrome/browser/history/snippet.cc - chromium/src.git

blob: d72209e5a41fc5f917df61972405caced12eddf2 [file] [log] [blame]

license.bot	bf09a50	2008-08-24 00:55:55	[diff] [blame^]	1	// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
				2	// Use of this source code is governed by a BSD-style license that can be
				3	// found in the LICENSE file.
initial.commit	09911bf	2008-07-26 23:55:29	[diff] [blame]	4
				5	#include "chrome/browser/history/snippet.h"
				6
				7	#include <algorithm>
				8
				9	#include "base/logging.h"
				10	#include "base/scoped_ptr.h"
				11	#include "base/string_util.h"
				12	#include "unicode/brkiter.h"
				13	#include "unicode/utext.h"
				14	#include "unicode/utf8.h"
				15
				16	namespace {
				17
				// Ordering predicate for (start, end) pairs: compares the |first| members
				// only and ignores |second|.
				bool PairFirstLessThan(const std::pair<int, int>& a,
				                       const std::pair<int, int>& b) {
				  return b.first > a.first;
				}
				22
				23	// Combines all pairs after offset in match_positions that are contained
				24	// or touch the pair at offset.
				25	void CoalescePositionsFrom(size_t offset,
				26	Snippet::MatchPositions* match_positions) {
				27	DCHECK(offset < match_positions->size());
				28	std::pair<int,int>& pair((*match_positions)[offset]);
				29	++offset;
				30	while (offset < match_positions->size() &&
				31	pair.second >= (*match_positions)[offset].first) {
				32	pair.second = std::max(pair.second, (*match_positions)[offset].second);
				33	match_positions->erase(match_positions->begin() + offset);
				34	}
				35	}
				36
				37	// Makes sure there is a pair in match_positions that contains the specified
				38	// range. This keeps the pairs ordered in match_positions by first, and makes
				39	// sure none of the pairs in match_positions touch each other.
				40	void AddMatch(int start, int end, Snippet::MatchPositions* match_positions) {
				41	DCHECK(start < end && match_positions);
				42	std::pair<int,int> pair(start, end);
				43	if (match_positions->empty()) {
				44	match_positions->push_back(pair);
				45	return;
				46	}
				47	// There's at least one match. Find the position of the new match,
				48	// potentially extending pairs around it.
				49	Snippet::MatchPositions::iterator i =
				50	std::lower_bound(match_positions->begin(), match_positions->end(),
				51	pair, &PairFirstLessThan);
				52	if (i != match_positions->end() && i->first == start) {
				53	// Match not at the end and there is already a pair with the same
				54	// start.
				55	if (end > i->second) {
				56	// New pair extends beyond existing pair. Extend existing pair and
				57	// coalesce matches after it.
				58	i->second = end;
				59	CoalescePositionsFrom(i - match_positions->begin(), match_positions);
				60	} // else case, new pair completely contained in existing pair, nothing
				61	// to do.
				62	} else if (i == match_positions->begin()) {
				63	// Match at the beginning and the first pair doesn't have the same
				64	// start. Insert new pair and coalesce matches after it.
				65	match_positions->insert(i, pair);
				66	CoalescePositionsFrom(0, match_positions);
				67	} else {
				68	// Not at the beginning (but may be at the end).
				69	--i;
				70	if (start <= i->second && end > i->second) {
				71	// Previous element contains match. Extend it and coalesce.
				72	i->second = end;
				73	CoalescePositionsFrom(i - match_positions->begin(), match_positions);
				74	} else if (end > i->second) {
				75	// Region doesn't touch previous element. See if region touches current
				76	// element.
				77	++i;
				78	if (i == match_positions->end() \|\| end < i->first) {
				79	match_positions->insert(i, pair);
				80	} else {
				81	i->first = start;
				82	i->second = end;
				83	CoalescePositionsFrom(i - match_positions->begin(), match_positions);
				84	}
				85	}
				86	}
				87	}
				88
				89	// Converts an index in a utf8 string into the index in the corresponding wide
				90	// string and returns the wide index. This is intended to be called in a loop
				91	// iterating through a utf8 string.
				92	//
				93	// utf8_string: the utf8 string.
				94	// utf8_length: length of the utf8 string.
				95	// offset: the utf8 offset to convert.
				96	// utf8_pos: current offset in the utf8 string. This is modified and on return
				97	// matches offset.
				98	// wide_pos: current index in the wide string. This is the same as the return
				99	// value.
				100	int AdvanceAndReturnWidePos(const char* utf8_string,
				101	int utf8_length,
				102	int offset,
				103	int* utf8_pos,
				104	int* wide_pos) {
				105	DCHECK(offset >= *utf8_pos && offset <= utf8_length);
				106
				107	UChar32 wide_char;
				108	while (*utf8_pos < offset) {
				109	U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
				110	*wide_pos += (wide_char <= 0xFFFF) ? 1 : 2;
				111	}
				112	return *wide_pos;
				113	}
				114
				115	// Given a character break iterator over a UTF-8 string, set the iterator
				116	// position to \|*utf8_pos\| and move by \|count\| characters. \|count\| can
				117	// be either positive or negative.
				118	void MoveByNGraphemes(BreakIterator* bi, int count, int* utf8_pos) {
				119	// Ignore the return value. A side effect of the current position
				120	// being set at or following \|*utf8_pos\| is exploited here.
				121	// It's simpler than calling following(n) and then previous().
				122	// isBoundary() is not very fast, but should be good enough for the
				123	// snippet generation. If not, revisit the way we scan in ComputeSnippet.
				124	bi->isBoundary(*utf8_pos);
				125	bi->next(count);
				126	*utf8_pos = static_cast<int>(bi->current());
				127	}
				128
				129	// The amount of context to include for a given hit. Note that it's counted
				130	// in terms of graphemes rather than bytes.
				131	const int kSnippetContext = 50;
				132
				133	// Returns true if next match falls within a snippet window
				134	// from the previous match. The window size is counted in terms
				135	// of graphemes rather than bytes in UTF-8.
				136	bool IsNextMatchWithinSnippetWindow(BreakIterator* bi,
				137	int previous_match_end,
				138	int next_match_start) {
				139	// If it's within a window in terms of bytes, it's certain
				140	// that it's within a window in terms of graphemes as well.
				141	if (next_match_start < previous_match_end + kSnippetContext)
				142	return true;
				143	bi->isBoundary(previous_match_end);
				144	// An alternative to this is to call \|bi->next()\| at most
				145	// kSnippetContext times, compare \|bi->current()\| with \|next_match_start\|
				146	// after each call and return early if possible. There are other
				147	// heuristics to speed things up if necessary, but it's not likely that
				148	// we need to bother.
				149	bi->next(kSnippetContext);
				150	int64_t current = bi->current();
				151	return (next_match_start < current \|\| current == BreakIterator::DONE);
				152	}
				153
				154	} // namespace
				155
				156	// static
				157	void Snippet::ExtractMatchPositions(const std::string& offsets_str,
				158	const std::string& column_num,
				159	MatchPositions* match_positions) {
				160	DCHECK(match_positions);
				161	if (offsets_str.empty())
				162	return;
				163	std::vector<std::string> offsets;
				164	SplitString(offsets_str, ' ', &offsets);
				165	// SQLite offsets are sets of four integers:
				166	// column, query term, match offset, match length
				167	// Matches within a string are marked by (start, end) pairs.
				168	for (size_t i = 0; i < offsets.size() - 3; i += 4) {
				169	if (offsets[i] != column_num)
				170	continue;
				171	const int start = atoi(offsets[i+2].c_str());
				172	const int end = start + atoi(offsets[i+3].c_str());
				173	AddMatch(start, end, match_positions);
				174	}
				175	}
				176
				177	// static
				178	void Snippet::ConvertMatchPositionsToWide(
				179	const std::string& utf8_string,
				180	Snippet::MatchPositions* match_positions) {
				181	DCHECK(match_positions);
				182	int utf8_pos = 0;
				183	int wide_pos = 0;
				184	const char* utf8_cstring = utf8_string.c_str();
				185	const int utf8_length = static_cast<int>(utf8_string.size());
				186	for (Snippet::MatchPositions::iterator i = match_positions->begin();
				187	i != match_positions->end(); ++i) {
				188	i->first = AdvanceAndReturnWidePos(utf8_cstring, utf8_length,
				189	i->first, &utf8_pos, &wide_pos);
				190	i->second =
				191	AdvanceAndReturnWidePos(utf8_cstring, utf8_length, i->second, &utf8_pos,
				192	&wide_pos);
				193	}
				194	}
				195
				196	void Snippet::ComputeSnippet(const MatchPositions& match_positions,
				197	const std::string& document) {
				198	// The length of snippets we try to produce.
				199	// We can generate longer snippets but stop once we cross kSnippetMaxLength.
				200	const size_t kSnippetMaxLength = 200;
				201
				202
				203	const std::wstring kEllipsis = L" ... ";
				204
				205	// Grab the size as an int to cut down on casts later.
				206	const int document_size = static_cast<int>(document.size());
				207
				208	UText* document_utext = NULL;
				209	UErrorCode status = U_ZERO_ERROR;
				210	document_utext = utext_openUTF8(document_utext, document.data(),
				211	document_size, &status);
				212	// Locale does not matter because there's no per-locale customization
				213	// for character iterator.
				214	scoped_ptr<BreakIterator> bi(
				215	BreakIterator::createCharacterInstance(Locale::getDefault(), status));
				216	bi->setText(document_utext, status);
				217	DCHECK(U_SUCCESS(status));
				218
				219	// We build the snippet by iterating through the matches and then grabbing
				220	// context around each match. If matches are near enough each other (within
				221	// kSnippetContext), we skip the "..." between them.
				222	std::wstring snippet;
				223	int start = 0;
				224	for (size_t i = 0; i < match_positions.size(); ++i) {
				225	// Some shorter names for the current match.
				226	const int match_start = match_positions[i].first;
				227	const int match_end = match_positions[i].second;
				228
				229	// Add the context, if any, to show before the match.
				230	int context_start = match_start;
				231	MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
				232	start = std::max(start, context_start);
				233	if (start < match_start) {
				234	if (start > 0)
				235	snippet += kEllipsis;
				236	snippet += UTF8ToWide(document.substr(start, match_start - start));
				237	}
				238
				239	// Add the match.
				240	matches_.push_back(std::make_pair(static_cast<int>(snippet.size()), 0));
				241	snippet += UTF8ToWide(document.substr(match_start,
				242	match_end - match_start));
				243	matches_.back().second = static_cast<int>(snippet.size());
				244
				245	// Compute the context, if any, to show after the match.
				246	int end;
				247	// Check if the next match falls within our snippet window.
				248	if (i + 1 < match_positions.size() &&
				249	IsNextMatchWithinSnippetWindow(bi.get(), match_end,
				250	match_positions[i + 1].first)) {
				251	// Yes, it's within the window. Make the end context extend just up
				252	// to the next match.
				253	end = match_positions[i + 1].first;
				254	snippet += UTF8ToWide(document.substr(match_end, end - match_end));
				255	} else {
				256	// No, there's either no next match or the next match is too far away.
				257	end = match_end;
				258	MoveByNGraphemes(bi.get(), kSnippetContext, &end);
				259	snippet += UTF8ToWide(document.substr(match_end, end - match_end));
				260	if (end < document_size)
				261	snippet += kEllipsis;
				262	}
				263	start = end;
				264
				265	// Stop here if we have enough snippet computed.
				266	if (snippet.size() >= kSnippetMaxLength)
				267	break;
				268	}
				269
				270	utext_close(document_utext);
				271	swap(text_, snippet);
				272	}
license.bot	bf09a50	2008-08-24 00:55:55	[diff] [blame^]	273