blob: 00d05ce64a6accb42a6c6eabcddd59b4619e894b [file] [log] [blame]
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <limits>
#include <set>
#include "chrome/browser/history/text_database.h"
#include "base/file_util.h"
#include "base/logging.h"
#include "base/string_util.h"
#include "chrome/common/sqlite_utils.h"
// There are two tables in each database: a full-text search (FTS) table, which
// indexes the URL, title and contents of the pages, and a regular SQLite
// table, which contains non-indexed information about the page. All columns of
// an FTS table are indexed using the text search algorithm, which isn't what
// we want for things like times. If the time were in the FTS table, there
// would be different words in the index for each time number.
//
// "pages" FTS table:
// url URL of the page so searches will match the URL.
// title Title of the page.
// body Body of the page.
//
// "info" regular table:
// time Time the corresponding FTS entry was visited.
//
// We do joins across these two tables by using their internal rowids, which we
// keep in sync between the two tables. The internal rowid is the only part of
// an FTS table that is indexed like a normal table, and the index over it is
// free since sqlite always indexes the internal rowid.
using base::Time;
namespace history {
namespace {
// Version numbers handed to the meta table by Init(). Init() refuses to use a
// file whose stored compatible version is greater than kCurrentVersionNumber.
const int kCurrentVersionNumber = 1;
const int kCompatibleVersionNumber = 1;

// The snippet code identifies columns by their position in the "pages" CREATE
// statement (url = 0, title = 1, body = 2). These are those 0-based positions,
// spelled as strings, for the two columns we extract match offsets from.
const char kTitleColumnIndex[] = "1";
const char kBodyColumnIndex[] = "2";

// Text placed in front of the database identifier to form the file name, and
// its length in characters.
const wchar_t kFilePrefix[] = L"History Index ";
const size_t kFilePrefixLen = arraysize(kFilePrefix) - 1;  // Don't count NULL.
// We do not allow rollback, but this simple scoper makes it easy to always
// remember to commit a begun transaction. This protects against some errors
// caused by a crash in the middle of a transaction, although doesn't give us
// the full protection of a transaction's rollback abilities.
//
// Calls BeginTransaction() on the database when constructed and
// CommitTransaction() when destroyed, so every exit path of the enclosing
// scope commits. The database is not owned and must outlive this object.
class ScopedTransactionCommitter {
 public:
  // explicit so that a TextDatabase* can never be silently converted into a
  // committer (which would begin and immediately commit a transaction).
  explicit ScopedTransactionCommitter(TextDatabase* db) : db_(db) {
    db_->BeginTransaction();
  }

  ~ScopedTransactionCommitter() {
    db_->CommitTransaction();
  }

 private:
  // Copying is disallowed: a copy would issue a second commit for a single
  // begin and unbalance the nesting count. Declared but intentionally left
  // undefined.
  ScopedTransactionCommitter(const ScopedTransactionCommitter&);
  void operator=(const ScopedTransactionCommitter&);

  TextDatabase* db_;  // Not owned.
};
} // namespace
// Records the configuration and works out the full name of the index file
// from |path| and |id|. The database itself is only opened by Init().
TextDatabase::TextDatabase(const std::wstring& path,
                           DBIdent id,
                           bool allow_create)
    : db_(NULL),
      statement_cache_(NULL),
      path_(path),
      ident_(id),
      allow_create_(allow_create),
      transaction_nesting_(0) {
  // The index file lives under |path_| and is named after the identifier.
  const std::wstring leaf_name = IDToFileName(ident_);
  file_name_ = path_;
  file_util::AppendToPath(&file_name_, leaf_name);
}
// Releases the statement cache and then closes the connection, if they were
// ever created.
TextDatabase::~TextDatabase() {
  // The cached statements must be released before the DB they belong to is
  // closed. Deleting a NULL pointer is a no-op, so no guard is needed.
  delete statement_cache_;
  statement_cache_ = NULL;

  if (db_ != NULL) {
    sqlite3_close(db_);
    db_ = NULL;
  }
}
// static
// Returns the prefix that IDToFileName() puts in front of the identifier.
const wchar_t* TextDatabase::file_base() {
  const wchar_t* const prefix = kFilePrefix;
  return prefix;
}
// static
// Builds the file name for the given identifier: the file prefix followed by
// the identifier split as "<id / 100>-<id % 100, two digits>".
std::wstring TextDatabase::IDToFileName(DBIdent id) {
  // Identifiers are intended to be a combination of the year and month, for
  // example, 200801 for January 2008, which becomes "History Index 2008-01".
  // Nothing here depends on that scheme, though: callers may assign IDs however
  // they like, knowing that they will appear on disk in this form.
  const DBIdent leading_part = id / 100;
  const DBIdent trailing_part = id % 100;
  return StringPrintf(L"%ls%d-%02d", file_base(), leading_part, trailing_part);
}
// static
// Recovers the identifier from a file path produced by IDToFileName(). Returns
// 0 when the file name does not end in something shaped like "xxxx-xx".
TextDatabase::DBIdent TextDatabase::FileNameToID(const std::wstring& file_path){
  static const size_t kIDStringLength = 7;  // Room for "xxxx-xx".

  // The prefix is deliberately not checked. The file system may be case
  // insensitive in ways we can't predict (NTFS), so comparing it could reject
  // valid files. Only the trailing characters are examined.
  const std::wstring file_name = file_util::GetFilenameFromPath(file_path);
  const size_t name_length = file_name.length();
  if (name_length < kIDStringLength)
    return 0;

  const std::wstring suffix(
      file_name.c_str() + (name_length - kIDStringLength));
  const bool looks_like_id =
      suffix.length() == kIDStringLength && suffix[4] == L'-';
  if (!looks_like_id)
    return 0;

  const int year = StringToInt(suffix.substr(0, 4));
  const int month = StringToInt(suffix.substr(5, 2));
  return year * 100 + month;
}
// Opens the index file (creating it only if that was allowed), applies the
// connection settings, checks the stored version and makes sure the tables
// exist. Returns false as soon as a required step fails.
bool TextDatabase::Init() {
  // When creation is not allowed the file has to be there already.
  if (!allow_create_ && !file_util::PathExists(file_name_))
    return false;

  // Attach the database to our index file.
  const std::string utf8_file_name = WideToUTF8(file_name_);
  if (sqlite3_open(utf8_file_name.c_str(), &db_) != SQLITE_OK)
    return false;
  statement_cache_ = new SqliteStatementCache(db_);

  // Connection settings, applied in order. Their results are not checked.
  static const char* const kSetupPragmas[] = {
    // A somewhat larger page size performs better for us (we're typically
    // seek rather than bandwidth limited). This only has an effect before any
    // tables have been created, otherwise it is a NOP. Must be a power of 2
    // and a max of 8192.
    "PRAGMA page_size=4096",
    // The default cache size is 2000 which gives >8MB of data. Since we will
    // often have 2-3 of these objects, each with their own 8MB, this adds up
    // very fast, so shrink it to keep several objects from getting too big.
    "PRAGMA cache_size=512",
    // Nobody else should be accessing the database while we're running, and
    // exclusive mode gives somewhat improved perf.
    "PRAGMA locking_mode=EXCLUSIVE",
  };
  const size_t pragma_count = sizeof(kSetupPragmas) / sizeof(kSetupPragmas[0]);
  for (size_t i = 0; i < pragma_count; ++i)
    sqlite3_exec(db_, kSetupPragmas[i], NULL, NULL, NULL);

  // Meta table tracking version information.
  const bool meta_ok = meta_table_.Init(std::string(), kCurrentVersionNumber,
                                        kCompatibleVersionNumber, db_);
  if (!meta_ok)
    return false;

  const bool too_new =
      meta_table_.GetCompatibleVersionNumber() > kCurrentVersionNumber;
  if (too_new) {
    // We don't bother notifying the user on this error, and just fail to use
    // the file. Normally if they have version skew, they will get it for the
    // main history file and it won't be necessary here. If that's not the
    // case, since this is only indexed data, it's probably better to just not
    // give FTS results than strange errors when everything else is working OK.
    LOG(WARNING) << "Text database is too new.";
    return false;
  }

  return CreateTables();
}
// Enters one level of transaction nesting. Only the outermost level actually
// issues BEGIN TRANSACTION to the database.
void TextDatabase::BeginTransaction() {
  const bool is_outermost = (transaction_nesting_ == 0);
  if (is_outermost)
    sqlite3_exec(db_, "BEGIN TRANSACTION", NULL, NULL, NULL);
  ++transaction_nesting_;
}
// Leaves one level of transaction nesting. COMMIT is issued to the database
// only when the outermost level is left. Must be balanced with a preceding
// BeginTransaction().
void TextDatabase::CommitTransaction() {
  DCHECK(transaction_nesting_);
  --transaction_nesting_;
  if (transaction_nesting_ == 0)
    sqlite3_exec(db_, "COMMIT", NULL, NULL, NULL);
}
// Creates the "pages" and "info" tables if they are missing, plus the index on
// info.time. Returns false if either table could not be created.
bool TextDatabase::CreateTables() {
  // FTS table holding the searchable url, title and body of every page.
  const bool has_pages_table = DoesSqliteTableExist(db_, "pages");
  if (!has_pages_table) {
    const int create_result = sqlite3_exec(db_,
        "CREATE VIRTUAL TABLE pages USING fts2("
            "TOKENIZE icu,"
            "url LONGVARCHAR,"
            "title LONGVARCHAR,"
            "body LONGVARCHAR)", NULL, NULL, NULL);
    if (create_result != SQLITE_OK)
      return false;
  }

  // Regular (non-FTS) table holding the visit time of each page. All FTS
  // columns are treated as full-text-search data, which is not what we want
  // for times, so the time lives here and is matched to "pages" by rowid.
  const bool has_info_table = DoesSqliteTableExist(db_, "info");
  if (!has_info_table) {
    const int create_result = sqlite3_exec(
        db_, "CREATE TABLE info(time INTEGER NOT NULL)", NULL, NULL, NULL);
    if (create_result != SQLITE_OK)
      return false;
  }

  // Index over time, used by DeletePageData(), which selects on info.time.
  // Text searches get nothing from it: they must always query the entire FTS
  // table first and join the info table off of that. Creating the index fails
  // when it already exists, so the result is ignored.
  sqlite3_exec(db_, "CREATE INDEX info_time ON info(time)", NULL, NULL, NULL);
  return true;
}
// Indexes one page visit: the text goes into the "pages" FTS table and the
// visit time into the "info" table under the same rowid. Returns true only if
// both rows were inserted.
bool TextDatabase::AddPageData(Time time,
                               const std::string& url,
                               const std::string& title,
                               const std::string& contents) {
  // Everything below runs inside one transaction, which is committed when
  // this object goes out of scope (including on the early returns).
  ScopedTransactionCommitter auto_commit(this);

  // Step 1: the searchable text.
  SQLITE_UNIQUE_STATEMENT(insert_page, *statement_cache_,
      "INSERT INTO pages (url, title, body) VALUES (?,?,?)");
  if (!insert_page.is_valid())
    return false;
  insert_page->bind_string(0, url);
  insert_page->bind_string(1, title);
  insert_page->bind_string(2, contents);
  const bool page_inserted = (insert_page->step() == SQLITE_DONE);
  if (!page_inserted) {
    NOTREACHED() << sqlite3_errmsg(db_);
    return false;
  }

  // Step 2: the visit time, stored under the rowid the FTS insert was given so
  // the two tables can be joined on rowid.
  const int64 page_rowid = sqlite3_last_insert_rowid(db_);
  SQLITE_UNIQUE_STATEMENT(insert_info, *statement_cache_,
      "INSERT INTO info (rowid, time) VALUES (?,?)");
  if (!insert_info.is_valid())
    return false;
  insert_info->bind_int64(0, page_rowid);
  insert_info->bind_int64(1, time.ToInternalValue());
  const bool info_inserted = (insert_info->step() == SQLITE_DONE);
  if (!info_inserted) {
    NOTREACHED() << sqlite3_errmsg(db_);
    return false;
  }

  return true;
}
// Removes every indexed entry whose visit time and URL both equal the given
// values, from the "pages" table and from the "info" table. Does nothing if
// any of the statements involved cannot be compiled.
void TextDatabase::DeletePageData(Time time, const std::string& url) {
  // First get all rows that match. Selecting on time (which has an index)
  // allows us to avoid brute-force searches on the full-text-index table
  // (there will generally be only one match per time).
  SQLITE_UNIQUE_STATEMENT(select_ids, *statement_cache_,
      "SELECT info.rowid "
      "FROM info JOIN pages ON info.rowid = pages.rowid "
      "WHERE info.time=? AND pages.url=?");
  if (!select_ids.is_valid())
    return;
  select_ids->bind_int64(0, time.ToInternalValue());
  select_ids->bind_string(1, url);

  std::set<int64> rows_to_delete;
  while (select_ids->step() == SQLITE_ROW)
    rows_to_delete.insert(select_ids->column_int64(0));
  if (rows_to_delete.empty())
    return;  // Nothing matched, so there is nothing to remove.

  // Compile both delete statements before removing anything. The two tables
  // are joined on rowid, so a row must never be removed from one table unless
  // it can also be removed from the other.
  SQLITE_UNIQUE_STATEMENT(delete_page, *statement_cache_,
      "DELETE FROM pages WHERE rowid=?");
  SQLITE_UNIQUE_STATEMENT(delete_info, *statement_cache_,
      "DELETE FROM info WHERE rowid=?");
  if (!delete_page.is_valid() || !delete_info.is_valid())
    return;

  // Remove each matching row from both tables, one rowid at a time.
  for (std::set<int64>::const_iterator i = rows_to_delete.begin();
       i != rows_to_delete.end(); ++i) {
    delete_page->bind_int64(0, *i);
    delete_page->step();
    delete_page->reset();

    delete_info->bind_int64(0, *i);
    delete_info->step();
    delete_info->reset();
  }
}
void TextDatabase::Optimize() {
SQLITE_UNIQUE_STATEMENT(statement, *statement_cache_,
"SELECT OPTIMIZE(pages) FROM pages LIMIT 1");
if (!statement.is_valid())
return;
statement->step();
}
void TextDatabase::GetTextMatches(const std::string& query,
const QueryOptions& options,
std::vector<Match>* results,
URLSet* found_urls,
Time* first_time_searched) {
*first_time_searched = options.begin_time;
SQLITE_UNIQUE_STATEMENT(statement, *statement_cache_,
"SELECT url, title, time, offsets(pages), body "
"FROM pages LEFT OUTER JOIN info ON pages.rowid = info.rowid "
"WHERE pages MATCH ? AND time >= ? AND time < ? "
"ORDER BY time DESC "
"LIMIT ?");
if (!statement.is_valid())
return;
// When their values indicate "unspecified", saturate the numbers to the max
// or min to get the correct result.
int64 effective_begin_time = options.begin_time.is_null() ?
0 : options.begin_time.ToInternalValue();
int64 effective_end_time = options.end_time.is_null() ?
std::numeric_limits<int64>::max() : options.end_time.ToInternalValue();
int effective_max_count = options.max_count ?
options.max_count : std::numeric_limits<int>::max();
statement->bind_string(0, query);
statement->bind_int64(1, effective_begin_time);
statement->bind_int64(2, effective_end_time);
statement->bind_int(3, effective_max_count);
while (statement->step() == SQLITE_ROW) {
// TODO(brettw) allow canceling the query in the middle.
// if (canceled_or_something)
// break;
GURL url(statement->column_string(0));
if (options.most_recent_visit_only) {
URLSet::const_iterator found_url = found_urls->find(url);
if (found_url != found_urls->end())
continue; // Don't add this duplicate when unique URLs are requested.
}
// Fill the results into the vector (avoid copying the URL with Swap()).
results->resize(results->size() + 1);
Match& match = results->at(results->size() - 1);
match.url.Swap(&url);
match.title = UTF8ToWide(statement->column_string(1));
match.time = Time::FromInternalValue(statement->column_int64(2));
// Extract any matches in the title.
std::string offsets_str = statement->column_string(3);
Snippet::ExtractMatchPositions(offsets_str, kTitleColumnIndex,
&match.title_match_positions);
Snippet::ConvertMatchPositionsToWide(statement->column_string(1),
&match.title_match_positions);
// Extract the matches in the body.
Snippet::MatchPositions match_positions;
Snippet::ExtractMatchPositions(offsets_str, kBodyColumnIndex,
&match_positions);
// Compute the snippet based on those matches.
std::string body = statement->column_string(4);
match.snippet.ComputeSnippet(match_positions, body);
}
// When we have returned all the results possible (or determined that there
// are none), then we have searched all the time requested, so we can
// set the first_time_searched to that value.
if (results->size() == 0 ||
options.max_count == 0 || // Special case for wanting all the results.
static_cast<int>(results->size()) < options.max_count) {
*first_time_searched = options.begin_time;
} else {
// Since we got the results in order, we know the last item is the last
// time we considered.
*first_time_searched = results->back().time;
}
statement->reset();
}
} // namespace history