blob: 3a7cc14c4466911b66c46082193723980f640e1d [file] [log] [blame]
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#include "chrome/browser/devtools/devtools_file_system_indexer.h"
6
7#include <iterator>
8
9#include "base/bind.h"
10#include "base/callback.h"
[email protected]200bd332013-08-05 16:19:1111#include "base/files/file_enumerator.h"
thestig18dfb7a52014-08-26 10:44:0412#include "base/files/file_util.h"
[email protected]200bd332013-08-05 16:19:1113#include "base/files/file_util_proxy.h"
14#include "base/lazy_instance.h"
15#include "base/logging.h"
[email protected]09f3fde82014-05-14 15:08:1516#include "base/stl_util.h"
[email protected]200bd332013-08-05 16:19:1117#include "base/strings/utf_string_conversions.h"
18#include "content/public/browser/browser_thread.h"
19
20using base::Bind;
21using base::Callback;
22using base::FileEnumerator;
23using base::FilePath;
[email protected]200bd332013-08-05 16:19:1124using base::Time;
25using base::TimeDelta;
26using base::TimeTicks;
[email protected]200bd332013-08-05 16:19:1127using content::BrowserThread;
28using std::map;
29using std::set;
30using std::string;
31using std::vector;
32
33namespace {
34
// Integer id of a trigram. Ids produced by TrigramAtIndex() from valid
// characters lie in [0, kTrigramCount) and are used to index Index::index_.
typedef int32 Trigram;
// Code of one character inside a trigram, or one of the negative sentinels
// defined below.
typedef char TrigramChar;
// Id assigned to each indexed file by Index::GetFileId().
// NOTE(review): GetFileId() increments this 16-bit id without an overflow
// check, so ids would wrap once more than 65535 paths are indexed - confirm
// that this limit is acceptable.
typedef uint16 FileId;

// Minimum interval, in milliseconds, between two progress notifications sent
// by FileSystemIndexingJob::ReportWorked().
const int kMinTimeoutBetweenWorkedNitification = 200;
// Trigram characters include all ASCII printable characters (32-126) except for
// the capital letters, because the index is case insensitive.
const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1;
// Number of distinct trigrams, i.e. kTrigramCharacterCount cubed.
const size_t kTrigramCount =
    kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount;
// Number of bytes requested per read while a file is being indexed.
const int kMaxReadLength = 10 * 1024;
// Marks a character that cannot be part of a trigram: bytes above 127 and the
// control characters 10-13 (see TrigramCharForChar()).
const TrigramChar kUndefinedTrigramChar = -1;
// Marks the control characters 0-8, 14-31 and 127. OnRead() stops indexing a
// file when it meets one; Search() downgrades it to kUndefinedTrigramChar.
const TrigramChar kBinaryTrigramChar = -2;
// Returned by TrigramAtIndex() when any of the three characters is
// kUndefinedTrigramChar.
const Trigram kUndefinedTrigram = -1;
49
// In-memory trigram index shared by all indexing jobs and searches in this
// file. For every trigram id it stores the ids of the files that trigram was
// recorded for, and for every file the modification time it was indexed at.
// All methods other than the constructor and destructor DCHECK that they run
// on BrowserThread::FILE.
class Index {
 public:
  Index();
  // Returns the time stored by the last SetTrigramsForFile() call for
  // |file_path|, or a default-constructed Time when the file is unknown.
  Time LastModifiedTimeForFile(const FilePath& file_path);
  // Appends the id of |file_path| to the entry of every trigram in |index| and
  // records |time| as the file's indexed modification time.
  void SetTrigramsForFile(const FilePath& file_path,
                          const vector<Trigram>& index,
                          const Time& time);
  // Returns the indexed paths recorded for every trigram of |query|. When
  // |query| yields no trigram at all, every indexed path is returned.
  vector<FilePath> Search(string query);
  // Logs entry counts, capacities and an estimated memory footprint.
  void PrintStats();
  // Sorts every per-trigram vector modified since the previous call and
  // releases its spare capacity.
  void NormalizeVectors();

 private:
  // Private destructor; the only instance visible in this file is the leaky
  // lazy instance g_trigram_index.
  ~Index();

  // Returns the id of |file_path|, assigning the next free id on first use.
  FileId GetFileId(const FilePath& file_path);

  typedef map<FilePath, FileId> FileIdsMap;
  // Path -> id for every file that has been given an id.
  FileIdsMap file_ids_;
  // Most recently assigned file id; 0 means none has been assigned yet.
  FileId last_file_id_;
  // The index in this vector is the trigram id.
  vector<vector<FileId> > index_;
  typedef map<FilePath, Time> IndexedFilesMap;
  // Path -> modification time passed to the last SetTrigramsForFile() call.
  IndexedFilesMap index_times_;
  // is_normalized_[t] is false while index_[t] has been appended to since the
  // last NormalizeVectors() call.
  vector<bool> is_normalized_;

  DISALLOW_COPY_AND_ASSIGN(Index);
};
77
// The single Index used by every indexing job and search in this file. It is
// declared as a leaky lazy instance.
base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER;
79
[email protected]200bd332013-08-05 16:19:1180TrigramChar TrigramCharForChar(char c) {
vsevik3ef1c9d2014-10-23 14:17:3581 static TrigramChar* trigram_chars = nullptr;
82 if (!trigram_chars) {
83 trigram_chars = new TrigramChar[256];
84 for (size_t i = 0; i < 256; ++i) {
85 if (i > 127) {
86 trigram_chars[i] = kUndefinedTrigramChar;
87 continue;
88 }
89 char ch = static_cast<char>(i);
90 if (ch == '\t')
91 ch = ' ';
92 if (ch >= 'A' && ch <= 'Z')
93 ch = ch - 'A' + 'a';
94
95 bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127;
96 if (is_binary_char) {
97 trigram_chars[i] = kBinaryTrigramChar;
98 continue;
99 }
100
101 if (ch < ' ') {
102 trigram_chars[i] = kUndefinedTrigramChar;
103 continue;
104 }
105
106 if (ch >= 'Z')
107 ch = ch - 'Z' - 1 + 'A';
108 ch -= ' ';
109 char signed_trigram_count = static_cast<char>(kTrigramCharacterCount);
110 CHECK(ch >= 0 && ch < signed_trigram_count);
111 trigram_chars[i] = ch;
112 }
113 }
[email protected]200bd332013-08-05 16:19:11114 unsigned char uc = static_cast<unsigned char>(c);
vsevik3ef1c9d2014-10-23 14:17:35115 return trigram_chars[uc];
[email protected]200bd332013-08-05 16:19:11116}
117
[email protected]8f8304732013-08-05 22:25:52118Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) {
[email protected]200bd332013-08-05 16:19:11119 static int kTrigramCharacterCountSquared =
120 kTrigramCharacterCount * kTrigramCharacterCount;
121 if (trigram_chars[index] == kUndefinedTrigramChar ||
122 trigram_chars[index + 1] == kUndefinedTrigramChar ||
123 trigram_chars[index + 2] == kUndefinedTrigramChar)
124 return kUndefinedTrigram;
125 Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] +
126 kTrigramCharacterCount * trigram_chars[index + 1] +
127 trigram_chars[index + 2];
128 return trigram;
129}
130
131Index::Index() : last_file_id_(0) {
132 index_.resize(kTrigramCount);
133 is_normalized_.resize(kTrigramCount);
134 std::fill(is_normalized_.begin(), is_normalized_.end(), true);
135}
136
// Nothing to release explicitly; the members clean up after themselves.
Index::~Index() {}
138
139Time Index::LastModifiedTimeForFile(const FilePath& file_path) {
mostynb13260d52015-03-26 09:12:09140 DCHECK_CURRENTLY_ON(BrowserThread::FILE);
[email protected]200bd332013-08-05 16:19:11141 Time last_modified_time;
142 if (index_times_.find(file_path) != index_times_.end())
143 last_modified_time = index_times_[file_path];
144 return last_modified_time;
145}
146
147void Index::SetTrigramsForFile(const FilePath& file_path,
148 const vector<Trigram>& index,
149 const Time& time) {
mostynb13260d52015-03-26 09:12:09150 DCHECK_CURRENTLY_ON(BrowserThread::FILE);
[email protected]200bd332013-08-05 16:19:11151 FileId file_id = GetFileId(file_path);
152 vector<Trigram>::const_iterator it = index.begin();
153 for (; it != index.end(); ++it) {
154 Trigram trigram = *it;
155 index_[trigram].push_back(file_id);
156 is_normalized_[trigram] = false;
157 }
158 index_times_[file_path] = time;
159}
160
161vector<FilePath> Index::Search(string query) {
mostynb13260d52015-03-26 09:12:09162 DCHECK_CURRENTLY_ON(BrowserThread::FILE);
[email protected]200bd332013-08-05 16:19:11163 const char* data = query.c_str();
164 vector<TrigramChar> trigram_chars;
165 trigram_chars.reserve(query.size());
vsevik3ef1c9d2014-10-23 14:17:35166 for (size_t i = 0; i < query.size(); ++i) {
167 TrigramChar trigram_char = TrigramCharForChar(data[i]);
168 if (trigram_char == kBinaryTrigramChar)
169 trigram_char = kUndefinedTrigramChar;
170 trigram_chars.push_back(trigram_char);
171 }
[email protected]200bd332013-08-05 16:19:11172 vector<Trigram> trigrams;
173 for (size_t i = 0; i + 2 < query.size(); ++i) {
174 Trigram trigram = TrigramAtIndex(trigram_chars, i);
175 if (trigram != kUndefinedTrigram)
176 trigrams.push_back(trigram);
177 }
178 set<FileId> file_ids;
179 bool first = true;
180 vector<Trigram>::const_iterator it = trigrams.begin();
181 for (; it != trigrams.end(); ++it) {
182 Trigram trigram = *it;
183 if (first) {
184 std::copy(index_[trigram].begin(),
185 index_[trigram].end(),
186 std::inserter(file_ids, file_ids.begin()));
187 first = false;
188 continue;
189 }
[email protected]09f3fde82014-05-14 15:08:15190 set<FileId> intersection = base::STLSetIntersection<set<FileId> >(
191 file_ids, index_[trigram]);
[email protected]200bd332013-08-05 16:19:11192 file_ids.swap(intersection);
193 }
194 vector<FilePath> result;
195 FileIdsMap::const_iterator ids_it = file_ids_.begin();
196 for (; ids_it != file_ids_.end(); ++ids_it) {
197 if (trigrams.size() == 0 ||
198 file_ids.find(ids_it->second) != file_ids.end()) {
199 result.push_back(ids_it->first);
200 }
201 }
202 return result;
203}
204
205FileId Index::GetFileId(const FilePath& file_path) {
mostynb13260d52015-03-26 09:12:09206 DCHECK_CURRENTLY_ON(BrowserThread::FILE);
[email protected]200bd332013-08-05 16:19:11207 string file_path_str = file_path.AsUTF8Unsafe();
208 if (file_ids_.find(file_path) != file_ids_.end())
209 return file_ids_[file_path];
210 file_ids_[file_path] = ++last_file_id_;
211 return last_file_id_;
212}
213
214void Index::NormalizeVectors() {
mostynb13260d52015-03-26 09:12:09215 DCHECK_CURRENTLY_ON(BrowserThread::FILE);
[email protected]200bd332013-08-05 16:19:11216 for (size_t i = 0; i < kTrigramCount; ++i) {
217 if (!is_normalized_[i]) {
218 std::sort(index_[i].begin(), index_[i].end());
219 if (index_[i].capacity() > index_[i].size())
220 vector<FileId>(index_[i]).swap(index_[i]);
221 is_normalized_[i] = true;
222 }
223 }
224}
225
226void Index::PrintStats() {
mostynb13260d52015-03-26 09:12:09227 DCHECK_CURRENTLY_ON(BrowserThread::FILE);
[email protected]200bd332013-08-05 16:19:11228 LOG(ERROR) << "Index stats:";
229 size_t size = 0;
230 size_t maxSize = 0;
231 size_t capacity = 0;
232 for (size_t i = 0; i < kTrigramCount; ++i) {
233 if (index_[i].size() > maxSize)
234 maxSize = index_[i].size();
235 size += index_[i].size();
236 capacity += index_[i].capacity();
237 }
238 LOG(ERROR) << " - total trigram count: " << size;
239 LOG(ERROR) << " - max file count per trigram: " << maxSize;
240 LOG(ERROR) << " - total vectors capacity " << capacity;
241 size_t total_index_size =
242 capacity * sizeof(FileId) + sizeof(vector<FileId>) * kTrigramCount;
243 LOG(ERROR) << " - estimated total index size " << total_index_size;
244}
245
// NOTE(review): this typedef is not referenced anywhere in the visible part of
// this file - confirm whether it is still needed.
typedef Callback<void(bool, const vector<bool>&)> IndexerCallback;
247
248} // namespace
249
250DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
251 const FilePath& file_system_path,
252 const TotalWorkCallback& total_work_callback,
253 const WorkedCallback& worked_callback,
254 const DoneCallback& done_callback)
255 : file_system_path_(file_system_path),
256 total_work_callback_(total_work_callback),
257 worked_callback_(worked_callback),
258 done_callback_(done_callback),
[email protected]bda135f2014-04-10 21:55:06259 current_file_(BrowserThread::GetMessageLoopProxyForThread(
260 BrowserThread::FILE).get()),
[email protected]200bd332013-08-05 16:19:11261 files_indexed_(0),
262 stopped_(false) {
263 current_trigrams_set_.resize(kTrigramCount);
264 current_trigrams_.reserve(kTrigramCount);
265}
266
// Nothing to release explicitly.
DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
268
269void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
mostynb13260d52015-03-26 09:12:09270 DCHECK_CURRENTLY_ON(BrowserThread::UI);
[email protected]200bd332013-08-05 16:19:11271 BrowserThread::PostTask(
272 BrowserThread::FILE,
273 FROM_HERE,
274 Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
275}
276
277void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
mostynb13260d52015-03-26 09:12:09278 DCHECK_CURRENTLY_ON(BrowserThread::UI);
[email protected]200bd332013-08-05 16:19:11279 BrowserThread::PostTask(BrowserThread::FILE,
280 FROM_HERE,
281 Bind(&FileSystemIndexingJob::StopOnFileThread, this));
282}
283
// Sets the stop flag. CollectFilesToIndex(), IndexFiles() and ReadFromFile()
// check it and abandon the job once it is set. Posted to the FILE thread by
// Stop().
void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() {
  stopped_ = true;
}
287
288void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
mostynb13260d52015-03-26 09:12:09289 DCHECK_CURRENTLY_ON(BrowserThread::FILE);
[email protected]200bd332013-08-05 16:19:11290 if (stopped_)
291 return;
292 if (!file_enumerator_) {
293 file_enumerator_.reset(
294 new FileEnumerator(file_system_path_, true, FileEnumerator::FILES));
295 }
296 FilePath file_path = file_enumerator_->Next();
297 if (file_path.empty()) {
298 BrowserThread::PostTask(
299 BrowserThread::UI,
300 FROM_HERE,
301 Bind(total_work_callback_, file_path_times_.size()));
302 indexing_it_ = file_path_times_.begin();
303 IndexFiles();
304 return;
305 }
306 Time saved_last_modified_time =
307 g_trigram_index.Get().LastModifiedTimeForFile(file_path);
308 FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo();
309 Time current_last_modified_time = file_info.GetLastModifiedTime();
310 if (current_last_modified_time > saved_last_modified_time) {
311 file_path_times_[file_path] = current_last_modified_time;
312 }
313 BrowserThread::PostTask(
314 BrowserThread::FILE,
315 FROM_HERE,
316 Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
317}
318
319void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
mostynb13260d52015-03-26 09:12:09320 DCHECK_CURRENTLY_ON(BrowserThread::FILE);
[email protected]200bd332013-08-05 16:19:11321 if (stopped_)
322 return;
323 if (indexing_it_ == file_path_times_.end()) {
324 g_trigram_index.Get().NormalizeVectors();
325 BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, done_callback_);
326 return;
327 }
328 FilePath file_path = indexing_it_->first;
[email protected]bda135f2014-04-10 21:55:06329 current_file_.CreateOrOpen(
330 file_path,
331 base::File::FLAG_OPEN | base::File::FLAG_READ,
332 Bind(&FileSystemIndexingJob::StartFileIndexing, this));
[email protected]200bd332013-08-05 16:19:11333}
334
335void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing(
[email protected]bda135f2014-04-10 21:55:06336 base::File::Error error) {
337 if (!current_file_.IsValid()) {
[email protected]200bd332013-08-05 16:19:11338 FinishFileIndexing(false);
339 return;
340 }
[email protected]200bd332013-08-05 16:19:11341 current_file_offset_ = 0;
342 current_trigrams_.clear();
343 std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false);
344 ReadFromFile();
345}
346
347void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
348 if (stopped_) {
349 CloseFile();
350 return;
351 }
[email protected]bda135f2014-04-10 21:55:06352 current_file_.Read(current_file_offset_, kMaxReadLength,
353 Bind(&FileSystemIndexingJob::OnRead, this));
[email protected]200bd332013-08-05 16:19:11354}
355
356void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead(
[email protected]141bcc52014-01-27 21:36:00357 base::File::Error error,
[email protected]200bd332013-08-05 16:19:11358 const char* data,
359 int bytes_read) {
[email protected]141bcc52014-01-27 21:36:00360 if (error != base::File::FILE_OK) {
[email protected]200bd332013-08-05 16:19:11361 FinishFileIndexing(false);
362 return;
363 }
364
365 if (!bytes_read || bytes_read < 3) {
366 FinishFileIndexing(true);
367 return;
368 }
369
370 size_t size = static_cast<size_t>(bytes_read);
371 vector<TrigramChar> trigram_chars;
372 trigram_chars.reserve(size);
373 for (size_t i = 0; i < size; ++i) {
vsevik3ef1c9d2014-10-23 14:17:35374 TrigramChar trigram_char = TrigramCharForChar(data[i]);
375 if (trigram_char == kBinaryTrigramChar) {
[email protected]200bd332013-08-05 16:19:11376 current_trigrams_.clear();
377 FinishFileIndexing(true);
378 return;
379 }
vsevik3ef1c9d2014-10-23 14:17:35380 trigram_chars.push_back(trigram_char);
[email protected]200bd332013-08-05 16:19:11381 }
382
383 for (size_t i = 0; i + 2 < size; ++i) {
384 Trigram trigram = TrigramAtIndex(trigram_chars, i);
385 if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) {
386 current_trigrams_set_[trigram] = true;
387 current_trigrams_.push_back(trigram);
388 }
389 }
390 current_file_offset_ += bytes_read - 2;
391 ReadFromFile();
392}
393
394void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
395 bool success) {
mostynb13260d52015-03-26 09:12:09396 DCHECK_CURRENTLY_ON(BrowserThread::FILE);
[email protected]200bd332013-08-05 16:19:11397 CloseFile();
398 if (success) {
399 FilePath file_path = indexing_it_->first;
400 g_trigram_index.Get().SetTrigramsForFile(
401 file_path, current_trigrams_, file_path_times_[file_path]);
402 }
403 ReportWorked();
404 ++indexing_it_;
405 IndexFiles();
406}
407
408void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
[email protected]bda135f2014-04-10 21:55:06409 if (current_file_.IsValid())
410 current_file_.Close(Bind(&FileSystemIndexingJob::CloseCallback, this));
[email protected]200bd332013-08-05 16:19:11411}
412
// Completion callback of the close request issued by CloseFile(). The result
// is deliberately ignored.
void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback(
    base::File::Error error) {}
[email protected]200bd332013-08-05 16:19:11415
416void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
417 TimeTicks current_time = TimeTicks::Now();
418 bool should_send_worked_nitification = true;
419 if (!last_worked_notification_time_.is_null()) {
420 TimeDelta delta = current_time - last_worked_notification_time_;
421 if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNitification)
422 should_send_worked_nitification = false;
423 }
424 ++files_indexed_;
425 if (should_send_worked_nitification) {
426 last_worked_notification_time_ = current_time;
427 BrowserThread::PostTask(
428 BrowserThread::UI, FROM_HERE, Bind(worked_callback_, files_indexed_));
429 files_indexed_ = 0;
430 }
431}
432
// No per-instance setup is done here; the trigram index is the file-level
// g_trigram_index.
DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {
}
435
// Nothing to release explicitly.
DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {}
437
438scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob>
439DevToolsFileSystemIndexer::IndexPath(
440 const string& file_system_path,
441 const TotalWorkCallback& total_work_callback,
442 const WorkedCallback& worked_callback,
443 const DoneCallback& done_callback) {
mostynb13260d52015-03-26 09:12:09444 DCHECK_CURRENTLY_ON(BrowserThread::UI);
[email protected]200bd332013-08-05 16:19:11445 scoped_refptr<FileSystemIndexingJob> indexing_job =
446 new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path),
447 total_work_callback,
448 worked_callback,
449 done_callback);
450 indexing_job->Start();
451 return indexing_job;
452}
453
454void DevToolsFileSystemIndexer::SearchInPath(const string& file_system_path,
455 const string& query,
456 const SearchCallback& callback) {
mostynb13260d52015-03-26 09:12:09457 DCHECK_CURRENTLY_ON(BrowserThread::UI);
[email protected]200bd332013-08-05 16:19:11458 BrowserThread::PostTask(
459 BrowserThread::FILE,
460 FROM_HERE,
461 Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread,
462 this,
463 file_system_path,
464 query,
465 callback));
466}
467
468void DevToolsFileSystemIndexer::SearchInPathOnFileThread(
469 const string& file_system_path,
470 const string& query,
471 const SearchCallback& callback) {
mostynb13260d52015-03-26 09:12:09472 DCHECK_CURRENTLY_ON(BrowserThread::FILE);
[email protected]200bd332013-08-05 16:19:11473 vector<FilePath> file_paths = g_trigram_index.Get().Search(query);
474 vector<string> result;
475 FilePath path = FilePath::FromUTF8Unsafe(file_system_path);
476 vector<FilePath>::const_iterator it = file_paths.begin();
477 for (; it != file_paths.end(); ++it) {
478 if (path.IsParent(*it))
479 result.push_back(it->AsUTF8Unsafe());
480 }
481 BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, Bind(callback, result));
482}