blob: 82c23eef33c99a7109669bc620dd7f01ad8dccf2 [file] [log] [blame]
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
// We want to measure the similarity of two sequences of bytes as a surrogate
// for measuring how well a second sequence will compress differentially to the
// first sequence.
8
#include "courgette/difference_estimator.h"

#include <string.h>

#include "base/containers/hash_tables.h"
[email protected]04ca1bc2009-05-08 23:00:2912
13namespace courgette {
14
// Number of consecutive bytes that make up one tuple.  The difference measure
// is the count of tuples of this size that occur in Subject but do not occur
// in Base.
const int kTupleSize = 4;
18
19namespace {
20
anujk.sharma1b06a962015-01-20 15:54:5921static_assert(kTupleSize >= 4 && kTupleSize <= 8,
22 "kTupleSize should be between 4 and 8");
[email protected]04ca1bc2009-05-08 23:00:2923
24size_t HashTuple(const uint8* source) {
25 size_t hash1 = *reinterpret_cast<const uint32*>(source);
26 size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4);
27 size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23);
28 return hash;
29}
30
31bool RegionsEqual(const Region& a, const Region& b) {
32 if (a.length() != b.length())
33 return false;
34 return memcmp(a.start(), b.start(), a.length()) == 0;
35}
36
}  // anonymous namespace
38
39class DifferenceEstimator::Base {
40 public:
41 explicit Base(const Region& region) : region_(region) { }
42
43 void Init() {
halyavinb7d5b8f2015-03-13 06:07:5644 if (region_.length() < kTupleSize)
45 return;
[email protected]04ca1bc2009-05-08 23:00:2946 const uint8* start = region_.start();
47 const uint8* end = region_.end() - (kTupleSize - 1);
48 for (const uint8* p = start; p < end; ++p) {
49 size_t hash = HashTuple(p);
50 hashes_.insert(hash);
51 }
52 }
53
54 const Region& region() const { return region_; }
55
56 private:
57 Region region_;
58 base::hash_set<size_t> hashes_;
59
60 friend class DifferenceEstimator;
61 DISALLOW_COPY_AND_ASSIGN(Base);
62};
63
64class DifferenceEstimator::Subject {
65 public:
66 explicit Subject(const Region& region) : region_(region) {}
67
68 const Region& region() const { return region_; }
69
70 private:
71 Region region_;
72
73 DISALLOW_COPY_AND_ASSIGN(Subject);
74};
75
76DifferenceEstimator::DifferenceEstimator() {}
77
78DifferenceEstimator::~DifferenceEstimator() {
79 for (size_t i = 0; i < owned_bases_.size(); ++i)
80 delete owned_bases_[i];
81 for (size_t i = 0; i < owned_subjects_.size(); ++i)
82 delete owned_subjects_[i];
83}
84
85DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) {
86 Base* base = new Base(region);
87 base->Init();
88 owned_bases_.push_back(base);
89 return base;
90}
91
92DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject(
93 const Region& region) {
94 Subject* subject = new Subject(region);
95 owned_subjects_.push_back(subject);
96 return subject;
97}
98
99size_t DifferenceEstimator::Measure(Base* base, Subject* subject) {
100 size_t mismatches = 0;
halyavinb7d5b8f2015-03-13 06:07:56101 if (subject->region().length() >= kTupleSize) {
102 const uint8* start = subject->region().start();
103 const uint8* end = subject->region().end() - (kTupleSize - 1);
[email protected]04ca1bc2009-05-08 23:00:29104
halyavinb7d5b8f2015-03-13 06:07:56105 const uint8* p = start;
106 while (p < end) {
107 size_t hash = HashTuple(p);
108 if (base->hashes_.find(hash) == base->hashes_.end()) {
109 ++mismatches;
110 }
111 p += 1;
[email protected]04ca1bc2009-05-08 23:00:29112 }
[email protected]04ca1bc2009-05-08 23:00:29113 }
114
115 if (mismatches == 0) {
116 if (RegionsEqual(base->region(), subject->region()))
117 return 0;
118 }
119 ++mismatches; // Guarantee not zero.
120 return mismatches;
121}
122
}  // namespace courgette