Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 1 | // Copyright 2017 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
Samuel Huang | 577ef6c | 2018-03-13 18:19:34 | [diff] [blame] | 5 | #include "components/zucchini/equivalence_map.h" |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 6 | |
| 7 | #include <algorithm> |
Samuel Huang | 577ef6c | 2018-03-13 18:19:34 | [diff] [blame] | 8 | #include <utility> |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 9 | |
| 10 | #include "base/logging.h" |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 11 | #include "base/numerics/safe_conversions.h" |
Jdragon | a248d5c | 2018-08-24 12:46:42 | [diff] [blame] | 12 | #include "base/stl_util.h" |
Samuel Huang | 577ef6c | 2018-03-13 18:19:34 | [diff] [blame] | 13 | #include "components/zucchini/encoded_view.h" |
| 14 | #include "components/zucchini/patch_reader.h" |
| 15 | #include "components/zucchini/suffix_array.h" |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 16 | |
| 17 | namespace zucchini { |
| 18 | |
Calder Kitagawa | 94722d4e | 2018-06-28 15:32:16 | [diff] [blame] | 19 | namespace { |
| 20 | |
| 21 | // TODO(haungs): Tune these numbers to improve pathological case results. |
| 22 | |
| 23 | // In pathological cases Zucchini can exhibit O(n^2) behavior if the seed |
| 24 | // selection process runs to completion. To prevent this we impose a quota for |
| 25 | // the total length of equivalences the seed selection process can perform |
| 26 | // trials on. For regular use cases it is unlikely this quota will be exceeded, |
| 27 | // and if it is the effects on patch size are expected to be small. |
| 28 | constexpr uint64_t kSeedSelectionTotalVisitLengthQuota = 1 << 18; // 256 KiB |
| 29 | |
| 30 | // The aforementioned quota alone is insufficient, as exploring backwards will |
| 31 | // still be very successful resulting in O(n) behavior in the case of a limited |
| 32 | // seed selection trials. This results in O(n^2) behavior returning. To mitigate |
| 33 | // this we also impose a cap on the ExtendEquivalenceBackward() exploration. |
| 34 | constexpr offset_t kBackwardsExtendLimit = 1 << 16; // 64 KiB |
| 35 | |
| 36 | } // namespace |
| 37 | |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 38 | /******** Utility Functions ********/ |
| 39 | |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 40 | double GetTokenSimilarity( |
| 41 | const ImageIndex& old_image_index, |
| 42 | const ImageIndex& new_image_index, |
| 43 | const std::vector<TargetsAffinity>& targets_affinities, |
| 44 | offset_t src, |
| 45 | offset_t dst) { |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 46 | DCHECK(old_image_index.IsToken(src)); |
| 47 | DCHECK(new_image_index.IsToken(dst)); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 48 | |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 49 | TypeTag old_type = old_image_index.LookupType(src); |
| 50 | TypeTag new_type = new_image_index.LookupType(dst); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 51 | if (old_type != new_type) |
| 52 | return kMismatchFatal; |
| 53 | |
| 54 | // Raw comparison. |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 55 | if (!old_image_index.IsReference(src) && !new_image_index.IsReference(dst)) { |
| 56 | return old_image_index.GetRawValue(src) == new_image_index.GetRawValue(dst) |
| 57 | ? 1.0 |
| 58 | : -1.5; |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 59 | } |
| 60 | |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 61 | const ReferenceSet& old_ref_set = old_image_index.refs(old_type); |
| 62 | const ReferenceSet& new_ref_set = new_image_index.refs(new_type); |
Etienne Pierre-doray | 0434f5b | 2018-08-13 18:49:00 | [diff] [blame] | 63 | Reference old_reference = old_ref_set.at(src); |
| 64 | Reference new_reference = new_ref_set.at(dst); |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 65 | PoolTag pool_tag = old_ref_set.pool_tag(); |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 66 | |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 67 | double affinity = targets_affinities[pool_tag.value()].AffinityBetween( |
Etienne Pierre-doray | 0434f5b | 2018-08-13 18:49:00 | [diff] [blame] | 68 | old_ref_set.target_pool().KeyForOffset(old_reference.target), |
| 69 | new_ref_set.target_pool().KeyForOffset(new_reference.target)); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 70 | |
| 71 | // Both targets are not associated, which implies a weak match. |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 72 | if (affinity == 0.0) |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 73 | return 0.5 * old_ref_set.width(); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 74 | |
| 75 | // At least one target is associated, so values are compared. |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 76 | return affinity > 0.0 ? old_ref_set.width() : -2.0; |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 77 | } |
| 78 | |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 79 | double GetEquivalenceSimilarity( |
| 80 | const ImageIndex& old_image_index, |
| 81 | const ImageIndex& new_image_index, |
| 82 | const std::vector<TargetsAffinity>& targets_affinities, |
| 83 | const Equivalence& equivalence) { |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 84 | double similarity = 0.0; |
| 85 | for (offset_t k = 0; k < equivalence.length; ++k) { |
| 86 | // Non-tokens are joined with the nearest previous token: skip until we |
| 87 | // cover the unit. |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 88 | if (!new_image_index.IsToken(equivalence.dst_offset + k)) |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 89 | continue; |
| 90 | |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 91 | similarity += GetTokenSimilarity( |
| 92 | old_image_index, new_image_index, targets_affinities, |
| 93 | equivalence.src_offset + k, equivalence.dst_offset + k); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 94 | if (similarity == kMismatchFatal) |
| 95 | return kMismatchFatal; |
| 96 | } |
| 97 | return similarity; |
| 98 | } |
| 99 | |
| 100 | EquivalenceCandidate ExtendEquivalenceForward( |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 101 | const ImageIndex& old_image_index, |
| 102 | const ImageIndex& new_image_index, |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 103 | const std::vector<TargetsAffinity>& targets_affinities, |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 104 | const EquivalenceCandidate& candidate, |
| 105 | double min_similarity) { |
| 106 | Equivalence equivalence = candidate.eq; |
| 107 | offset_t best_k = equivalence.length; |
| 108 | double current_similarity = candidate.similarity; |
| 109 | double best_similarity = current_similarity; |
| 110 | double current_penalty = min_similarity; |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 111 | for (offset_t k = best_k; |
| 112 | equivalence.src_offset + k < old_image_index.size() && |
| 113 | equivalence.dst_offset + k < new_image_index.size(); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 114 | ++k) { |
| 115 | // Mismatch in type, |candidate| cannot be extended further. |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 116 | if (old_image_index.LookupType(equivalence.src_offset + k) != |
| 117 | new_image_index.LookupType(equivalence.dst_offset + k)) { |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 118 | break; |
Etienne Pierre-Doray | a846f68 | 2017-08-31 20:30:47 | [diff] [blame] | 119 | } |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 120 | |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 121 | if (!new_image_index.IsToken(equivalence.dst_offset + k)) { |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 122 | // Non-tokens are joined with the nearest previous token: skip until we |
| 123 | // cover the unit, and extend |best_k| if applicable. |
| 124 | if (best_k == k) |
| 125 | best_k = k + 1; |
| 126 | continue; |
| 127 | } |
| 128 | |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 129 | double similarity = GetTokenSimilarity( |
| 130 | old_image_index, new_image_index, targets_affinities, |
| 131 | equivalence.src_offset + k, equivalence.dst_offset + k); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 132 | current_similarity += similarity; |
| 133 | current_penalty = std::max(0.0, current_penalty) - similarity; |
| 134 | |
| 135 | if (current_similarity < 0.0 || current_penalty >= min_similarity) |
| 136 | break; |
| 137 | if (current_similarity >= best_similarity) { |
| 138 | best_similarity = current_similarity; |
| 139 | best_k = k + 1; |
| 140 | } |
| 141 | } |
| 142 | equivalence.length = best_k; |
| 143 | return {equivalence, best_similarity}; |
| 144 | } |
| 145 | |
| 146 | EquivalenceCandidate ExtendEquivalenceBackward( |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 147 | const ImageIndex& old_image_index, |
| 148 | const ImageIndex& new_image_index, |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 149 | const std::vector<TargetsAffinity>& targets_affinities, |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 150 | const EquivalenceCandidate& candidate, |
| 151 | double min_similarity) { |
| 152 | Equivalence equivalence = candidate.eq; |
| 153 | offset_t best_k = 0; |
| 154 | double current_similarity = candidate.similarity; |
| 155 | double best_similarity = current_similarity; |
| 156 | double current_penalty = 0.0; |
Calder Kitagawa | 94722d4e | 2018-06-28 15:32:16 | [diff] [blame] | 157 | offset_t k_min = std::min( |
| 158 | {equivalence.dst_offset, equivalence.src_offset, kBackwardsExtendLimit}); |
| 159 | for (offset_t k = 1; k <= k_min; ++k) { |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 160 | // Mismatch in type, |candidate| cannot be extended further. |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 161 | if (old_image_index.LookupType(equivalence.src_offset - k) != |
| 162 | new_image_index.LookupType(equivalence.dst_offset - k)) { |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 163 | break; |
Etienne Pierre-Doray | a846f68 | 2017-08-31 20:30:47 | [diff] [blame] | 164 | } |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 165 | |
| 166 | // Non-tokens are joined with the nearest previous token: skip until we |
| 167 | // reach the next token. |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 168 | if (!new_image_index.IsToken(equivalence.dst_offset - k)) |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 169 | continue; |
| 170 | |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 171 | DCHECK_EQ(old_image_index.LookupType(equivalence.src_offset - k), |
| 172 | new_image_index.LookupType(equivalence.dst_offset - |
| 173 | k)); // Sanity check. |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 174 | double similarity = GetTokenSimilarity( |
| 175 | old_image_index, new_image_index, targets_affinities, |
| 176 | equivalence.src_offset - k, equivalence.dst_offset - k); |
| 177 | |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 178 | current_similarity += similarity; |
| 179 | current_penalty = std::max(0.0, current_penalty) - similarity; |
| 180 | |
| 181 | if (current_similarity < 0.0 || current_penalty >= min_similarity) |
| 182 | break; |
| 183 | if (current_similarity >= best_similarity) { |
| 184 | best_similarity = current_similarity; |
| 185 | best_k = k; |
| 186 | } |
| 187 | } |
| 188 | |
| 189 | equivalence.dst_offset -= best_k; |
| 190 | equivalence.src_offset -= best_k; |
| 191 | equivalence.length += best_k; |
| 192 | return {equivalence, best_similarity}; |
| 193 | } |
| 194 | |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 195 | EquivalenceCandidate VisitEquivalenceSeed( |
| 196 | const ImageIndex& old_image_index, |
| 197 | const ImageIndex& new_image_index, |
| 198 | const std::vector<TargetsAffinity>& targets_affinities, |
| 199 | offset_t src, |
| 200 | offset_t dst, |
| 201 | double min_similarity) { |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 202 | EquivalenceCandidate candidate{{src, dst, 0}, 0.0}; // Empty. |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 203 | if (!old_image_index.IsToken(src)) |
| 204 | return candidate; |
| 205 | candidate = |
| 206 | ExtendEquivalenceForward(old_image_index, new_image_index, |
| 207 | targets_affinities, candidate, min_similarity); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 208 | if (candidate.similarity < min_similarity) |
| 209 | return candidate; // Not worth exploring any more. |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 210 | return ExtendEquivalenceBackward(old_image_index, new_image_index, |
| 211 | targets_affinities, candidate, |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 212 | min_similarity); |
| 213 | } |
| 214 | |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 215 | /******** OffsetMapper ********/ |
| 216 | |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 217 | OffsetMapper::OffsetMapper(std::vector<Equivalence>&& equivalences, |
Etienne Pierre-doray | 5946dbfa | 2018-09-10 16:19:33 | [diff] [blame] | 218 | offset_t old_image_size, |
| 219 | offset_t new_image_size) |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 220 | : equivalences_(std::move(equivalences)), |
| 221 | old_image_size_(old_image_size), |
| 222 | new_image_size_(new_image_size) { |
| 223 | DCHECK_GT(new_image_size_, 0U); |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 224 | DCHECK(std::is_sorted(equivalences_.begin(), equivalences_.end(), |
| 225 | [](const Equivalence& a, const Equivalence& b) { |
| 226 | return a.src_offset < b.src_offset; |
| 227 | })); |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 228 | // This is for testing. Assume pruned. |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 229 | } |
| 230 | |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 231 | OffsetMapper::OffsetMapper(EquivalenceSource&& equivalence_source, |
Etienne Pierre-doray | 5946dbfa | 2018-09-10 16:19:33 | [diff] [blame] | 232 | offset_t old_image_size, |
| 233 | offset_t new_image_size) |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 234 | : old_image_size_(old_image_size), new_image_size_(new_image_size) { |
| 235 | DCHECK_GT(new_image_size_, 0U); |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 236 | for (auto e = equivalence_source.GetNext(); e.has_value(); |
| 237 | e = equivalence_source.GetNext()) { |
| 238 | equivalences_.push_back(*e); |
| 239 | } |
| 240 | PruneEquivalencesAndSortBySource(&equivalences_); |
| 241 | } |
| 242 | |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 243 | OffsetMapper::OffsetMapper(const EquivalenceMap& equivalence_map, |
Etienne Pierre-doray | 5946dbfa | 2018-09-10 16:19:33 | [diff] [blame] | 244 | offset_t old_image_size, |
| 245 | offset_t new_image_size) |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 246 | : equivalences_(equivalence_map.size()), |
| 247 | old_image_size_(old_image_size), |
| 248 | new_image_size_(new_image_size) { |
| 249 | DCHECK_GT(new_image_size_, 0U); |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 250 | std::transform(equivalence_map.begin(), equivalence_map.end(), |
| 251 | equivalences_.begin(), |
| 252 | [](const EquivalenceCandidate& c) { return c.eq; }); |
| 253 | PruneEquivalencesAndSortBySource(&equivalences_); |
| 254 | } |
| 255 | |
| 256 | OffsetMapper::~OffsetMapper() = default; |
| 257 | |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 258 | // Safely evaluates |offset - unit.src_offset + unit.dst_offset| with signed |
| 259 | // arithmetic, then clips the result to |[0, new_image_size_)|. |
| 260 | offset_t OffsetMapper::NaiveExtendedForwardProject(const Equivalence& unit, |
| 261 | offset_t offset) const { |
| 262 | int64_t old_offset64 = offset; |
| 263 | int64_t src_offset64 = unit.src_offset; |
| 264 | int64_t dst_offset64 = unit.dst_offset; |
| 265 | uint64_t new_offset64 = std::min<uint64_t>( |
| 266 | std::max<int64_t>(0LL, old_offset64 - src_offset64 + dst_offset64), |
| 267 | new_image_size_ - 1); |
| 268 | return base::checked_cast<offset_t>(new_offset64); |
| 269 | } |
| 270 | |
| 271 | offset_t OffsetMapper::ExtendedForwardProject(offset_t offset) const { |
| 272 | DCHECK(!equivalences_.empty()); |
| 273 | if (offset < old_image_size_) { |
| 274 | // Finds the equivalence unit whose "old" block is nearest to |offset|, |
| 275 | // favoring the block with lower offset in case of a tie. |
| 276 | auto pos = std::upper_bound( |
| 277 | equivalences_.begin(), equivalences_.end(), offset, |
| 278 | [](offset_t a, const Equivalence& b) { return a < b.src_offset; }); |
| 279 | // For tiebreaking: |offset - pos[-1].src_end()| is actually 1 less than |
| 280 | // |offset|'s distance to "old" block of |pos[-1]|. Therefore "<" is used. |
| 281 | if (pos != equivalences_.begin() && |
| 282 | (pos == equivalences_.end() || offset < pos[-1].src_end() || |
| 283 | offset - pos[-1].src_end() < pos->src_offset - offset)) { |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 284 | --pos; |
| 285 | } |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 286 | return NaiveExtendedForwardProject(*pos, offset); |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 287 | } |
Samuel Huang | ad7a5c0 | 2018-06-26 14:47:02 | [diff] [blame] | 288 | // Fake offsets. |
| 289 | offset_t delta = offset - old_image_size_; |
| 290 | return delta < kOffsetBound - new_image_size_ ? new_image_size_ + delta |
| 291 | : kOffsetBound - 1; |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 292 | } |
| 293 | |
| 294 | void OffsetMapper::ForwardProjectAll(std::vector<offset_t>* offsets) const { |
| 295 | DCHECK(std::is_sorted(offsets->begin(), offsets->end())); |
| 296 | auto current = equivalences_.begin(); |
| 297 | for (auto& src : *offsets) { |
| 298 | while (current != end() && current->src_end() <= src) { |
| 299 | ++current; |
| 300 | } |
| 301 | |
| 302 | if (current != end() && current->src_offset <= src) { |
| 303 | src = src - current->src_offset + current->dst_offset; |
| 304 | } else { |
| 305 | src = kInvalidOffset; |
| 306 | } |
| 307 | } |
Jdragon | a248d5c | 2018-08-24 12:46:42 | [diff] [blame] | 308 | base::Erase(*offsets, kInvalidOffset); |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 309 | offsets->shrink_to_fit(); |
| 310 | } |
| 311 | |
| 312 | void OffsetMapper::PruneEquivalencesAndSortBySource( |
| 313 | std::vector<Equivalence>* equivalences) { |
| 314 | std::sort(equivalences->begin(), equivalences->end(), |
| 315 | [](const Equivalence& a, const Equivalence& b) { |
| 316 | return a.src_offset < b.src_offset; |
| 317 | }); |
| 318 | |
| 319 | for (auto current = equivalences->begin(); current != equivalences->end(); |
| 320 | ++current) { |
| 321 | // A "reaper" is an equivalence after |current| that overlaps with it, but |
| 322 | // is longer, and so truncates |current|. For example: |
| 323 | // ****** <= |current| |
| 324 | // ** |
| 325 | // **** |
| 326 | // **** |
| 327 | // ********** <= |next| as reaper. |
| 328 | // If a reaper is found (as |next|), every equivalence strictly between |
| 329 | // |current| and |next| would be truncated to 0 and discarded. Handling this |
| 330 | // case is important to avoid O(n^2) behavior. |
| 331 | bool next_is_reaper = false; |
| 332 | |
| 333 | // Look ahead to resolve overlaps, until a better candidate is found. |
| 334 | auto next = current + 1; |
| 335 | for (; next != equivalences->end(); ++next) { |
| 336 | DCHECK_GE(next->src_offset, current->src_offset); |
| 337 | if (next->src_offset >= current->src_end()) |
| 338 | break; // No more overlap. |
| 339 | |
| 340 | if (current->length < next->length) { |
| 341 | // |next| is better: So it is a reaper that shrinks |current|. |
| 342 | offset_t delta = current->src_end() - next->src_offset; |
| 343 | current->length -= delta; |
| 344 | next_is_reaper = true; |
| 345 | break; |
| 346 | } |
| 347 | } |
| 348 | |
| 349 | if (next_is_reaper) { |
| 350 | // Discard all equivalences strictly between |cur| and |next|. |
| 351 | for (auto reduced = current + 1; reduced != next; ++reduced) |
| 352 | reduced->length = 0; |
| 353 | current = next - 1; |
| 354 | } else { |
| 355 | // Shrink all equivalences that overlap with |current|. These are all |
| 356 | // worse than |current| since no reaper is found. |
| 357 | for (auto reduced = current + 1; reduced != next; ++reduced) { |
Calder Kitagawa | bdc23714 | 2018-03-09 19:08:41 | [diff] [blame] | 358 | offset_t delta = current->src_end() - reduced->src_offset; |
| 359 | reduced->length -= std::min(reduced->length, delta); |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 360 | reduced->src_offset += delta; |
| 361 | reduced->dst_offset += delta; |
| 362 | DCHECK_EQ(reduced->src_offset, current->src_end()); |
| 363 | } |
| 364 | } |
| 365 | } |
| 366 | |
| 367 | // Discard all equivalences with length == 0. |
Jdragon | a248d5c | 2018-08-24 12:46:42 | [diff] [blame] | 368 | base::EraseIf(*equivalences, [](const Equivalence& equivalence) { |
| 369 | return equivalence.length == 0; |
| 370 | }); |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 371 | } |
| 372 | |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 373 | /******** EquivalenceMap ********/ |
| 374 | |
| 375 | EquivalenceMap::EquivalenceMap() = default; |
Etienne Pierre-Doray | f5e4fa8 | 2017-08-15 17:58:18 | [diff] [blame] | 376 | |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 377 | EquivalenceMap::EquivalenceMap(std::vector<EquivalenceCandidate>&& equivalences) |
| 378 | : candidates_(std::move(equivalences)) { |
Etienne Pierre-Doray | f5e4fa8 | 2017-08-15 17:58:18 | [diff] [blame] | 379 | SortByDestination(); |
| 380 | } |
| 381 | |
Etienne Pierre-Doray | 65b0a03a | 2017-08-18 17:24:02 | [diff] [blame] | 382 | EquivalenceMap::EquivalenceMap(EquivalenceMap&&) = default; |
| 383 | |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 384 | EquivalenceMap::~EquivalenceMap() = default; |
| 385 | |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 386 | void EquivalenceMap::Build( |
| 387 | const std::vector<offset_t>& old_sa, |
| 388 | const EncodedView& old_view, |
| 389 | const EncodedView& new_view, |
| 390 | const std::vector<TargetsAffinity>& targets_affinities, |
| 391 | double min_similarity) { |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 392 | DCHECK_EQ(old_sa.size(), old_view.size()); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 393 | |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 394 | CreateCandidates(old_sa, old_view, new_view, targets_affinities, |
| 395 | min_similarity); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 396 | SortByDestination(); |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 397 | Prune(old_view, new_view, targets_affinities, min_similarity); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 398 | |
| 399 | offset_t coverage = 0; |
| 400 | offset_t current_offset = 0; |
| 401 | for (auto candidate : candidates_) { |
| 402 | DCHECK_GE(candidate.eq.dst_offset, current_offset); |
| 403 | coverage += candidate.eq.length; |
| 404 | current_offset = candidate.eq.dst_end(); |
| 405 | } |
| 406 | LOG(INFO) << "Equivalence Count: " << size(); |
| 407 | LOG(INFO) << "Coverage / Extra / Total: " << coverage << " / " |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 408 | << new_view.size() - coverage << " / " << new_view.size(); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 409 | } |
| 410 | |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 411 | void EquivalenceMap::CreateCandidates( |
| 412 | const std::vector<offset_t>& old_sa, |
| 413 | const EncodedView& old_view, |
| 414 | const EncodedView& new_view, |
| 415 | const std::vector<TargetsAffinity>& targets_affinities, |
| 416 | double min_similarity) { |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 417 | candidates_.clear(); |
| 418 | |
| 419 | // This is an heuristic to find 'good' equivalences on encoded views. |
| 420 | // Equivalences are found in ascending order of |new_image|. |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 421 | offset_t dst_offset = 0; |
| 422 | |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 423 | while (dst_offset < new_view.size()) { |
| 424 | if (!new_view.IsToken(dst_offset)) { |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 425 | ++dst_offset; |
| 426 | continue; |
| 427 | } |
| 428 | auto match = |
| 429 | SuffixLowerBound(old_sa, old_view.begin(), |
| 430 | new_view.begin() + dst_offset, new_view.end()); |
| 431 | |
| 432 | offset_t next_dst_offset = dst_offset + 1; |
| 433 | // TODO(huangs): Clean up. |
| 434 | double best_similarity = min_similarity; |
Calder Kitagawa | 94722d4e | 2018-06-28 15:32:16 | [diff] [blame] | 435 | uint64_t total_visit_length = 0; |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 436 | EquivalenceCandidate best_candidate = {{0, 0, 0}, 0.0}; |
| 437 | for (auto it = match; it != old_sa.end(); ++it) { |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 438 | EquivalenceCandidate candidate = VisitEquivalenceSeed( |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 439 | old_view.image_index(), new_view.image_index(), targets_affinities, |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 440 | static_cast<offset_t>(*it), dst_offset, min_similarity); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 441 | if (candidate.similarity > best_similarity) { |
| 442 | best_candidate = candidate; |
| 443 | best_similarity = candidate.similarity; |
| 444 | next_dst_offset = candidate.eq.dst_end(); |
Calder Kitagawa | 94722d4e | 2018-06-28 15:32:16 | [diff] [blame] | 445 | total_visit_length += candidate.eq.length; |
| 446 | if (total_visit_length > kSeedSelectionTotalVisitLengthQuota) { |
| 447 | break; |
| 448 | } |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 449 | } else { |
| 450 | break; |
| 451 | } |
| 452 | } |
Calder Kitagawa | 94722d4e | 2018-06-28 15:32:16 | [diff] [blame] | 453 | total_visit_length = 0; |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 454 | for (auto it = match; it != old_sa.begin(); --it) { |
| 455 | EquivalenceCandidate candidate = VisitEquivalenceSeed( |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 456 | old_view.image_index(), new_view.image_index(), targets_affinities, |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 457 | static_cast<offset_t>(it[-1]), dst_offset, min_similarity); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 458 | if (candidate.similarity > best_similarity) { |
| 459 | best_candidate = candidate; |
| 460 | best_similarity = candidate.similarity; |
| 461 | next_dst_offset = candidate.eq.dst_end(); |
Calder Kitagawa | 94722d4e | 2018-06-28 15:32:16 | [diff] [blame] | 462 | total_visit_length += candidate.eq.length; |
| 463 | if (total_visit_length > kSeedSelectionTotalVisitLengthQuota) { |
| 464 | break; |
| 465 | } |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 466 | } else { |
| 467 | break; |
| 468 | } |
| 469 | } |
| 470 | if (best_candidate.similarity >= min_similarity) { |
| 471 | candidates_.push_back(best_candidate); |
| 472 | } |
| 473 | |
| 474 | dst_offset = next_dst_offset; |
| 475 | } |
| 476 | } |
| 477 | |
| 478 | void EquivalenceMap::SortByDestination() { |
| 479 | std::sort(candidates_.begin(), candidates_.end(), |
| 480 | [](const EquivalenceCandidate& a, const EquivalenceCandidate& b) { |
| 481 | return a.eq.dst_offset < b.eq.dst_offset; |
| 482 | }); |
| 483 | } |
| 484 | |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 485 | void EquivalenceMap::Prune( |
| 486 | const EncodedView& old_view, |
| 487 | const EncodedView& new_view, |
| 488 | const std::vector<TargetsAffinity>& target_affinities, |
| 489 | double min_similarity) { |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 490 | // TODO(etiennep): unify with |
| 491 | // OffsetMapper::PruneEquivalencesAndSortBySource(). |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 492 | for (auto current = candidates_.begin(); current != candidates_.end(); |
| 493 | ++current) { |
| 494 | if (current->similarity < min_similarity) |
| 495 | continue; // This candidate will be discarded anyways. |
| 496 | |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 497 | bool next_is_reaper = false; |
| 498 | |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 499 | // Look ahead to resolve overlaps, until a better candidate is found. |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 500 | auto next = current + 1; |
| 501 | for (; next != candidates_.end(); ++next) { |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 502 | DCHECK_GE(next->eq.dst_offset, current->eq.dst_offset); |
| 503 | if (next->eq.dst_offset >= current->eq.dst_offset + current->eq.length) |
| 504 | break; // No more overlap. |
| 505 | |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 506 | if (current->similarity < next->similarity) { |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 507 | // |next| is better: So it is a reaper that shrinks |current|. |
| 508 | offset_t delta = current->eq.dst_end() - next->eq.dst_offset; |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 509 | current->eq.length -= delta; |
Etienne Pierre-Doray | 0131956 | 2017-12-30 20:53:33 | [diff] [blame] | 510 | current->similarity = GetEquivalenceSimilarity( |
Etienne Pierre-Doray | efe2834 | 2018-01-09 13:47:53 | [diff] [blame] | 511 | old_view.image_index(), new_view.image_index(), target_affinities, |
| 512 | current->eq); |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 513 | |
| 514 | next_is_reaper = true; |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 515 | break; |
| 516 | } |
| 517 | } |
| 518 | |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 519 | if (next_is_reaper) { |
| 520 | // Discard all equivalences strictly between |cur| and |next|. |
| 521 | for (auto reduced = current + 1; reduced != next; ++reduced) { |
| 522 | reduced->eq.length = 0; |
| 523 | reduced->similarity = 0; |
| 524 | } |
| 525 | current = next - 1; |
| 526 | } else { |
| 527 | // Shrinks all overlapping candidates following and worse than |current|. |
| 528 | for (auto reduced = current + 1; reduced != next; ++reduced) { |
Calder Kitagawa | bdc23714 | 2018-03-09 19:08:41 | [diff] [blame] | 529 | offset_t delta = current->eq.dst_end() - reduced->eq.dst_offset; |
| 530 | reduced->eq.length -= std::min(reduced->eq.length, delta); |
Etienne Pierre-Doray | 78754aa | 2018-03-07 16:26:52 | [diff] [blame] | 531 | reduced->eq.src_offset += delta; |
| 532 | reduced->eq.dst_offset += delta; |
| 533 | reduced->similarity = GetEquivalenceSimilarity( |
| 534 | old_view.image_index(), new_view.image_index(), target_affinities, |
| 535 | reduced->eq); |
| 536 | DCHECK_EQ(reduced->eq.dst_offset, current->eq.dst_end()); |
| 537 | } |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 538 | } |
| 539 | } |
| 540 | |
| 541 | // Discard all candidates with similarity smaller than |min_similarity|. |
Jdragon | a248d5c | 2018-08-24 12:46:42 | [diff] [blame] | 542 | base::EraseIf(candidates_, |
| 543 | [min_similarity](const EquivalenceCandidate& candidate) { |
| 544 | return candidate.similarity < min_similarity; |
| 545 | }); |
Etienne Pierre-Doray | 16083913 | 2017-08-11 01:22:35 | [diff] [blame] | 546 | } |
| 547 | |
| 548 | } // namespace zucchini |