Avi Drissman | 8ba1bad | 2022-09-13 19:22:36 | [diff] [blame] | 1 | // Copyright 2017 The Chromium Authors |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
#include "components/zucchini/heuristic_ensemble_matcher.h"

#include <algorithm>
#include <cmath>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "base/functional/bind.h"
#include "base/logging.h"
#include "base/memory/raw_ref.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/stringprintf.h"
#include "components/zucchini/binary_data_histogram.h"
#include "components/zucchini/element_detection.h"
#include "components/zucchini/image_utils.h"
#include "components/zucchini/io_utils.h"
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 22 | |
| 23 | namespace zucchini { |
| 24 | |
| 25 | namespace { |
| 26 | |
| 27 | /******** Helper Functions ********/ |
| 28 | |
| 29 | // Uses |detector| to find embedded executables inside |image|, and returns the |
Anton Bikineev | 1156b5f | 2021-05-15 22:35:36 | [diff] [blame] | 30 | // result on success, or absl::nullopt on failure, which occurs if too many (> |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 31 | // |kElementLimit|) elements are found. |
Anton Bikineev | 1156b5f | 2021-05-15 22:35:36 | [diff] [blame] | 32 | absl::optional<std::vector<Element>> FindEmbeddedElements( |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 33 | ConstBufferView image, |
| 34 | const std::string& name, |
| 35 | ElementDetector&& detector) { |
| 36 | // Maximum number of Elements in a file. This is enforced because our matching |
| 37 | // algorithm is O(n^2), which suffices for regular archive files that should |
| 38 | // have up to 10's of executable files. An archive containing 100's of |
| 39 | // executables is likely pathological, and is rejected to prevent exploits. |
| 40 | static constexpr size_t kElementLimit = 256; |
| 41 | std::vector<Element> elements; |
| 42 | ElementFinder element_finder(image, std::move(detector)); |
| 43 | for (auto element = element_finder.GetNext(); |
| 44 | element.has_value() && elements.size() <= kElementLimit; |
| 45 | element = element_finder.GetNext()) { |
| 46 | elements.push_back(*element); |
| 47 | } |
| 48 | if (elements.size() >= kElementLimit) { |
| 49 | LOG(WARNING) << name << ": Found too many elements."; |
Anton Bikineev | 1156b5f | 2021-05-15 22:35:36 | [diff] [blame] | 50 | return absl::nullopt; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 51 | } |
| 52 | LOG(INFO) << name << ": Found " << elements.size() << " elements."; |
| 53 | return elements; |
| 54 | } |
| 55 | |
| 56 | // Determines whether a proposed comparison between Elements should be rejected |
| 57 | // early, to decrease the likelihood of creating false-positive matches, which |
| 58 | // may be costly for patching. Our heuristic simply prohibits big difference in |
| 59 | // size (relative and absolute) between matched elements. |
| 60 | bool UnsafeDifference(const Element& old_element, const Element& new_element) { |
| 61 | static constexpr double kMaxBloat = 2.0; |
| 62 | static constexpr size_t kMinWorrysomeDifference = 2 << 20; // 2MB |
| 63 | size_t lo_size = std::min(old_element.size, new_element.size); |
| 64 | size_t hi_size = std::max(old_element.size, new_element.size); |
| 65 | if (hi_size - lo_size < kMinWorrysomeDifference) |
| 66 | return false; |
| 67 | if (hi_size < lo_size * kMaxBloat) |
| 68 | return false; |
| 69 | return true; |
| 70 | } |
| 71 | |
| 72 | std::ostream& operator<<(std::ostream& stream, const Element& elt) { |
Calder Kitagawa | da4335f | 2018-04-24 13:54:46 | [diff] [blame] | 73 | stream << "(" << CastExecutableTypeToString(elt.exe_type) << ", " |
| 74 | << AsHex<8, size_t>(elt.offset) << " +" << AsHex<8, size_t>(elt.size) |
| 75 | << ")"; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 76 | return stream; |
| 77 | } |
| 78 | |
| 79 | /******** MatchingInfoOut ********/ |
| 80 | |
| 81 | // A class to output detailed information during ensemble matching. Extracting |
| 82 | // the functionality to a separate class decouples formatting and printing logic |
| 83 | // from matching logic. The base class consists of stubs. |
| 84 | class MatchingInfoOut { |
| 85 | protected: |
| 86 | MatchingInfoOut() = default; |
Samuel Huang | ba0e1f5 | 2021-08-13 15:42:26 | [diff] [blame] | 87 | MatchingInfoOut(const MatchingInfoOut&) = delete; |
| 88 | const MatchingInfoOut& operator=(const MatchingInfoOut&) = delete; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 89 | |
| 90 | public: |
| 91 | virtual ~MatchingInfoOut() = default; |
| 92 | virtual void InitSizes(size_t old_size, size_t new_size) {} |
| 93 | virtual void DeclareTypeMismatch(int iold, int inew) {} |
| 94 | virtual void DeclareUnsafeDistance(int iold, int inew) {} |
| 95 | virtual void DeclareCandidate(int iold, int inew) {} |
| 96 | virtual void DeclareMatch(int iold, |
| 97 | int inew, |
| 98 | double dist, |
| 99 | bool is_identical) {} |
| 100 | virtual void DeclareOutlier(int iold, int inew) {} |
| 101 | |
| 102 | virtual void OutputCompare(const Element& old_element, |
| 103 | const Element& new_element, |
| 104 | double dist) {} |
| 105 | |
| 106 | virtual void OutputMatch(const Element& best_old_element, |
| 107 | const Element& new_element, |
| 108 | bool is_identical, |
| 109 | double best_dist) {} |
| 110 | |
| 111 | virtual void OutputScores(const std::string& stats) {} |
| 112 | |
| 113 | virtual void OutputTextGrid() {} |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 114 | }; |
| 115 | |
/******** MatchingInfoOutTerse ********/
| 117 | |
| 118 | // A terse MatchingInfoOut that prints only basic information, using LOG(). |
| 119 | class MatchingInfoOutTerse : public MatchingInfoOut { |
| 120 | public: |
| 121 | MatchingInfoOutTerse() = default; |
Samuel Huang | ba0e1f5 | 2021-08-13 15:42:26 | [diff] [blame] | 122 | MatchingInfoOutTerse(const MatchingInfoOutTerse&) = delete; |
| 123 | const MatchingInfoOutTerse& operator=(const MatchingInfoOutTerse&) = delete; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 124 | ~MatchingInfoOutTerse() override = default; |
| 125 | |
| 126 | void OutputScores(const std::string& stats) override { |
| 127 | LOG(INFO) << "Best dists: " << stats; |
| 128 | } |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 129 | }; |
| 130 | |
| 131 | /******** MatchingInfoOutVerbose ********/ |
| 132 | |
| 133 | // A verbose MatchingInfoOut that prints detailed information using |out_|, |
| 134 | // including comparison pairs, scores, and a text grid representation of |
| 135 | // pairwise matching results. |
| 136 | class MatchingInfoOutVerbose : public MatchingInfoOut { |
| 137 | public: |
| 138 | explicit MatchingInfoOutVerbose(std::ostream& out) : out_(out) {} |
Samuel Huang | ba0e1f5 | 2021-08-13 15:42:26 | [diff] [blame] | 139 | MatchingInfoOutVerbose(const MatchingInfoOutVerbose&) = delete; |
| 140 | const MatchingInfoOutVerbose& operator=(const MatchingInfoOutVerbose&) = |
| 141 | delete; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 142 | ~MatchingInfoOutVerbose() override = default; |
| 143 | |
| 144 | // Outputs sizes and initializes |text_grid_|. |
| 145 | void InitSizes(size_t old_size, size_t new_size) override { |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 146 | *out_ << "Comparing old (" << old_size << " elements) and new (" << new_size |
| 147 | << " elements)" << std::endl; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 148 | text_grid_.assign(new_size, std::string(old_size, '-')); |
| 149 | best_dist_.assign(new_size, -1.0); |
| 150 | } |
| 151 | |
| 152 | // Functions to update match status in text grid representation. |
| 153 | |
| 154 | void DeclareTypeMismatch(int iold, int inew) override { |
| 155 | text_grid_[inew][iold] = 'T'; |
| 156 | } |
| 157 | void DeclareUnsafeDistance(int iold, int inew) override { |
| 158 | text_grid_[inew][iold] = 'U'; |
| 159 | } |
| 160 | void DeclareCandidate(int iold, int inew) override { |
| 161 | text_grid_[inew][iold] = 'C'; // Provisional. |
| 162 | } |
| 163 | void DeclareMatch(int iold, |
| 164 | int inew, |
| 165 | double dist, |
| 166 | bool is_identical) override { |
| 167 | text_grid_[inew][iold] = is_identical ? 'I' : 'M'; |
| 168 | best_dist_[inew] = dist; |
| 169 | } |
| 170 | void DeclareOutlier(int iold, int inew) override { |
| 171 | text_grid_[inew][iold] = 'O'; |
| 172 | } |
| 173 | |
| 174 | // Functions to print detailed information. |
| 175 | |
| 176 | void OutputCompare(const Element& old_element, |
| 177 | const Element& new_element, |
| 178 | double dist) override { |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 179 | *out_ << "Compare old" << old_element << " to new" << new_element << " --> " |
| 180 | << base::StringPrintf("%.5f", dist) << std::endl; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 181 | } |
| 182 | |
| 183 | void OutputMatch(const Element& best_old_element, |
| 184 | const Element& new_element, |
| 185 | bool is_identical, |
| 186 | double best_dist) override { |
| 187 | if (is_identical) { |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 188 | *out_ << "Skipped old" << best_old_element << " - identical to new" |
| 189 | << new_element; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 190 | } else { |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 191 | *out_ << "Matched old" << best_old_element << " to new" << new_element |
| 192 | << " --> " << base::StringPrintf("%.5f", best_dist); |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 193 | } |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 194 | *out_ << std::endl; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 195 | } |
| 196 | |
| 197 | void OutputScores(const std::string& stats) override { |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 198 | *out_ << "Best dists: " << stats << std::endl; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 199 | } |
| 200 | |
| 201 | void OutputTextGrid() override { |
| 202 | int new_size = static_cast<int>(text_grid_.size()); |
| 203 | for (int inew = 0; inew < new_size; ++inew) { |
| 204 | const std::string& line = text_grid_[inew]; |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 205 | *out_ << " "; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 206 | for (char ch : line) { |
| 207 | char prefix = (ch == 'I' || ch == 'M') ? '(' : ' '; |
| 208 | char suffix = (ch == 'I' || ch == 'M') ? ')' : ' '; |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 209 | *out_ << prefix << ch << suffix; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 210 | } |
| 211 | if (best_dist_[inew] >= 0) |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 212 | *out_ << " " << base::StringPrintf("%.5f", best_dist_[inew]); |
| 213 | *out_ << std::endl; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 214 | } |
| 215 | if (!text_grid_.empty()) { |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 216 | *out_ << " Legend: I = identical, M = matched, T = type mismatch, " |
| 217 | "U = unsafe distance, C = candidate, O = outlier, - = skipped." |
| 218 | << std::endl; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 219 | } |
| 220 | } |
| 221 | |
| 222 | private: |
Ali Hijazi | a709b48b | 2022-11-09 01:27:44 | [diff] [blame] | 223 | const raw_ref<std::ostream> out_; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 224 | |
| 225 | // Text grid representation of matches. Rows correspond to "old" and columns |
| 226 | // correspond to "new". |
| 227 | std::vector<std::string> text_grid_; |
| 228 | |
| 229 | // For each "new" element, distance of best match. -1 denotes no match. |
| 230 | std::vector<double> best_dist_; |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 231 | }; |
| 232 | |
| 233 | } // namespace |
| 234 | |
| 235 | /******** HeuristicEnsembleMatcher ********/ |
| 236 | |
// Stores |out| without dereferencing it. RunMatch() emits verbose matching
// details to |*out| when it is non-null, and terse LOG() output otherwise.
HeuristicEnsembleMatcher::HeuristicEnsembleMatcher(std::ostream* out)
    : out_(out) {}

// Defaulted; no explicit cleanup is performed here.
HeuristicEnsembleMatcher::~HeuristicEnsembleMatcher() = default;
| 241 | |
| 242 | bool HeuristicEnsembleMatcher::RunMatch(ConstBufferView old_image, |
| 243 | ConstBufferView new_image) { |
| 244 | DCHECK(matches_.empty()); |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 245 | LOG(INFO) << "Start matching."; |
| 246 | |
| 247 | // Find all elements in "old" and "new". |
Anton Bikineev | 1156b5f | 2021-05-15 22:35:36 | [diff] [blame] | 248 | absl::optional<std::vector<Element>> old_elements = |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 249 | FindEmbeddedElements(old_image, "Old file", |
| 250 | base::BindRepeating(DetectElementFromDisassembler)); |
| 251 | if (!old_elements.has_value()) |
| 252 | return false; |
Anton Bikineev | 1156b5f | 2021-05-15 22:35:36 | [diff] [blame] | 253 | absl::optional<std::vector<Element>> new_elements = |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 254 | FindEmbeddedElements(new_image, "New file", |
| 255 | base::BindRepeating(DetectElementFromDisassembler)); |
| 256 | if (!new_elements.has_value()) |
| 257 | return false; |
| 258 | |
| 259 | std::unique_ptr<MatchingInfoOut> info_out; |
| 260 | if (out_) |
| 261 | info_out = std::make_unique<MatchingInfoOutVerbose>(*out_); |
| 262 | else |
| 263 | info_out = std::make_unique<MatchingInfoOutTerse>(); |
| 264 | |
| 265 | const int num_new_elements = base::checked_cast<int>(new_elements->size()); |
| 266 | const int num_old_elements = base::checked_cast<int>(old_elements->size()); |
| 267 | info_out->InitSizes(num_old_elements, num_new_elements); |
| 268 | |
| 269 | // For each "new" element, match it with the "old" element that's nearest to |
| 270 | // it, with distance determined by BinaryDataHistogram. The resulting |
| 271 | // "old"-"new" pairs are stored into |results|. Possibilities: |
| 272 | // - Type mismatch: No match. |
| 273 | // - UnsafeDifference() heuristics fail: No match. |
| 274 | // - Identical match: Skip "new" since this is a trivial case. |
| 275 | // - Non-identical match: Match "new" with "old" with min distance. |
| 276 | // - No match: Skip "new". |
| 277 | struct Results { |
| 278 | int iold; |
| 279 | int inew; |
| 280 | double dist; |
| 281 | }; |
| 282 | std::vector<Results> results; |
| 283 | |
| 284 | // Precompute histograms for "old" since they get reused. |
| 285 | std::vector<BinaryDataHistogram> old_his(num_old_elements); |
| 286 | for (int iold = 0; iold < num_old_elements; ++iold) { |
| 287 | ConstBufferView sub_image(old_image[(*old_elements)[iold]]); |
| 288 | old_his[iold].Compute(sub_image); |
| 289 | // ProgramDetector should have imposed minimal size limit to |sub_image|. |
| 290 | // Therefore resulting histogram are expected to be valid. |
| 291 | CHECK(old_his[iold].IsValid()); |
| 292 | } |
| 293 | |
| 294 | const int kUninitIold = num_old_elements; |
| 295 | for (int inew = 0; inew < num_new_elements; ++inew) { |
| 296 | const Element& cur_new_element = (*new_elements)[inew]; |
| 297 | ConstBufferView cur_new_sub_image(new_image[cur_new_element.region()]); |
| 298 | BinaryDataHistogram new_his; |
| 299 | new_his.Compute(cur_new_sub_image); |
| 300 | CHECK(new_his.IsValid()); |
| 301 | |
| 302 | double best_dist = HUGE_VAL; |
| 303 | int best_iold = kUninitIold; |
| 304 | bool is_identical = false; |
| 305 | |
| 306 | for (int iold = 0; iold < num_old_elements; ++iold) { |
| 307 | const Element& cur_old_element = (*old_elements)[iold]; |
| 308 | if (cur_old_element.exe_type != cur_new_element.exe_type) { |
| 309 | info_out->DeclareTypeMismatch(iold, inew); |
| 310 | continue; |
| 311 | } |
| 312 | if (UnsafeDifference(cur_old_element, cur_new_element)) { |
| 313 | info_out->DeclareUnsafeDistance(iold, inew); |
| 314 | continue; |
| 315 | } |
| 316 | double dist = old_his[iold].Distance(new_his); |
| 317 | info_out->DeclareCandidate(iold, inew); |
| 318 | info_out->OutputCompare(cur_old_element, cur_new_element, dist); |
| 319 | if (best_dist > dist) { // Tie resolution: First-one, first-serve. |
| 320 | best_iold = iold; |
| 321 | best_dist = dist; |
| 322 | if (best_dist == 0) { |
| 323 | ConstBufferView sub_image(old_image[cur_old_element.region()]); |
| 324 | if (sub_image.equals(cur_new_sub_image)) { |
| 325 | is_identical = true; |
| 326 | break; |
| 327 | } |
| 328 | } |
| 329 | } |
| 330 | } |
| 331 | |
| 332 | if (best_iold != kUninitIold) { |
| 333 | const Element& best_old_element = (*old_elements)[best_iold]; |
| 334 | info_out->DeclareMatch(best_iold, inew, best_dist, is_identical); |
| 335 | if (is_identical) // Skip "new" if identical match is found. |
| 336 | ++num_identical_; |
| 337 | else |
| 338 | results.push_back({best_iold, inew, best_dist}); |
| 339 | info_out->OutputMatch(best_old_element, cur_new_element, is_identical, |
| 340 | best_dist); |
| 341 | } |
| 342 | } |
| 343 | |
| 344 | // Populate |matches_| from |result|. To reduce that chance of false-positive |
| 345 | // matches, statistics on dists are computed. If a match's |dist| is an |
| 346 | // outlier then it is rejected. |
| 347 | if (results.size() > 0) { |
| 348 | OutlierDetector detector; |
| 349 | for (const auto& result : results) { |
| 350 | if (result.dist > 0) |
| 351 | detector.Add(result.dist); |
| 352 | } |
| 353 | detector.Prepare(); |
| 354 | info_out->OutputScores(detector.RenderStats()); |
| 355 | for (const Results& result : results) { |
| 356 | if (detector.DecideOutlier(result.dist) > 0) { |
| 357 | info_out->DeclareOutlier(result.iold, result.inew); |
| 358 | } else { |
| 359 | matches_.push_back( |
| 360 | {(*old_elements)[result.iold], (*new_elements)[result.inew]}); |
| 361 | } |
| 362 | } |
| 363 | info_out->OutputTextGrid(); |
| 364 | } |
| 365 | |
| 366 | Trim(); |
Samuel Huang | fdb2f3a | 2017-12-20 17:45:14 | [diff] [blame] | 367 | return true; |
| 368 | } |
| 369 | |
| 370 | } // namespace zucchini |