blob: b3171b300387121ef7dbc652d847a9444bd473f0 [file] [log] [blame]
// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
Samuel Huang577ef6c2018-03-13 18:19:345#include "components/zucchini/heuristic_ensemble_matcher.h"
Samuel Huangfdb2f3a2017-12-20 17:45:146
7#include <algorithm>
8#include <memory>
9#include <string>
10#include <utility>
11#include <vector>
12
Avi Drissman12be0312023-01-11 09:16:0913#include "base/functional/bind.h"
Hans Wennborg5bafbb92020-06-18 09:13:5714#include "base/logging.h"
Ali Hijazia709b48b2022-11-09 01:27:4415#include "base/memory/raw_ref.h"
Samuel Huangfdb2f3a2017-12-20 17:45:1416#include "base/numerics/safe_conversions.h"
17#include "base/strings/stringprintf.h"
Samuel Huang577ef6c2018-03-13 18:19:3418#include "components/zucchini/binary_data_histogram.h"
19#include "components/zucchini/element_detection.h"
20#include "components/zucchini/image_utils.h"
21#include "components/zucchini/io_utils.h"
Samuel Huangfdb2f3a2017-12-20 17:45:1422
23namespace zucchini {
24
25namespace {
26
/******** Helper Functions ********/
28
29// Uses |detector| to find embedded executables inside |image|, and returns the
Anton Bikineev1156b5f2021-05-15 22:35:3630// result on success, or absl::nullopt on failure, which occurs if too many (>
Samuel Huangfdb2f3a2017-12-20 17:45:1431// |kElementLimit|) elements are found.
Anton Bikineev1156b5f2021-05-15 22:35:3632absl::optional<std::vector<Element>> FindEmbeddedElements(
Samuel Huangfdb2f3a2017-12-20 17:45:1433 ConstBufferView image,
34 const std::string& name,
35 ElementDetector&& detector) {
36 // Maximum number of Elements in a file. This is enforced because our matching
37 // algorithm is O(n^2), which suffices for regular archive files that should
38 // have up to 10's of executable files. An archive containing 100's of
39 // executables is likely pathological, and is rejected to prevent exploits.
40 static constexpr size_t kElementLimit = 256;
41 std::vector<Element> elements;
42 ElementFinder element_finder(image, std::move(detector));
43 for (auto element = element_finder.GetNext();
44 element.has_value() && elements.size() <= kElementLimit;
45 element = element_finder.GetNext()) {
46 elements.push_back(*element);
47 }
48 if (elements.size() >= kElementLimit) {
49 LOG(WARNING) << name << ": Found too many elements.";
Anton Bikineev1156b5f2021-05-15 22:35:3650 return absl::nullopt;
Samuel Huangfdb2f3a2017-12-20 17:45:1451 }
52 LOG(INFO) << name << ": Found " << elements.size() << " elements.";
53 return elements;
54}
55
56// Determines whether a proposed comparison between Elements should be rejected
57// early, to decrease the likelihood of creating false-positive matches, which
58// may be costly for patching. Our heuristic simply prohibits big difference in
59// size (relative and absolute) between matched elements.
60bool UnsafeDifference(const Element& old_element, const Element& new_element) {
61 static constexpr double kMaxBloat = 2.0;
62 static constexpr size_t kMinWorrysomeDifference = 2 << 20; // 2MB
63 size_t lo_size = std::min(old_element.size, new_element.size);
64 size_t hi_size = std::max(old_element.size, new_element.size);
65 if (hi_size - lo_size < kMinWorrysomeDifference)
66 return false;
67 if (hi_size < lo_size * kMaxBloat)
68 return false;
69 return true;
70}
71
72std::ostream& operator<<(std::ostream& stream, const Element& elt) {
Calder Kitagawada4335f2018-04-24 13:54:4673 stream << "(" << CastExecutableTypeToString(elt.exe_type) << ", "
74 << AsHex<8, size_t>(elt.offset) << " +" << AsHex<8, size_t>(elt.size)
75 << ")";
Samuel Huangfdb2f3a2017-12-20 17:45:1476 return stream;
77}
78
/******** MatchingInfoOut ********/
80
81// A class to output detailed information during ensemble matching. Extracting
82// the functionality to a separate class decouples formatting and printing logic
83// from matching logic. The base class consists of stubs.
84class MatchingInfoOut {
85 protected:
86 MatchingInfoOut() = default;
Samuel Huangba0e1f52021-08-13 15:42:2687 MatchingInfoOut(const MatchingInfoOut&) = delete;
88 const MatchingInfoOut& operator=(const MatchingInfoOut&) = delete;
Samuel Huangfdb2f3a2017-12-20 17:45:1489
90 public:
91 virtual ~MatchingInfoOut() = default;
92 virtual void InitSizes(size_t old_size, size_t new_size) {}
93 virtual void DeclareTypeMismatch(int iold, int inew) {}
94 virtual void DeclareUnsafeDistance(int iold, int inew) {}
95 virtual void DeclareCandidate(int iold, int inew) {}
96 virtual void DeclareMatch(int iold,
97 int inew,
98 double dist,
99 bool is_identical) {}
100 virtual void DeclareOutlier(int iold, int inew) {}
101
102 virtual void OutputCompare(const Element& old_element,
103 const Element& new_element,
104 double dist) {}
105
106 virtual void OutputMatch(const Element& best_old_element,
107 const Element& new_element,
108 bool is_identical,
109 double best_dist) {}
110
111 virtual void OutputScores(const std::string& stats) {}
112
113 virtual void OutputTextGrid() {}
Samuel Huangfdb2f3a2017-12-20 17:45:14114};
115
/******** MatchingInfoOutTerse ********/
117
118// A terse MatchingInfoOut that prints only basic information, using LOG().
119class MatchingInfoOutTerse : public MatchingInfoOut {
120 public:
121 MatchingInfoOutTerse() = default;
Samuel Huangba0e1f52021-08-13 15:42:26122 MatchingInfoOutTerse(const MatchingInfoOutTerse&) = delete;
123 const MatchingInfoOutTerse& operator=(const MatchingInfoOutTerse&) = delete;
Samuel Huangfdb2f3a2017-12-20 17:45:14124 ~MatchingInfoOutTerse() override = default;
125
126 void OutputScores(const std::string& stats) override {
127 LOG(INFO) << "Best dists: " << stats;
128 }
Samuel Huangfdb2f3a2017-12-20 17:45:14129};
130
/******** MatchingInfoOutVerbose ********/
132
133// A verbose MatchingInfoOut that prints detailed information using |out_|,
134// including comparison pairs, scores, and a text grid representation of
135// pairwise matching results.
136class MatchingInfoOutVerbose : public MatchingInfoOut {
137 public:
138 explicit MatchingInfoOutVerbose(std::ostream& out) : out_(out) {}
Samuel Huangba0e1f52021-08-13 15:42:26139 MatchingInfoOutVerbose(const MatchingInfoOutVerbose&) = delete;
140 const MatchingInfoOutVerbose& operator=(const MatchingInfoOutVerbose&) =
141 delete;
Samuel Huangfdb2f3a2017-12-20 17:45:14142 ~MatchingInfoOutVerbose() override = default;
143
144 // Outputs sizes and initializes |text_grid_|.
145 void InitSizes(size_t old_size, size_t new_size) override {
Ali Hijazia709b48b2022-11-09 01:27:44146 *out_ << "Comparing old (" << old_size << " elements) and new (" << new_size
147 << " elements)" << std::endl;
Samuel Huangfdb2f3a2017-12-20 17:45:14148 text_grid_.assign(new_size, std::string(old_size, '-'));
149 best_dist_.assign(new_size, -1.0);
150 }
151
152 // Functions to update match status in text grid representation.
153
154 void DeclareTypeMismatch(int iold, int inew) override {
155 text_grid_[inew][iold] = 'T';
156 }
157 void DeclareUnsafeDistance(int iold, int inew) override {
158 text_grid_[inew][iold] = 'U';
159 }
160 void DeclareCandidate(int iold, int inew) override {
161 text_grid_[inew][iold] = 'C'; // Provisional.
162 }
163 void DeclareMatch(int iold,
164 int inew,
165 double dist,
166 bool is_identical) override {
167 text_grid_[inew][iold] = is_identical ? 'I' : 'M';
168 best_dist_[inew] = dist;
169 }
170 void DeclareOutlier(int iold, int inew) override {
171 text_grid_[inew][iold] = 'O';
172 }
173
174 // Functions to print detailed information.
175
176 void OutputCompare(const Element& old_element,
177 const Element& new_element,
178 double dist) override {
Ali Hijazia709b48b2022-11-09 01:27:44179 *out_ << "Compare old" << old_element << " to new" << new_element << " --> "
180 << base::StringPrintf("%.5f", dist) << std::endl;
Samuel Huangfdb2f3a2017-12-20 17:45:14181 }
182
183 void OutputMatch(const Element& best_old_element,
184 const Element& new_element,
185 bool is_identical,
186 double best_dist) override {
187 if (is_identical) {
Ali Hijazia709b48b2022-11-09 01:27:44188 *out_ << "Skipped old" << best_old_element << " - identical to new"
189 << new_element;
Samuel Huangfdb2f3a2017-12-20 17:45:14190 } else {
Ali Hijazia709b48b2022-11-09 01:27:44191 *out_ << "Matched old" << best_old_element << " to new" << new_element
192 << " --> " << base::StringPrintf("%.5f", best_dist);
Samuel Huangfdb2f3a2017-12-20 17:45:14193 }
Ali Hijazia709b48b2022-11-09 01:27:44194 *out_ << std::endl;
Samuel Huangfdb2f3a2017-12-20 17:45:14195 }
196
197 void OutputScores(const std::string& stats) override {
Ali Hijazia709b48b2022-11-09 01:27:44198 *out_ << "Best dists: " << stats << std::endl;
Samuel Huangfdb2f3a2017-12-20 17:45:14199 }
200
201 void OutputTextGrid() override {
202 int new_size = static_cast<int>(text_grid_.size());
203 for (int inew = 0; inew < new_size; ++inew) {
204 const std::string& line = text_grid_[inew];
Ali Hijazia709b48b2022-11-09 01:27:44205 *out_ << " ";
Samuel Huangfdb2f3a2017-12-20 17:45:14206 for (char ch : line) {
207 char prefix = (ch == 'I' || ch == 'M') ? '(' : ' ';
208 char suffix = (ch == 'I' || ch == 'M') ? ')' : ' ';
Ali Hijazia709b48b2022-11-09 01:27:44209 *out_ << prefix << ch << suffix;
Samuel Huangfdb2f3a2017-12-20 17:45:14210 }
211 if (best_dist_[inew] >= 0)
Ali Hijazia709b48b2022-11-09 01:27:44212 *out_ << " " << base::StringPrintf("%.5f", best_dist_[inew]);
213 *out_ << std::endl;
Samuel Huangfdb2f3a2017-12-20 17:45:14214 }
215 if (!text_grid_.empty()) {
Ali Hijazia709b48b2022-11-09 01:27:44216 *out_ << " Legend: I = identical, M = matched, T = type mismatch, "
217 "U = unsafe distance, C = candidate, O = outlier, - = skipped."
218 << std::endl;
Samuel Huangfdb2f3a2017-12-20 17:45:14219 }
220 }
221
222 private:
Ali Hijazia709b48b2022-11-09 01:27:44223 const raw_ref<std::ostream> out_;
Samuel Huangfdb2f3a2017-12-20 17:45:14224
225 // Text grid representation of matches. Rows correspond to "old" and columns
226 // correspond to "new".
227 std::vector<std::string> text_grid_;
228
229 // For each "new" element, distance of best match. -1 denotes no match.
230 std::vector<double> best_dist_;
Samuel Huangfdb2f3a2017-12-20 17:45:14231};
232
233} // namespace
234
/******** HeuristicEnsembleMatcher ********/
236
237HeuristicEnsembleMatcher::HeuristicEnsembleMatcher(std::ostream* out)
238 : out_(out) {}
239
240HeuristicEnsembleMatcher::~HeuristicEnsembleMatcher() = default;
241
242bool HeuristicEnsembleMatcher::RunMatch(ConstBufferView old_image,
243 ConstBufferView new_image) {
244 DCHECK(matches_.empty());
Samuel Huangfdb2f3a2017-12-20 17:45:14245 LOG(INFO) << "Start matching.";
246
247 // Find all elements in "old" and "new".
Anton Bikineev1156b5f2021-05-15 22:35:36248 absl::optional<std::vector<Element>> old_elements =
Samuel Huangfdb2f3a2017-12-20 17:45:14249 FindEmbeddedElements(old_image, "Old file",
250 base::BindRepeating(DetectElementFromDisassembler));
251 if (!old_elements.has_value())
252 return false;
Anton Bikineev1156b5f2021-05-15 22:35:36253 absl::optional<std::vector<Element>> new_elements =
Samuel Huangfdb2f3a2017-12-20 17:45:14254 FindEmbeddedElements(new_image, "New file",
255 base::BindRepeating(DetectElementFromDisassembler));
256 if (!new_elements.has_value())
257 return false;
258
259 std::unique_ptr<MatchingInfoOut> info_out;
260 if (out_)
261 info_out = std::make_unique<MatchingInfoOutVerbose>(*out_);
262 else
263 info_out = std::make_unique<MatchingInfoOutTerse>();
264
265 const int num_new_elements = base::checked_cast<int>(new_elements->size());
266 const int num_old_elements = base::checked_cast<int>(old_elements->size());
267 info_out->InitSizes(num_old_elements, num_new_elements);
268
269 // For each "new" element, match it with the "old" element that's nearest to
270 // it, with distance determined by BinaryDataHistogram. The resulting
271 // "old"-"new" pairs are stored into |results|. Possibilities:
272 // - Type mismatch: No match.
273 // - UnsafeDifference() heuristics fail: No match.
274 // - Identical match: Skip "new" since this is a trivial case.
275 // - Non-identical match: Match "new" with "old" with min distance.
276 // - No match: Skip "new".
277 struct Results {
278 int iold;
279 int inew;
280 double dist;
281 };
282 std::vector<Results> results;
283
284 // Precompute histograms for "old" since they get reused.
285 std::vector<BinaryDataHistogram> old_his(num_old_elements);
286 for (int iold = 0; iold < num_old_elements; ++iold) {
287 ConstBufferView sub_image(old_image[(*old_elements)[iold]]);
288 old_his[iold].Compute(sub_image);
289 // ProgramDetector should have imposed minimal size limit to |sub_image|.
290 // Therefore resulting histogram are expected to be valid.
291 CHECK(old_his[iold].IsValid());
292 }
293
294 const int kUninitIold = num_old_elements;
295 for (int inew = 0; inew < num_new_elements; ++inew) {
296 const Element& cur_new_element = (*new_elements)[inew];
297 ConstBufferView cur_new_sub_image(new_image[cur_new_element.region()]);
298 BinaryDataHistogram new_his;
299 new_his.Compute(cur_new_sub_image);
300 CHECK(new_his.IsValid());
301
302 double best_dist = HUGE_VAL;
303 int best_iold = kUninitIold;
304 bool is_identical = false;
305
306 for (int iold = 0; iold < num_old_elements; ++iold) {
307 const Element& cur_old_element = (*old_elements)[iold];
308 if (cur_old_element.exe_type != cur_new_element.exe_type) {
309 info_out->DeclareTypeMismatch(iold, inew);
310 continue;
311 }
312 if (UnsafeDifference(cur_old_element, cur_new_element)) {
313 info_out->DeclareUnsafeDistance(iold, inew);
314 continue;
315 }
316 double dist = old_his[iold].Distance(new_his);
317 info_out->DeclareCandidate(iold, inew);
318 info_out->OutputCompare(cur_old_element, cur_new_element, dist);
319 if (best_dist > dist) { // Tie resolution: First-one, first-serve.
320 best_iold = iold;
321 best_dist = dist;
322 if (best_dist == 0) {
323 ConstBufferView sub_image(old_image[cur_old_element.region()]);
324 if (sub_image.equals(cur_new_sub_image)) {
325 is_identical = true;
326 break;
327 }
328 }
329 }
330 }
331
332 if (best_iold != kUninitIold) {
333 const Element& best_old_element = (*old_elements)[best_iold];
334 info_out->DeclareMatch(best_iold, inew, best_dist, is_identical);
335 if (is_identical) // Skip "new" if identical match is found.
336 ++num_identical_;
337 else
338 results.push_back({best_iold, inew, best_dist});
339 info_out->OutputMatch(best_old_element, cur_new_element, is_identical,
340 best_dist);
341 }
342 }
343
344 // Populate |matches_| from |result|. To reduce that chance of false-positive
345 // matches, statistics on dists are computed. If a match's |dist| is an
346 // outlier then it is rejected.
347 if (results.size() > 0) {
348 OutlierDetector detector;
349 for (const auto& result : results) {
350 if (result.dist > 0)
351 detector.Add(result.dist);
352 }
353 detector.Prepare();
354 info_out->OutputScores(detector.RenderStats());
355 for (const Results& result : results) {
356 if (detector.DecideOutlier(result.dist) > 0) {
357 info_out->DeclareOutlier(result.iold, result.inew);
358 } else {
359 matches_.push_back(
360 {(*old_elements)[result.iold], (*new_elements)[result.inew]});
361 }
362 }
363 info_out->OutputTextGrid();
364 }
365
366 Trim();
Samuel Huangfdb2f3a2017-12-20 17:45:14367 return true;
368}
369
370} // namespace zucchini