// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
| 4 | |
| 5 | #ifndef COMPONENTS_FEEDBACK_ANONYMIZER_TOOL_H_ |
| 6 | #define COMPONENTS_FEEDBACK_ANONYMIZER_TOOL_H_ |
| 7 | |
| 8 | #include <map> |
dcheng | 84c358e | 2016-04-26 07:05:53 | [diff] [blame] | 9 | #include <memory> |
battre | 4cdaa7c | 2016-01-07 11:30:27 | [diff] [blame] | 10 | #include <string> |
| 11 | #include <vector> |
| 12 | |
battre | 03910b4 | 2016-01-11 13:42:34 | [diff] [blame] | 13 | #include "base/macros.h" |
Dominic Battre | f091addfc | 2017-12-07 15:45:34 | [diff] [blame] | 14 | #include "base/memory/ref_counted.h" |
| 15 | #include "base/sequence_checker.h" |
| 16 | #include "base/sequenced_task_runner.h" |
battre | 03910b4 | 2016-01-11 13:42:34 | [diff] [blame] | 17 | |
// Forward declaration so that this header can name re2::RE2 in pointer and
// std::unique_ptr members without including the RE2 header itself.
namespace re2 {
class RE2;
}  // namespace re2
battre | 4cdaa7c | 2016-01-07 11:30:27 | [diff] [blame] | 21 | |
| 22 | namespace feedback { |
| 23 | |
// Pairs a regular expression with the alias that is used to label its matches
// in anonymized output.
struct CustomPatternWithoutContext {
  // A string literal used in anonymized tests. Matches to the |pattern| are
  // replaced with <|alias|: 1>, <|alias|: 2>, ...
  const char* alias;
  // A RE2 regexp with exactly one capture group. Matches will be replaced by
  // the alias reference described above.
  const char* pattern;
};
| 32 | |
battre | 4cdaa7c | 2016-01-07 11:30:27 | [diff] [blame] | 33 | class AnonymizerTool { |
| 34 | public: |
| 35 | AnonymizerTool(); |
| 36 | ~AnonymizerTool(); |
| 37 | |
| 38 | // Returns an anonymized version of |input|. PII-sensitive data (such as MAC |
| 39 | // addresses) in |input| is replaced with unique identifiers. |
Dominic Battre | f091addfc | 2017-12-07 15:45:34 | [diff] [blame] | 40 | // This is an expensive operation. Make sure not to execute this on the UI |
| 41 | // thread. |
battre | 4cdaa7c | 2016-01-07 11:30:27 | [diff] [blame] | 42 | std::string Anonymize(const std::string& input); |
| 43 | |
| 44 | private: |
| 45 | friend class AnonymizerToolTest; |
| 46 | |
battre | 03910b4 | 2016-01-11 13:42:34 | [diff] [blame] | 47 | re2::RE2* GetRegExp(const std::string& pattern); |
| 48 | |
battre | 4cdaa7c | 2016-01-07 11:30:27 | [diff] [blame] | 49 | std::string AnonymizeMACAddresses(const std::string& input); |
| 50 | std::string AnonymizeCustomPatterns(std::string input); |
battre | 03910b4 | 2016-01-11 13:42:34 | [diff] [blame] | 51 | std::string AnonymizeCustomPatternWithContext( |
battre | 4cdaa7c | 2016-01-07 11:30:27 | [diff] [blame] | 52 | const std::string& input, |
| 53 | const std::string& pattern, |
| 54 | std::map<std::string, std::string>* identifier_space); |
battre | 03910b4 | 2016-01-11 13:42:34 | [diff] [blame] | 55 | std::string AnonymizeCustomPatternWithoutContext( |
| 56 | const std::string& input, |
| 57 | const CustomPatternWithoutContext& pattern, |
| 58 | std::map<std::string, std::string>* identifier_space); |
battre | 4cdaa7c | 2016-01-07 11:30:27 | [diff] [blame] | 59 | |
| 60 | // Map of MAC addresses discovered in anonymized strings to anonymized |
| 61 | // representations. 11:22:33:44:55:66 gets anonymized to 11:22:33:00:00:01, |
| 62 | // where the first three bytes represent the manufacturer. The last three |
| 63 | // bytes are used to distinguish different MAC addresses and are incremented |
| 64 | // for each newly discovered MAC address. |
| 65 | std::map<std::string, std::string> mac_addresses_; |
| 66 | |
| 67 | // Like mac addresses, identifiers in custom patterns are anonymized. |
battre | 03910b4 | 2016-01-11 13:42:34 | [diff] [blame] | 68 | // custom_patterns_with_context_[i] contains a map of original identifier to |
| 69 | // anonymized identifier for custom pattern number i. |
| 70 | std::vector<std::map<std::string, std::string>> custom_patterns_with_context_; |
| 71 | std::vector<std::map<std::string, std::string>> |
| 72 | custom_patterns_without_context_; |
| 73 | |
| 74 | // Cache to prevent the repeated compilation of the same regular expression |
| 75 | // pattern. Key is the string representation of the RegEx. |
dcheng | 84c358e | 2016-04-26 07:05:53 | [diff] [blame] | 76 | std::map<std::string, std::unique_ptr<re2::RE2>> regexp_cache_; |
battre | 4cdaa7c | 2016-01-07 11:30:27 | [diff] [blame] | 77 | |
Dominic Battre | f091addfc | 2017-12-07 15:45:34 | [diff] [blame] | 78 | SEQUENCE_CHECKER(sequence_checker_); |
| 79 | |
battre | 4cdaa7c | 2016-01-07 11:30:27 | [diff] [blame] | 80 | DISALLOW_COPY_AND_ASSIGN(AnonymizerTool); |
| 81 | }; |
| 82 | |
Dominic Battre | f091addfc | 2017-12-07 15:45:34 | [diff] [blame] | 83 | // A container for a AnonymizerTool that is thread-safely ref-countable. |
| 84 | // This is useful for a class that wants to post an async anonymization task |
| 85 | // to a background sequence runner and not deal with its own life-cycle ending |
| 86 | // while the AnonymizerTool is busy on another sequence. |
| 87 | class AnonymizerToolContainer |
| 88 | : public base::RefCountedThreadSafe<AnonymizerToolContainer> { |
| 89 | public: |
| 90 | explicit AnonymizerToolContainer( |
| 91 | scoped_refptr<base::SequencedTaskRunner> task_runner); |
| 92 | |
| 93 | // Returns a pointer to the instance of this anonymier. May only be called |
| 94 | // on |task_runner_|. |
| 95 | AnonymizerTool* Get(); |
| 96 | |
| 97 | private: |
| 98 | friend class base::RefCountedThreadSafe<AnonymizerToolContainer>; |
| 99 | ~AnonymizerToolContainer(); |
| 100 | |
| 101 | std::unique_ptr<AnonymizerTool> anonymizer_; |
| 102 | scoped_refptr<base::SequencedTaskRunner> task_runner_; |
| 103 | }; |
| 104 | |
battre | 4cdaa7c | 2016-01-07 11:30:27 | [diff] [blame] | 105 | } // namespace feedback |
| 106 | |
| 107 | #endif // COMPONENTS_FEEDBACK_ANONYMIZER_TOOL_H_ |