agrieve | 142e275 | 2016-09-12 14:36:21 | [diff] [blame] | 1 | #!/usr/bin/env python |
smaier | b6dc58c | 2016-06-13 22:14:44 | [diff] [blame] | 2 | # Copyright 2016 The Chromium Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | """Tool to diff 2 dex files that have been proguarded. |
| 6 | |
| 7 | To use this tool, first get dextra. http://newandroidbook.com/tools/dextra.html |
agrieve | 07bf88a | 2016-09-30 00:13:45 | [diff] [blame] | 8 | Then invoke script like: |
| 9 | |
| 10 | PATH=$PATH:/path/to/dextra dexdiffer.py --old classes1.dex --new classes2.dex |
| 11 | |
| 12 | apk files may be used as well. |
smaier | b6dc58c | 2016-06-13 22:14:44 | [diff] [blame] | 13 | """ |
| 14 | |
| 15 | import argparse |
agrieve | 07bf88a | 2016-09-30 00:13:45 | [diff] [blame] | 16 | import errno |
smaier | b6dc58c | 2016-06-13 22:14:44 | [diff] [blame] | 17 | import re |
agrieve | 07bf88a | 2016-09-30 00:13:45 | [diff] [blame] | 18 | import subprocess |
smaier | b6dc58c | 2016-06-13 22:14:44 | [diff] [blame] | 19 | import sys |
agrieve | 07bf88a | 2016-09-30 00:13:45 | [diff] [blame] | 20 | import tempfile |
| 21 | import zipfile |
smaier | b6dc58c | 2016-06-13 22:14:44 | [diff] [blame] | 22 | |
| 23 | |
| 24 | _QUALIFIERS = set(['public', 'protected', 'private', 'final', 'static', |
| 25 | 'abstract', 'volatile', 'native', 'enum']) |
| 26 | |
| 27 | |
| 28 | def _IsNewClass(line): |
| 29 | return line.endswith(':') |
| 30 | |
| 31 | |
| 32 | # Expects lines like one of these 3: |
| 33 | # 'android.support.v8.MenuPopupHelper -> android.support.v8.v:' |
| 34 | # ' android.view.LayoutInflater mInflater -> d' |
| 35 | # ' 117:118:void setForceShowIcon(boolean) -> b' |
| 36 | # Those three examples would return |
| 37 | # 'android.support.v8.MenuPopupHelper', 'android.support.v8.v' |
| 38 | # 'android.view.LayoutInflater mInflater', 'android.view.LayoutInflater d' |
| 39 | # 'void setForceShowIcon(boolean)', 'void b(boolean)' |
| 40 | def _ParseMappingLine(line): |
| 41 | line = line.rstrip(':') |
| 42 | |
| 43 | # Stripping any line number denotations |
| 44 | line = re.sub(r'\d+:\d+:', '', line) |
| 45 | line = re.sub(r'\):\d+', ')', line) |
| 46 | |
| 47 | original_name, new_name = line.split(' -> ') |
| 48 | |
| 49 | type_string = '' |
| 50 | if ' ' in original_name: |
| 51 | type_string = original_name[:original_name.find(' ') + 1] |
| 52 | |
| 53 | arguments_string = '' |
| 54 | match = re.search(r'(\(.*?\))', original_name) |
| 55 | if match: |
| 56 | arguments_string = match.group(1) |
| 57 | |
| 58 | return original_name, type_string + new_name + arguments_string |
| 59 | |
| 60 | |
| 61 | def _ReadMappingDict(mapping_file): |
| 62 | mapping = {} |
| 63 | renamed_class_name = '' |
| 64 | original_class_name = '' |
smaier | 58d01504 | 2016-08-05 21:18:16 | [diff] [blame] | 65 | current_entry = [] |
smaier | b6dc58c | 2016-06-13 22:14:44 | [diff] [blame] | 66 | for line in mapping_file: |
| 67 | line = line.strip() |
| 68 | if _IsNewClass(line): |
| 69 | if renamed_class_name: |
| 70 | mapping[renamed_class_name] = current_entry |
| 71 | |
| 72 | member_mappings = {} |
| 73 | original_class_name, renamed_class_name = _ParseMappingLine(line) |
| 74 | current_entry = [original_class_name, member_mappings] |
| 75 | else: |
| 76 | original_member_name, renamed_member_name = _ParseMappingLine(line) |
| 77 | member_mappings[renamed_member_name] = original_member_name |
| 78 | |
smaier | 58d01504 | 2016-08-05 21:18:16 | [diff] [blame] | 79 | if current_entry and renamed_class_name: |
| 80 | mapping[renamed_class_name] = current_entry |
smaier | b6dc58c | 2016-06-13 22:14:44 | [diff] [blame] | 81 | return mapping |
| 82 | |
| 83 | |
| 84 | def _StripComments(string): |
| 85 | # Remove all occurances of multiline comments (/*COMMENT*/) |
| 86 | string = re.sub(r'/\*.*?\*/', "", string, flags=re.DOTALL) |
| 87 | # Remove all occurances of single line comments (//COMMENT) |
| 88 | string = re.sub(r'//.*?$', "", string) |
| 89 | return string |
| 90 | |
| 91 | |
| 92 | def _StripQuotes(string): |
| 93 | return re.sub(r'([\'"]).*?\1', '', string) |
| 94 | |
| 95 | |
| 96 | def _RemoveQualifiers(string_tokens): |
| 97 | while string_tokens and string_tokens[0] in _QUALIFIERS: |
| 98 | string_tokens = string_tokens[1:] |
| 99 | return string_tokens |
| 100 | |
| 101 | |
def _GetLineTokens(line):
  """Tokenizes |line| after removing comments and leading qualifiers.

  Returns:
    The list of tokens found in the comment-free line, with any leading
    qualifier tokens dropped.
  """
  # \w covers alphanumerics and underscore; the character class additionally
  # admits '$', '.', '<', '>', '{', '}', '[' and ']'.
  token_pattern = r'[\w\$\.<>\{\}\[\]]+'
  uncommented = _StripComments(line)
  return _RemoveQualifiers(re.findall(token_pattern, uncommented))
| 108 | |
| 109 | |
| 110 | def _IsClassDefinition(line_tokens): |
| 111 | return line_tokens and line_tokens[0] == 'class' |
| 112 | |
| 113 | |
| 114 | def _IsEndOfClass_definition(line_tokens): |
| 115 | return line_tokens and line_tokens[-1] == '{' |
| 116 | |
| 117 | |
| 118 | def _IsEndOfClass(line_tokens): |
| 119 | return line_tokens and line_tokens[-1] == '}' |
| 120 | |
| 121 | |
| 122 | def _TypeLookup(renamed_type, mapping_dict): |
| 123 | renamed_type_stripped = renamed_type.strip('[]') |
| 124 | postfix = renamed_type.replace(renamed_type_stripped, '') |
| 125 | |
| 126 | if renamed_type_stripped in mapping_dict: |
| 127 | real_type = mapping_dict[renamed_type_stripped][0] |
| 128 | else: |
| 129 | real_type = renamed_type_stripped |
| 130 | |
| 131 | return real_type + postfix |
| 132 | |
| 133 | |
def _GetMemberIdentifier(line_tokens, mapping_dict, renamed_class_name,
                         is_function):
  """Builds the declaration string for one member line.

  Args:
    line_tokens: Tokens of the member line: type, name, then (for functions)
        the argument types.
    mapping_dict: Dict of renamed class name -> [original name, member
        mappings]; may be empty, in which case nothing is translated.
    renamed_class_name: Renamed name of the class that owns the member.
    is_function: Whether the line declares a function, so that an argument
        list is appended.

  Returns:
    With an empty |mapping_dict|, the declaration as written in the line.
    Otherwise the original declaration stored in the class's member mappings.

  Raises:
    KeyError: If the declaration is not present in the class's member
        mappings; diagnostics are printed first.
  """
  assert len(line_tokens) > 1
  mapping_entry = None
  if mapping_dict:
    assert renamed_class_name in mapping_dict
    mapping_entry = mapping_dict[renamed_class_name][1]

  real_type = _TypeLookup(line_tokens[0], mapping_dict)

  # Keep only the part of the name token that precedes any '='.
  renamed_name_token = line_tokens[1].partition('=')[0]

  function_args = ''
  if is_function:
    function_args = '(%s)' % ','.join(
        _TypeLookup(token, mapping_dict) for token in line_tokens[2:])

  renamed_member_identifier = (real_type + ' ' + renamed_name_token
                               + function_args)

  if not mapping_dict:
    return renamed_member_identifier

  if renamed_member_identifier not in mapping_entry:
    # Print context before the lookup below raises KeyError. The single
    # argument call form works as a statement on Python 2 and as a function
    # on Python 3.
    print('Proguarded class which caused the issue: %s' %
          (renamed_class_name,))
    print('Key supposed to be in this dict: %s' % (mapping_entry,))
    print('Definition line tokens: %s' % (line_tokens,))

  # This will be the real type + real_identifier + any real function args (if
  # applicable)
  return mapping_entry[renamed_member_identifier]
| 170 | |
| 171 | |
| 172 | def _GetClassNames(line_tokens, mapping_dict): |
| 173 | assert len(line_tokens) > 1 |
smaier | 58d01504 | 2016-08-05 21:18:16 | [diff] [blame] | 174 | if not mapping_dict: |
| 175 | return line_tokens[1], line_tokens[1] |
smaier | b6dc58c | 2016-06-13 22:14:44 | [diff] [blame] | 176 | assert line_tokens[1] in mapping_dict |
| 177 | return line_tokens[1], mapping_dict[line_tokens[1]][0] |
| 178 | |
| 179 | |
def _IsLineFunctionDefinition(line):
  """Returns True if |line| has both '(' and ')' outside comments and quotes.

  A parenthesis counts only when it is found past the first character of the
  stripped line.
  """
  code_only = _StripQuotes(_StripComments(line))
  # find() > 0 rejects both "not found" (-1) and "found at index 0".
  return all(code_only.find(paren) > 0 for paren in '()')
| 184 | |
| 185 | |
| 186 | # Expects data from dextra -j -m -f |
| 187 | # Returns dictionary mapping class name to list of members |
| 188 | def _BuildMappedDexDict(dextra_file, mapping_dict): |
| 189 | # Have to add 'bool' -> 'boolean' mapping in dictionary, since for some reason |
| 190 | # dextra shortens boolean to bool. |
smaier | 58d01504 | 2016-08-05 21:18:16 | [diff] [blame] | 191 | if mapping_dict: |
| 192 | mapping_dict['bool'] = ['boolean', {}] |
smaier | b6dc58c | 2016-06-13 22:14:44 | [diff] [blame] | 193 | dex_dict = {} |
| 194 | current_entry = [] |
| 195 | reading_class_header = True |
| 196 | unmatched_string = False |
| 197 | |
| 198 | for line in dextra_file: |
| 199 | # Accounting for multi line strings |
| 200 | if line.count('"') % 2: |
| 201 | unmatched_string = not unmatched_string |
| 202 | continue |
| 203 | if unmatched_string: |
| 204 | continue |
| 205 | |
| 206 | line_tokens = _GetLineTokens(line) |
| 207 | if _IsClassDefinition(line_tokens): |
| 208 | reading_class_header = True |
| 209 | renamed_class_name, real_class_name = _GetClassNames(line_tokens, |
| 210 | mapping_dict) |
| 211 | if _IsEndOfClass_definition(line_tokens): |
| 212 | reading_class_header = False |
| 213 | continue |
| 214 | if _IsEndOfClass(line_tokens): |
| 215 | dex_dict[real_class_name] = current_entry |
| 216 | current_entry = [] |
| 217 | continue |
| 218 | |
| 219 | if not reading_class_header and line_tokens: |
| 220 | is_function = _IsLineFunctionDefinition(line) |
| 221 | member = _GetMemberIdentifier(line_tokens, mapping_dict, |
| 222 | renamed_class_name, is_function) |
| 223 | current_entry.append(member) |
| 224 | |
| 225 | return dex_dict |
| 226 | |
| 227 | |
| 228 | def _DiffDexDicts(dex_base, dex_new): |
| 229 | diffs = [] |
| 230 | for key, base_class_members in dex_base.iteritems(): |
| 231 | if key in dex_new: |
| 232 | # Class in both |
| 233 | base_class_members_set = set(base_class_members) |
| 234 | # Removing from dex_new to have just those which only appear in dex_new |
| 235 | # left over. |
| 236 | new_class_members_set = set(dex_new.pop(key)) |
| 237 | if base_class_members_set == new_class_members_set: |
| 238 | continue |
| 239 | else: |
| 240 | # They are not equal |
| 241 | diff_string = key |
| 242 | for diff in base_class_members_set.difference(new_class_members_set): |
| 243 | # Base has stuff the new one doesn't |
| 244 | diff_string += '\n' + '- ' + diff |
| 245 | for diff in new_class_members_set.difference(base_class_members_set): |
| 246 | # New has stuff the base one doesn't |
| 247 | diff_string += '\n' + '+ ' + diff |
| 248 | diffs.append(diff_string) |
| 249 | else: |
| 250 | # Class not found in new |
| 251 | diff_string = '-class ' + key |
| 252 | diffs.append(diff_string) |
| 253 | if dex_new: |
| 254 | # Classes in new that have yet to be hit by base |
| 255 | for key in dex_new: |
| 256 | diff_string = '+class ' + key |
| 257 | diffs.append(diff_string) |
| 258 | |
| 259 | return diffs |
| 260 | |
| 261 | |
agrieve | 07bf88a | 2016-09-30 00:13:45 | [diff] [blame] | 262 | def _RunDextraOnDex(dex_path): |
| 263 | try: |
| 264 | out = subprocess.check_output( |
| 265 | ['dextra.ELF64', '-j', '-f', '-m', dex_path]) |
| 266 | return out.splitlines() |
| 267 | except OSError as e: |
| 268 | if e.errno == errno.ENOENT: |
| 269 | raise Exception('Ensure dextra.ELF64 is in your PATH') |
| 270 | raise |
| 271 | |
| 272 | |
def _RunDextra(dex_or_apk_path):
  """Returns dextra output lines for a .dex file or for an apk's classes.dex.

  A path ending in '.dex' is passed to dextra directly. Any other path is
  opened as a zip archive; its 'classes.dex' entry is copied to a temporary
  .dex file and dextra is run on that copy.
  """
  if not dex_or_apk_path.endswith('.dex'):
    with tempfile.NamedTemporaryFile(suffix='.dex') as tmp_file:
      with zipfile.ZipFile(dex_or_apk_path) as apk:
        dex_bytes = apk.read('classes.dex')
        tmp_file.write(dex_bytes)
        # Flush so that dextra sees the complete file on disk.
        tmp_file.flush()
        return _RunDextraOnDex(tmp_file.name)

  return _RunDextraOnDex(dex_or_apk_path)
| 282 | |
| 283 | |
def main():
  """Diffs two dex files and prints the differences.

  Each side is read either from saved dextra output (--*-dextra-output) or by
  running dextra on the given apk / dex (--old, --new). Optional proguard
  mapping files translate renamed names back to the original ones.

  Exits with status 1 if differences are found, and with a usage error if the
  input for either side is missing.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--base-mapping-file',
                      help='Mapping file from proguard output for base dex')
  parser.add_argument('--base-dextra-output',
                      help='dextra -j -f -m output for base dex')
  parser.add_argument('--new-mapping-file',
                      help='Mapping file from proguard output for new dex')
  parser.add_argument('--new-dextra-output',
                      help='dextra -j -f -m output for new dex')
  parser.add_argument('--old',
                      help='Path to base apk / classes.dex')
  parser.add_argument('--new',
                      help='Path to new apk / classes.dex')
  args = parser.parse_args()

  # Validate up front with parser.error() rather than assert: asserts are
  # stripped under -O and otherwise produce a traceback instead of a usage
  # message.
  if not (args.base_dextra_output or args.old):
    parser.error('Must pass either --old or --base-dextra-output')
  if not (args.new_dextra_output or args.new):
    parser.error('Must pass either --new or --new-dextra-output')

  mapping_base = {}
  mapping_new = {}
  if args.base_mapping_file:
    with open(args.base_mapping_file) as f:
      mapping_base = _ReadMappingDict(f)
  if args.new_mapping_file:
    with open(args.new_mapping_file) as f:
      mapping_new = _ReadMappingDict(f)

  if args.base_dextra_output:
    with open(args.base_dextra_output) as f:
      dex_base = _BuildMappedDexDict(f, mapping_base)
  else:
    print('Running dextra #1')
    lines = _RunDextra(args.old)
    dex_base = _BuildMappedDexDict(lines, mapping_base)

  if args.new_dextra_output:
    with open(args.new_dextra_output) as f:
      dex_new = _BuildMappedDexDict(f, mapping_new)
  else:
    print('Running dextra #2')
    lines = _RunDextra(args.new)
    # The new dex must be translated with the new mapping, not the base one.
    dex_new = _BuildMappedDexDict(lines, mapping_new)

  print('Analyzing...')
  diffs = _DiffDexDicts(dex_base, dex_new)
  if diffs:
    for diff in diffs:
      print(diff)
    sys.exit(1)
  else:
    class_count = len(dex_base)
    method_count = sum(len(v) for v in dex_base.values())
    print('No meaningful differences: '
          'both have the same %d classes and %d methods.' %
          (class_count, method_count))
smaier | b6dc58c | 2016-06-13 22:14:44 | [diff] [blame] | 338 | |
| 339 | |
# Run the tool only when this file is executed as a script.
if '__main__' == __name__:
  main()
| 342 | |