# Source blob: e4a07b6ec66a084cb8041e1758700cd57cfcda19
#!/usr/bin/env python
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Tool to diff 2 dex files that have been proguarded.

To use this tool, first get dextra: http://newandroidbook.com/tools/dextra.html
Then invoke the script like:
  PATH=$PATH:/path/to/dextra dexdiffer.py --old classes1.dex --new classes2.dex

apk files may be used as well.
"""
import argparse
import errno
import re
import subprocess
import sys
import tempfile
import zipfile
_QUALIFIERS = set(['public', 'protected', 'private', 'final', 'static',
'abstract', 'volatile', 'native', 'enum'])
def _IsNewClass(line):
return line.endswith(':')
# Expects lines like one of these 3:
# 'android.support.v8.MenuPopupHelper -> android.support.v8.v:'
# ' android.view.LayoutInflater mInflater -> d'
# ' 117:118:void setForceShowIcon(boolean) -> b'
# Those three examples would return
# 'android.support.v8.MenuPopupHelper', 'android.support.v8.v'
# 'android.view.LayoutInflater mInflater', 'android.view.LayoutInflater d'
# 'void setForceShowIcon(boolean)', 'void b(boolean)'
def _ParseMappingLine(line):
line = line.rstrip(':')
# Stripping any line number denotations
line = re.sub(r'\d+:\d+:', '', line)
line = re.sub(r'\):\d+', ')', line)
original_name, new_name = line.split(' -> ')
type_string = ''
if ' ' in original_name:
type_string = original_name[:original_name.find(' ') + 1]
arguments_string = ''
match = re.search(r'(\(.*?\))', original_name)
if match:
arguments_string = match.group(1)
return original_name, type_string + new_name + arguments_string
def _ReadMappingDict(mapping_file):
  """Parses a ProGuard mapping file into a lookup keyed by renamed class.

  Args:
    mapping_file: Iterable of mapping-file lines (e.g. an open file).

  Returns:
    Dict of renamed class name -> [original class name, member_mappings],
    where member_mappings maps each renamed member description to its original
    description (both as produced by _ParseMappingLine).

  Raises:
    ValueError: if a member line appears before any class header, or a line
        cannot be split on ' -> '.
  """
  mapping = {}
  member_mappings = None
  for line in mapping_file:
    line = line.strip()
    # Blank lines and '#' comment lines carry no mapping; feeding them to
    # _ParseMappingLine would fail on the ' -> ' split.
    if not line or line.startswith('#'):
      continue

    if _IsNewClass(line):
      original_class_name, renamed_class_name = _ParseMappingLine(line)
      member_mappings = {}
      # Entries with an empty renamed name are not recorded.
      if renamed_class_name:
        mapping[renamed_class_name] = [original_class_name, member_mappings]
      continue

    if member_mappings is None:
      raise ValueError('Member mapping found before any class: %r' % line)
    original_member_name, renamed_member_name = _ParseMappingLine(line)
    member_mappings[renamed_member_name] = original_member_name
  return mapping
def _StripComments(string):
# Remove all occurances of multiline comments (/*COMMENT*/)
string = re.sub(r'/\*.*?\*/', "", string, flags=re.DOTALL)
# Remove all occurances of single line comments (//COMMENT)
string = re.sub(r'//.*?$', "", string)
return string
def _StripQuotes(string):
return re.sub(r'([\'"]).*?\1', '', string)
def _RemoveQualifiers(string_tokens):
  """Returns |string_tokens| without its leading run of qualifier tokens.

  Only tokens at the front are dropped; a qualifier appearing after the first
  non-qualifier token is kept. When nothing is dropped the input object itself
  is returned.
  """
  leading = 0
  for token in string_tokens or ():
    if token not in _QUALIFIERS:
      break
    leading += 1
  return string_tokens[leading:] if leading else string_tokens
def _GetLineTokens(line):
  """Tokenizes one line: comments removed, leading qualifiers dropped.

  Returns:
    List of tokens, each a maximal run of word characters (letters, digits,
    '_') and the characters '$', '.', '<', '>', '{', '}', '[' and ']'.
  """
  code_only = _StripComments(line)
  token_pattern = re.compile(r'[\w\$\.<>\{\}\[\]]+')
  return _RemoveQualifiers(token_pattern.findall(code_only))
def _IsClassDefinition(line_tokens):
return line_tokens and line_tokens[0] == 'class'
def _IsEndOfClass_definition(line_tokens):
return line_tokens and line_tokens[-1] == '{'
def _IsEndOfClass(line_tokens):
return line_tokens and line_tokens[-1] == '}'
def _TypeLookup(renamed_type, mapping_dict):
renamed_type_stripped = renamed_type.strip('[]')
postfix = renamed_type.replace(renamed_type_stripped, '')
if renamed_type_stripped in mapping_dict:
real_type = mapping_dict[renamed_type_stripped][0]
else:
real_type = renamed_type_stripped
return real_type + postfix
def _GetMemberIdentifier(line_tokens, mapping_dict, renamed_class_name,
                         is_function):
  """Builds the de-obfuscated description of one class member.

  Args:
    line_tokens: Tokens of the member line; token 0 is the type, token 1 the
        name (anything from '=' onwards is ignored), and the remaining tokens
        are treated as argument types when |is_function| is set.
    mapping_dict: Dict of renamed class name -> [original class name,
        member_mappings]; may be empty when no mapping file was given.
    renamed_class_name: Name of the enclosing class as it appears in the dex.
    is_function: Whether the line declares a function.

  Returns:
    'type name' or 'type name(arg,arg)' with types translated through
    |mapping_dict|. When |mapping_dict| is non-empty, that string is looked up
    in the class's member mappings and the original description is returned.

  Raises:
    AssertionError: if fewer than two tokens are given, or the class is
        missing from a non-empty |mapping_dict|.
    KeyError: if the member is missing from the class's member mappings
        (diagnostics are printed first).
  """
  assert len(line_tokens) > 1
  if mapping_dict:
    assert renamed_class_name in mapping_dict
    mapping_entry = mapping_dict[renamed_class_name][1]

  real_type = _TypeLookup(line_tokens[0], mapping_dict)
  # Field lines may carry an initializer glued to the name ('x=5').
  renamed_name_token = line_tokens[1].partition('=')[0]

  function_args = ''
  if is_function:
    arg_types = [_TypeLookup(token, mapping_dict) for token in line_tokens[2:]]
    function_args = '(' + ','.join(arg_types) + ')'

  renamed_member_identifier = (real_type + ' ' + renamed_name_token
                               + function_args)
  if not mapping_dict:
    return renamed_member_identifier

  if renamed_member_identifier not in mapping_entry:
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print('Proguarded class which caused the issue: %s' %
          (renamed_class_name,))
    print('Key supposed to be in this dict: %s' % (mapping_entry,))
    print('Definition line tokens: %s' % (line_tokens,))

  # This will be the real type + real_identifier + any real function args (if
  # applicable)
  return mapping_entry[renamed_member_identifier]
def _GetClassNames(line_tokens, mapping_dict):
assert len(line_tokens) > 1
if not mapping_dict:
return line_tokens[1], line_tokens[1]
assert line_tokens[1] in mapping_dict
return line_tokens[1], mapping_dict[line_tokens[1]][0]
def _IsLineFunctionDefinition(line):
  """Returns True when |line| contains both '(' and ')' outside comments/quotes.

  Comments and quoted literals are removed before looking. A parenthesis at
  index 0 of the remaining text does not count, because the checks require a
  position greater than zero.
  """
  code = _StripQuotes(_StripComments(line))
  open_index = code.find('(')
  close_index = code.find(')')
  return open_index > 0 and close_index > 0
def _BuildMappedDexDict(dextra_file, mapping_dict):
  """Builds a dict of class name -> list of member descriptions.

  Expects data from dextra -j -m -f.

  Args:
    dextra_file: Iterable of dextra output lines.
    mapping_dict: Dict from _ReadMappingDict, or an empty dict. When non-empty
        it is modified in place: a 'bool' -> 'boolean' entry is added.

  Returns:
    Dict mapping each (de-obfuscated) class name to the list of its members,
    as produced by _GetMemberIdentifier.
  """
  # Have to add 'bool' -> 'boolean' mapping in dictionary, since for some reason
  # dextra shortens boolean to bool.
  if mapping_dict:
    mapping_dict['bool'] = ['boolean', {}]

  classes = {}
  members = []
  in_class_header = True
  inside_string = False

  for raw_line in dextra_file:
    # Accounting for multi line strings: a line with an odd number of '"'
    # opens or closes one. Such lines, and every line in between, are skipped.
    has_odd_quotes = bool(raw_line.count('"') % 2)
    if has_odd_quotes:
      inside_string = not inside_string
    if has_odd_quotes or inside_string:
      continue

    tokens = _GetLineTokens(raw_line)
    if _IsClassDefinition(tokens):
      in_class_header = True
      renamed_class_name, real_class_name = _GetClassNames(tokens,
                                                           mapping_dict)

    # The order of these checks matters: a line ending in '{' only closes the
    # header, a line ending in '}' only closes the class.
    if _IsEndOfClass_definition(tokens):
      in_class_header = False
    elif _IsEndOfClass(tokens):
      classes[real_class_name] = members
      members = []
    elif tokens and not in_class_header:
      is_function = _IsLineFunctionDefinition(raw_line)
      members.append(_GetMemberIdentifier(tokens, mapping_dict,
                                          renamed_class_name, is_function))

  return classes
def _DiffDexDicts(dex_base, dex_new):
diffs = []
for key, base_class_members in dex_base.iteritems():
if key in dex_new:
# Class in both
base_class_members_set = set(base_class_members)
# Removing from dex_new to have just those which only appear in dex_new
# left over.
new_class_members_set = set(dex_new.pop(key))
if base_class_members_set == new_class_members_set:
continue
else:
# They are not equal
diff_string = key
for diff in base_class_members_set.difference(new_class_members_set):
# Base has stuff the new one doesn't
diff_string += '\n' + '- ' + diff
for diff in new_class_members_set.difference(base_class_members_set):
# New has stuff the base one doesn't
diff_string += '\n' + '+ ' + diff
diffs.append(diff_string)
else:
# Class not found in new
diff_string = '-class ' + key
diffs.append(diff_string)
if dex_new:
# Classes in new that have yet to be hit by base
for key in dex_new:
diff_string = '+class ' + key
diffs.append(diff_string)
return diffs
def _RunDextraOnDex(dex_path):
  """Runs 'dextra.ELF64 -j -f -m' on |dex_path| and returns its output lines.

  Raises:
    Exception: if the dextra.ELF64 executable cannot be found.
    OSError: for any other failure to launch the process.
    subprocess.CalledProcessError: if dextra exits with a non-zero status.
  """
  command = ['dextra.ELF64', '-j', '-f', '-m', dex_path]
  try:
    output = subprocess.check_output(command)
  except OSError as e:
    if e.errno != errno.ENOENT:
      raise
    # ENOENT here means the executable itself was not found.
    raise Exception('Ensure dextra.ELF64 is in your PATH')
  return output.splitlines()
def _RunDextra(dex_or_apk_path):
  """Returns dextra output lines for a .dex file or for an apk's classes.dex.

  Paths ending in '.dex' are passed to dextra directly. Anything else is
  opened as a zip archive, and its 'classes.dex' entry is copied to a
  temporary file for dextra to read.
  """
  if not dex_or_apk_path.endswith('.dex'):
    with tempfile.NamedTemporaryFile(suffix='.dex') as extracted_dex:
      with zipfile.ZipFile(dex_or_apk_path) as apk:
        dex_bytes = apk.read('classes.dex')
      extracted_dex.write(dex_bytes)
      # Make sure the bytes are on disk before dextra opens the file by name.
      extracted_dex.flush()
      return _RunDextraOnDex(extracted_dex.name)
  return _RunDextraOnDex(dex_or_apk_path)
def _ReadMappingFile(path):
  """Returns the parsed mapping file at |path|, or {} when |path| is unset."""
  if not path:
    return {}
  with open(path) as f:
    return _ReadMappingDict(f)


def _LoadDexDict(dextra_output_path, dex_or_apk_path, mapping, run_number):
  """Builds a dex dict from saved dextra output, or by running dextra."""
  if dextra_output_path:
    with open(dextra_output_path) as f:
      return _BuildMappedDexDict(f, mapping)
  print('Running dextra #%d' % run_number)
  return _BuildMappedDexDict(_RunDextra(dex_or_apk_path), mapping)


def main():
  """Parses arguments, diffs the two dex files and prints the result.

  Exits with status 1 when differences are found. Exits through
  parser.error() when neither a dex/apk path nor saved dextra output is given
  for one of the two sides.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--base-mapping-file',
                      help='Mapping file from proguard output for base dex')
  parser.add_argument('--base-dextra-output',
                      help='dextra -j -f -m output for base dex')
  parser.add_argument('--new-mapping-file',
                      help='Mapping file from proguard output for new dex')
  parser.add_argument('--new-dextra-output',
                      help='dextra -j -f -m output for new dex')
  parser.add_argument('--old',
                      help='Path to base apk / classes.dex')
  parser.add_argument('--new',
                      help='Path to new apk / classes.dex')
  args = parser.parse_args()

  # Validate up front with a usage error; assert would be stripped under -O
  # and would only fire after the first dextra run had already happened.
  if not (args.base_dextra_output or args.old):
    parser.error('Must pass either --old or --base-dextra-output')
  if not (args.new_dextra_output or args.new):
    parser.error('Must pass either --new or --new-dextra-output')

  mapping_base = _ReadMappingFile(args.base_mapping_file)
  mapping_new = _ReadMappingFile(args.new_mapping_file)

  dex_base = _LoadDexDict(args.base_dextra_output, args.old, mapping_base, 1)
  # The new dex must be de-obfuscated with its own mapping, not the base one.
  dex_new = _LoadDexDict(args.new_dextra_output, args.new, mapping_new, 2)

  print('Analyzing...')
  diffs = _DiffDexDicts(dex_base, dex_new)
  if diffs:
    for diff in diffs:
      print(diff)
    sys.exit(1)

  class_count = len(dex_base)
  method_count = sum(len(members) for members in dex_base.values())
  print('No meaningful differences: '
        'both have the same %d classes and %d methods.' %
        (class_count, method_count))
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == '__main__':
  main()