cygprofile: Improve orderfile comparison.

This CL adds the "average fractional distance" metric to examine ordering
stability. It also makes the script compatible with orderfiles that contain
symbols.

For instance, for a regular roll:
$ tools/cygprofile/compare_orderfiles.py --from-commit f9062ad5c6383ae959f1c80d20099375a2be745e
[...]
Symbols count:
        first:  417918
        second: 418657
New symbols = 1004
Removed symbols = 265
Average fractional distance = 0.15%

And a significant one:
$ ./tools/cygprofile/compare_orderfiles.py --from-commit 4c1e7a85c8bac01731203990fd4b66a7b5b19dd9
[...]
Symbols count:
        first:  428730
        second: 190074
New symbols = 16952
Removed symbols = 255608
Average fractional distance = 26.53%

Note that the distance is insensitive to the relative sizes of the
orderfiles; it depends only on the relative ordering of the symbols in their
intersection.

Change-Id: I188684ef7a0bb545e6a55bc335d432dd6690a203
Reviewed-on: https://chromium-review.googlesource.com/876088
Commit-Queue: Benoit L <[email protected]>
Reviewed-by: Matthew Cary <[email protected]>
Cr-Commit-Position: refs/heads/master@{#539814}
diff --git a/tools/cygprofile/compare_orderfiles.py b/tools/cygprofile/compare_orderfiles.py
index 3cf2934..b110d8e 100755
--- a/tools/cygprofile/compare_orderfiles.py
+++ b/tools/cygprofile/compare_orderfiles.py
@@ -10,6 +10,7 @@
 """
 
 import argparse
+import collections
 import logging
 import os
 import subprocess
@@ -32,6 +33,10 @@
     lines = [line.strip() for line in f]
 
   for entry in lines:
+    # Keep only sections, not symbols (symbols don't contain '.').
+    # We could only keep symbols, but then old orderfiles would not be parsed.
+    if '.' not in entry:
+      continue
     # Example: .text.startup.BLA
     symbol_name = entry[entry.rindex('.'):]
     if symbol_name in already_seen or symbol_name == '*' or entry == '.text':
@@ -41,12 +46,32 @@
   return symbols
 
 
+def CommonSymbolsToOrder(symbols, common_symbols):
+  """Maps each s of symbols that is in common_symbols to its 0-based rank."""
+  result = {}
+  index = 0
+  for s in symbols:
+    if s not in common_symbols:
+      continue
+    result[s] = index  # Last occurrence wins if s repeats.
+    index += 1  # Advances only for symbols kept above.
+  return result
+
+
+CompareResult = collections.namedtuple(
+    'CompareResult', ('first_count', 'second_count',
+                      'new_count', 'removed_count',
+                      'average_fractional_distance'))
+
 def Compare(first_filename, second_filename):
   """Outputs a comparison of two orderfiles to stdout.
 
   Args:
     first_filename: (str) First orderfile.
     second_filename: (str) Second orderfile.
+
+  Returns:
+    An instance of CompareResult.
   """
   first_symbols = ParseOrderfile(first_filename)
   second_symbols = ParseOrderfile(second_filename)
@@ -56,8 +81,22 @@
   second_symbols = set(second_symbols)
   new_symbols = second_symbols - first_symbols
   removed_symbols = first_symbols - second_symbols
+  common_symbols = first_symbols & second_symbols
+  # Distance between orderfiles. NOTE(review): inputs are sets; order lost?
+  first_to_ordering = CommonSymbolsToOrder(first_symbols, common_symbols)
+  second_to_ordering = CommonSymbolsToOrder(second_symbols, common_symbols)
+  total_distance = sum(abs(first_to_ordering[s] - second_to_ordering[s])\
+                       for s in first_to_ordering)
+  # Each distance is in [0, len(common_symbols)] and there are
+  # len(common_symbols) entries, hence the normalization.
+  average_fractional_distance = float(total_distance) / (len(common_symbols)**2)
   print 'New symbols = %d' % len(new_symbols)
   print 'Removed symbols = %d' % len(removed_symbols)
+  print 'Average fractional distance = %.2f%%' % (
+      100. * average_fractional_distance)
+  return CompareResult(len(first_symbols), len(second_symbols),
+                       len(new_symbols), len(removed_symbols),
+                       average_fractional_distance)
 
 
 def CheckOrderfileCommit(commit_hash, clank_path):
@@ -70,7 +109,8 @@
   output = subprocess.check_output(
       ['git', 'show', r'--format=%an %s', commit_hash], cwd=clank_path)
   first_line = output.split('\n')[0]
-  assert first_line == 'clank-autoroller Update Orderfile.', (
+  # Capitalization changed at some point.
+  assert first_line.upper() == 'clank-autoroller Update Orderfile.'.upper(), (
       'Not an orderfile commit')
 
 
@@ -129,6 +169,7 @@
   parser.add_argument('--second', help='Second orderfile')
   parser.add_argument('--from-commit', help='Analyze the difference in the '
                       'orderfile from an orderfile bot commit.')
+  parser.add_argument('--csv-output', help='Appends the result to a CSV file.')
   return parser
 
 
@@ -143,7 +184,11 @@
     first, second = GetOrderfilesFromCommit(args.from_commit)
     try:
       logging.info('Comparing the orderfiles')
-      Compare(first, second)
+      result = Compare(first, second)
+      if args.csv_output:
+        with open(args.csv_output, 'a') as f:
+          f.write('%s,%d,%d,%d,%d,%f\n' % tuple(
+              [args.from_commit] + list(result)))
     finally:
       os.remove(first)
       os.remove(second)