Blame - tools/uberblame.py - chromium/src.git

blob: 9c99a0d22e3634d70825e0c424894dca7a756a17 [file] [log] [blame]

Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	1	#!/usr/bin/env python
				2	# Copyright 2017 The Chromium Authors. All rights reserved.
				3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
				6	import argparse
				7	import cgi
				8	import colorsys
				9	import difflib
				10	import random
				11	import os
				12	import re
				13	import subprocess
				14	import sys
				15	import tempfile
				16	import textwrap
				17	import webbrowser
				18
				19
				20	class TokenContext(object):
				21	"""Metadata about a token.
				22
				23	Attributes:
				24	row: Row index of the token in the data file.
				25	column: Column index of the token in the data file.
				26	token: The token string.
Tom Anderson	6541015	2017-10-17 01:53:19	[diff] [blame]	27	commit: A Commit object that corresponds to the commit that added
				28	this token.
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	29	"""
Tom Anderson	b3d7e64	2018-04-13 16:23:42	[diff] [blame]	30
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	31	def __init__(self, row, column, token, commit=None):
				32	self.row = row
				33	self.column = column
				34	self.token = token
				35	self.commit = commit
				36
				37
				38	class Commit(object):
				39	"""Commit data.
				40
				41	Attributes:
				42	hash: The commit hash.
Tom Anderson	6541015	2017-10-17 01:53:19	[diff] [blame]	43	author_name: The author's name.
				44	author_email: the author's email.
				45	author_date: The date and time the author created this commit.
				46	message: The commit message.
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	47	diff: The commit diff.
				48	"""
Tom Anderson	b3d7e64	2018-04-13 16:23:42	[diff] [blame]	49
Tom Anderson	6541015	2017-10-17 01:53:19	[diff] [blame]	50	def __init__(self, hash, author_name, author_email, author_date, message,
				51	diff):
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	52	self.hash = hash
Tom Anderson	6541015	2017-10-17 01:53:19	[diff] [blame]	53	self.author_name = author_name
				54	self.author_email = author_email
				55	self.author_date = author_date
				56	self.message = message
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	57	self.diff = diff
				58
				59
def tokenize_data(data, tokenize_by_char, tokenize_whitespace):
    """Tokenizes |data|.

    Args:
        data: String to tokenize.
        tokenize_by_char: If true, individual characters are treated as
            tokens.  Otherwise, tokens are either symbols or strings of both
            alphanumeric characters and underscores.
        tokenize_whitespace: Treat non-newline whitespace characters as
            tokens.

    Returns:
        A list of lists of TokenContexts.  Each inner list represents a line.
        Because a newline is appended to |data| before scanning, input that
        already ends in a newline produces a final empty inner list.
    """
    contexts = []
    line_contexts = []
    in_identifier = False
    identifier_start = 0
    identifier = ''
    row = 0
    column = 0

    # The extra newline guarantees that a trailing identifier and the last
    # line are flushed by the same code path as every other line.
    for c in data + '\n':
        if not tokenize_by_char and (c.isalnum() or c == '_'):
            if in_identifier:
                identifier += c
            else:
                in_identifier = True
                identifier_start = column
                identifier = c
        else:
            if in_identifier:
                # A non-identifier character terminates the pending
                # identifier.
                line_contexts.append(
                    TokenContext(row, identifier_start, identifier))
                in_identifier = False
            # Newlines are never tokens; other whitespace only on request.
            if not c.isspace() or (tokenize_whitespace and c != '\n'):
                line_contexts.append(TokenContext(row, column, c))

        if c == '\n':
            row += 1
            column = 0
            contexts.append(line_contexts)
            line_contexts = []
        else:
            column += 1
    return contexts
				105
				106
				107	def compute_unified_diff(old_tokens, new_tokens):
				108	"""Computes the diff between \|old_tokens\| and \|new_tokens\|.
				109
				110	Args:
				111	old_tokens: Token strings corresponding to the old data.
				112	new_tokens: Token strings corresponding to the new data.
				113
				114	Returns:
				115	The diff, in unified diff format.
				116	"""
				117	return difflib.unified_diff(old_tokens, new_tokens, n=0, lineterm='')
				118
				119
				120	def parse_chunk_header_file_range(file_range):
				121	"""Parses a chunk header file range.
				122
				123	Diff chunk headers have the form:
				124	@@ -<file-range> +<file-range> @@
				125	File ranges have the form:
				126	<start line number>,<number of lines changed>
				127
				128	Args:
				129	file_range: A chunk header file range.
				130
				131	Returns:
				132	A tuple (range_start, range_end). The endpoints are adjusted such that
				133	iterating over [range_start, range_end) will give the changed indices.
				134	"""
				135	if ',' in file_range:
				136	file_range_parts = file_range.split(',')
				137	start = int(file_range_parts[0])
				138	amount = int(file_range_parts[1])
				139	if amount == 0:
				140	return (start, start)
				141	return (start - 1, start + amount - 1)
				142	else:
				143	return (int(file_range) - 1, int(file_range))
				144
				145
				146	def compute_changed_token_indices(previous_tokens, current_tokens):
				147	"""Computes changed and added tokens.
				148
				149	Args:
				150	previous_tokens: Tokens corresponding to the old file.
				151	current_tokens: Tokens corresponding to the new file.
				152
				153	Returns:
				154	A tuple (added_tokens, changed_tokens).
				155	added_tokens: A list of indices into \|current_tokens\|.
				156	changed_tokens: A map of indices into \|current_tokens\| to
				157	indices into \|previous_tokens\|.
				158	"""
				159	prev_file_chunk_end = 0
				160	prev_patched_chunk_end = 0
				161	added_tokens = []
				162	changed_tokens = {}
				163	for line in compute_unified_diff(previous_tokens, current_tokens):
				164	if line.startswith("@@"):
				165	parts = line.split(' ')
				166	removed = parts[1].lstrip('-')
				167	removed_start, removed_end = parse_chunk_header_file_range(removed)
				168	added = parts[2].lstrip('+')
				169	added_start, added_end = parse_chunk_header_file_range(added)
				170	for i in range(added_start, added_end):
				171	added_tokens.append(i)
				172	for i in range(0, removed_start - prev_patched_chunk_end):
				173	changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
				174	prev_patched_chunk_end = removed_end
				175	prev_file_chunk_end = added_end
				176	for i in range(0, len(previous_tokens) - prev_patched_chunk_end):
				177	changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
				178	return added_tokens, changed_tokens
				179
				180
				181	def flatten_nested_list(l):
				182	"""Flattens a list and provides a mapping from elements in the list back
				183	into the nested list.
				184
				185	Args:
				186	l: A list of lists.
				187
				188	Returns:
				189	A tuple (flattened, index_to_position):
				190	flattened: The flattened list.
				191	index_to_position: A list of pairs (r, c) such that
				192	index_to_position[i] == (r, c); flattened[i] == l[r][c]
				193	"""
				194	flattened = []
				195	index_to_position = {}
				196	r = 0
				197	c = 0
				198	for nested_list in l:
				199	for element in nested_list:
				200	index_to_position[len(flattened)] = (r, c)
				201	flattened.append(element)
				202	c += 1
				203	r += 1
				204	c = 0
				205	return (flattened, index_to_position)
				206
				207
				208	def compute_changed_token_positions(previous_tokens, current_tokens):
				209	"""Computes changed and added token positions.
				210
				211	Args:
				212	previous_tokens: A list of lists of token strings. Lines in the file
				213	correspond to the nested lists.
				214	current_tokens: A list of lists of token strings. Lines in the file
				215	correspond to the nested lists.
				216
				217	Returns:
				218	A tuple (added_token_positions, changed_token_positions):
				219	added_token_positions: A list of pairs that index into \|current_tokens\|.
				220	changed_token_positions: A map from pairs that index into
				221	\|current_tokens\| to pairs that index into \|previous_tokens\|.
				222	"""
				223	flat_previous_tokens, previous_index_to_position = flatten_nested_list(
				224	previous_tokens)
				225	flat_current_tokens, current_index_to_position = flatten_nested_list(
				226	current_tokens)
				227	added_indices, changed_indices = compute_changed_token_indices(
				228	flat_previous_tokens, flat_current_tokens)
				229	added_token_positions = [current_index_to_position[i] for i in added_indices]
				230	changed_token_positions = {
				231	current_index_to_position[current_i]:
				232	previous_index_to_position[changed_indices[current_i]]
				233	for current_i in changed_indices
				234	}
				235	return (added_token_positions, changed_token_positions)
				236
				237
				238	def parse_chunks_from_diff(diff):
				239	"""Returns a generator of chunk data from a diff.
				240
				241	Args:
				242	diff: A list of strings, with each string being a line from a diff
				243	in unified diff format.
				244
				245	Returns:
				246	A generator of tuples (added_lines_start, added_lines_end,
				247	removed_lines, removed_lines_start)
				248	"""
				249	in_chunk = False
				250	chunk_previous = []
				251	previous_start = None
				252	current_start = None
				253	current_end = None
				254	for line in diff:
				255	if line.startswith('@@'):
				256	if in_chunk:
Tom Anderson	b3d7e64	2018-04-13 16:23:42	[diff] [blame]	257	yield (current_start, current_end, chunk_previous, previous_start)
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	258	parts = line.split(' ')
				259	previous = parts[1].lstrip('-')
				260	previous_start, _ = parse_chunk_header_file_range(previous)
				261	current = parts[2].lstrip('+')
				262	current_start, current_end = parse_chunk_header_file_range(current)
				263	in_chunk = True
				264	chunk_previous = []
				265	elif in_chunk and line.startswith('-'):
				266	chunk_previous.append(line[1:])
				267	if current_start != None:
Tom Anderson	b3d7e64	2018-04-13 16:23:42	[diff] [blame]	268	yield (current_start, current_end, chunk_previous, previous_start)
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	269
				270
				271	def should_skip_commit(commit):
				272	"""Decides if \|commit\| should be skipped when computing the blame.
				273
				274	Commit 5d4451e deleted all files in the repo except for DEPS. The
				275	next commit, 1e7896, brought them back. This is a hack to skip
				276	those commits (except for the files they modified). If we did not
				277	do this, changes would be incorrectly attributed to 1e7896.
				278
				279	Args:
				280	commit: A Commit object.
				281
				282	Returns:
				283	A boolean indicating if this commit should be skipped.
				284	"""
				285	banned_commits = [
				286	'1e78967ed2f1937b3809c19d91e7dd62d756d307',
				287	'5d4451ebf298d9d71f716cc0135f465cec41fcd0',
				288	]
				289	if commit.hash not in banned_commits:
				290	return False
				291	banned_commits_file_exceptions = [
				292	'DEPS',
				293	'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
				294	]
				295	for line in commit.diff:
				296	if line.startswith('---') or line.startswith('+++'):
				297	if line.split(' ')[1] in banned_commits_file_exceptions:
				298	return False
				299	elif line.startswith('@@'):
				300	return True
				301	assert False
				302
				303
def generate_substrings(file):
    r"""Generates substrings from a file stream, where substrings are
    separated by '\0'.

    Empty substrings are dropped.  For example, the input:
        'a\0bc\0\0\0d\0'
    would produce the output:
        ['a', 'bc', 'd']

    The stream is read until read() returns an empty result, so a stream
    that hands back fewer characters than requested before its end is still
    consumed completely.

    Args:
        file: A readable file.
    """
    BUF_SIZE = 448  # Experimentally found to be pretty fast.
    # Pieces of the substring that is still being assembled; it may span
    # several reads.
    pending = []
    while True:
        buf = file.read(BUF_SIZE)
        if not buf:
            break
        parts = buf.split('\0')
        pending.append(parts[0])
        if len(parts) > 1:
            # The first separator in this buffer completes the pending
            # substring.
            joined = ''.join(pending)
            if joined:
                yield joined
            # Everything between the first and last separator is complete.
            for part in parts[1:-1]:
                if part:
                    yield part
            # The tail after the last separator may continue in the next
            # read.
            pending = [parts[-1]]
    tail = ''.join(pending)
    if tail:
        yield tail
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	335
				336
				337	def generate_commits(git_log_stdout):
				338	"""Parses git log output into a stream of Commit objects.
				339	"""
Tom Anderson	1e71692	2017-10-12 19:43:49	[diff] [blame]	340	substring_generator = generate_substrings(git_log_stdout)
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	341	while True:
Tom Anderson	6541015	2017-10-17 01:53:19	[diff] [blame]	342	hash = substring_generator.next()
				343	author_name = substring_generator.next()
				344	author_email = substring_generator.next()
				345	author_date = substring_generator.next()
				346	message = substring_generator.next()
				347	diff = substring_generator.next().split('\n')
				348	yield Commit(hash, author_name, author_email, author_date, message, diff)
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	349
				350
def uberblame_aux(file_name, git_log_stdout, data, tokenization_method):
    """Computes the uberblame of file |file_name|.

    The git log is replayed one commit at a time.  For every chunk of every
    commit, the lines the commit added are replaced in the working blame by
    the lines it removed.  Tokens that only exist on the added side are
    attributed to that commit; tokens present on both sides keep their
    TokenContext so that a later-processed commit can claim them.

    Args:
        file_name: File to uberblame.  Not used by this function.
        git_log_stdout: A file object that represents the git log output.
        data: A string containing the data of file |file_name|.
        tokenization_method: A function that takes a string and returns a
            list of lists of TokenContexts, one inner list per line.

    Returns:
        A tuple (data, blame).
            data: File contents.
            blame: A list of lists of TokenContexts, one inner list per line
                of |data|.
    """
    blame = tokenization_method(data)

    # Shallow copy: the outer list is frozen for the caller while the inner
    # lists and their TokenContexts stay shared, so the commit attributions
    # made below show up in the returned value.
    uber_blame = (data, blame[:])

    for commit in generate_commits(git_log_stdout):
        if should_skip_commit(commit):
            continue

        # Chunk headers refer to line numbers of the file as of this
        # commit; |offset| accounts for chunks of the same commit that have
        # already been rewound and changed the length of |blame|.
        offset = 0
        for (added_lines_start, added_lines_end, removed_lines,
             removed_lines_start) in parse_chunks_from_diff(commit.diff):
            added_lines_start += offset
            added_lines_end += offset
            previous_contexts = [
                token_lines
                for line_previous in removed_lines
                for token_lines in tokenization_method(line_previous)
            ]
            previous_tokens = [[context.token for context in contexts]
                               for contexts in previous_contexts]
            current_contexts = blame[added_lines_start:added_lines_end]
            current_tokens = [[context.token for context in contexts]
                              for contexts in current_contexts]
            added_token_positions, changed_token_positions = (
                compute_changed_token_positions(previous_tokens,
                                                current_tokens))
            # Tokens introduced by this chunk belong to this commit.
            for r, c in added_token_positions:
                current_contexts[r][c].commit = commit
            # Tokens that already existed carry their TokenContext over to
            # the pre-commit version of the lines.
            for r, c in changed_token_positions:
                pr, pc = changed_token_positions[(r, c)]
                previous_contexts[pr][pc] = current_contexts[r][c]

            assert added_lines_start <= added_lines_end <= len(blame)
            current_blame_size = len(blame)
            blame[added_lines_start:added_lines_end] = previous_contexts
            offset += len(blame) - current_blame_size

    # Sanity check: once every commit has been rewound, the working blame
    # is expected to have shrunk to nothing.
    assert blame == [] or blame == [[]]
    return uber_blame
				407
				408
def uberblame(file_name, revision, tokenization_method):
    """Computes the uberblame of file |file_name|.

    Args:
        file_name: File to uberblame.
        revision: The revision to start the uberblame at.
        tokenization_method: A function that takes a string and returns a
            list of lists of TokenContexts, one inner list per line.

    Returns:
        A tuple (data, blame).
            data: File contents.
            blame: A list of lists of TokenContexts, one inner list per line
                of |data|.

    Raises:
        subprocess.CalledProcessError: If 'git show' or 'git log' exits with
            a non-zero status.  For 'git log' the captured stderr is
            attached as the error's output.
    """
    cmd_git_log = [
        'git', 'log', '--minimal', '--no-prefix', '--follow', '-m',
        '--first-parent', '-p', '-U0', '-z',
        '--format=%x00%H%x00%an%x00%ae%x00%ad%x00%B', revision, '--', file_name
    ]
    # Fetch the file contents before spawning 'git log' so that a failure
    # here cannot leave a child process behind.
    data = subprocess.check_output(
        ['git', 'show', '%s:%s' % (revision, file_name)])

    # stderr goes to a temporary file rather than a pipe: nothing reads it
    # while stdout is being consumed, so a full stderr pipe could otherwise
    # block git forever.
    with tempfile.TemporaryFile() as stderr_file:
        git_log = subprocess.Popen(
            cmd_git_log, stdout=subprocess.PIPE, stderr=stderr_file)
        try:
            data, blame = uberblame_aux(file_name, git_log.stdout, data,
                                        tokenization_method)
        finally:
            # Always reap the child, including when the blame computation
            # raised part-way through the log.
            git_log.stdout.close()
            returncode = git_log.wait()
        if returncode != 0:
            stderr_file.seek(0)
            raise subprocess.CalledProcessError(returncode, cmd_git_log,
                                                stderr_file.read())
    return data, blame
				439
				440
				441	def generate_pastel_color():
Tom Anderson	b3d7e64	2018-04-13 16:23:42	[diff] [blame]	442	"""Generates a random color from a nice looking pastel palette.
				443
				444	Returns:
				445	The color, formatted as hex string. For example, white is "#FFFFFF".
				446	"""
				447	(h, l, s) = (random.uniform(0, 1), random.uniform(0.8, 0.9), random.uniform(
				448	0.5, 1))
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	449	(r, g, b) = colorsys.hls_to_rgb(h, l, s)
Tom Anderson	b3d7e64	2018-04-13 16:23:42	[diff] [blame]	450	return "#%0.2X%0.2X%0.2X" % (int(r * 255), int(g * 255), int(b * 255))
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	451
				452
def create_visualization(data, blame):
    """Creates a web page to visualize |blame|.

    Every run of consecutive tokens attributed to the same commit is wrapped
    in a colored <span>; clicking it shows that commit's details in the
    right-hand pane.

    Args:
        data: The data file as returned by uberblame().
        blame: A list of lists of TokenContexts as returned by uberblame().
            Every TokenContext must have its commit set.

    Returns:
        The html for the generated page, as a string.
    """
    # cgi.escape() does not exist in current Python 3 releases; prefer
    # html.escape() and fall back to cgi on Python 2.
    try:
        from html import escape as escape_html
    except ImportError:
        from cgi import escape as escape_html

    # Use the same seed for the color generator on each run so that
    # loading the same blame of the same file twice will result in the
    # same generated HTML page.
    random.seed(0x52937865ec62d1ea)
    html = """\
    <html>
      <head>
        <style>
          body {
            font-family: "Courier New";
          }
          pre {
            display: inline;
          }
          span {
            outline: 1pt solid #00000030;
            outline-offset: -1pt;
            cursor: pointer;
          }
          #linenums {
            text-align: right;
          }
          #file_display {
            position: absolute;
            left: 0;
            top: 0;
            width: 50%%;
            height: 100%%;
            overflow: scroll;
          }
          #commit_display_container {
            position: absolute;
            left: 50%%;
            top: 0;
            width: 50%%;
            height: 100%%;
            overflow: scroll;
          }
        </style>
        <script>
          commit_data = %s;
          function display_commit(hash) {
            var e = document.getElementById("commit_display");
            e.innerHTML = commit_data[hash]
          }
        </script>
      </head>
      <body>
        <div id="file_display">
          <table>
            <tbody>
              <tr>
                <td valign="top" id="linenums">
                  <pre>%s</pre>
                </td>
                <td valign="top">
                  <pre>%s</pre>
                </td>
              </tr>
            </tbody>
          </table>
        </div>
        <div id="commit_display_container" valign="top">
          <pre id="commit_display" />
        </div>
      </body>
    </html>
    """
    html = textwrap.dedent(html)
    commits = {}
    lines = []
    commit_colors = {}
    blame_index = 0
    blame = [context for contexts in blame for context in contexts]
    row = 0
    lastline = ''
    for line in data.split('\n'):
        lastline = line
        column = 0
        for c in line + '\n':
            if blame_index < len(blame):
                token_context = blame[blame_index]
                if (row == token_context.row and
                        column == token_context.column +
                        len(token_context.token)):
                    # Just past the end of the current token: close the span
                    # unless the next token belongs to the same commit.
                    if (blame_index + 1 == len(blame) or
                            blame[blame_index].commit.hash !=
                            blame[blame_index + 1].commit.hash):
                        lines.append('</span>')
                    blame_index += 1
            if blame_index < len(blame):
                token_context = blame[blame_index]
                if row == token_context.row and column == token_context.column:
                    # At the start of a token: open a span unless one for
                    # the same commit is still open.
                    if (blame_index == 0 or
                            blame[blame_index - 1].commit.hash !=
                            blame[blame_index].commit.hash):
                        commit_hash = token_context.commit.hash
                        commits[commit_hash] = token_context.commit
                        if commit_hash not in commit_colors:
                            commit_colors[commit_hash] = (
                                generate_pastel_color())
                        color = commit_colors[commit_hash]
                        # The quotes around the hash are written as &quot;
                        # because a raw '"' would end the onclick attribute
                        # early.
                        lines.append(
                            ('<span style="background-color: %s" ' +
                             'onclick="display_commit(&quot;%s&quot;)">') %
                            (color, commit_hash))
            lines.append(escape_html(c, quote=False))
            column += 1
        row += 1
    commit_data = ['{']
    commit_display_format = """\
    commit: {hash}
    Author: {author_name} <{author_email}>
    Date: {author_date}

    {message}
    """
    commit_display_format = textwrap.dedent(commit_display_format)
    links = re.compile(r'(https?:\/\/\S+)')
    for commit_hash in commits:
        commit = commits[commit_hash]
        commit_display = commit_display_format.format(
            hash=commit_hash,
            author_name=commit.author_name,
            author_email=commit.author_email,
            author_date=commit.author_date,
            message=commit.message)
        commit_display = escape_html(commit_display, quote=True)
        # The text ends up inside a double-quoted JavaScript string literal:
        # double the backslashes first so commit text cannot form (or
        # break) escape sequences, then add our own escapes.
        commit_display = commit_display.replace('\\', '\\\\')
        commit_display = links.sub('<a href=\\"\\1\\">\\1</a>',
                                   commit_display)
        commit_display = commit_display.replace('\r', '\\r')
        commit_display = commit_display.replace('\n', '\\n')
        commit_data.append('"%s": "%s",' % (commit_hash, commit_display))
    commit_data.append('}')
    commit_data = ''.join(commit_data)
    # No line number is emitted for a blank or whitespace-only final line.
    line_nums = range(1, row if lastline.strip() == '' else row + 1)
    line_nums = '\n'.join([str(num) for num in line_nums])
    lines = ''.join(lines)
    return html % (commit_data, line_nums, lines)
Tom Anderson	c3ed896	2017-10-09 19:01:46	[diff] [blame]	595
				596
				597	def show_visualization(html):
				598	"""Display \|html\| in a web browser.
				599
				600	Args:
				601	html: The contents of the file to display, as a string.
				602	"""
				603	# Keep the temporary file around so the browser has time to open it.
				604	# TODO(thomasanderson): spin up a temporary web server to serve this
				605	# file so we don't have to leak it.
				606	html_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
				607	html_file.write(html)
				608	html_file.flush()
				609	if sys.platform.startswith('linux'):
				610	# Don't show any messages when starting the browser.
				611	saved_stdout = os.dup(1)
				612	saved_stderr = os.dup(2)
				613	os.close(1)
				614	os.close(2)
				615	os.open(os.devnull, os.O_RDWR)
				616	os.open(os.devnull, os.O_RDWR)
				617	webbrowser.open('file://' + html_file.name)
				618	if sys.platform.startswith('linux'):
				619	os.dup2(saved_stdout, 1)
				620	os.dup2(saved_stderr, 2)
				621	os.close(saved_stdout)
				622	os.close(saved_stderr)
				623
				624
def main(argv):
    """Command-line entry point.

    Parses |argv|, computes the uberblame of the requested file, builds the
    visualization page and, unless --skip-visualization was given, opens it
    in a web browser.

    Args:
        argv: The command-line arguments, without the program name.

    Returns:
        0, to be used as the process exit status.
    """
    parser = argparse.ArgumentParser(
        description='Show what revision last modified each token of a file.')
    parser.add_argument(
        'revision',
        default='HEAD',
        nargs='?',
        help='show only commits starting from a revision')
    parser.add_argument('file', help='the file to uberblame')
    boolean_flags = (
        ('--skip-visualization',
         'do not display the blame visualization in a web browser'),
        ('--tokenize-by-char', 'treat individual characters as tokens'),
        ('--tokenize-whitespace',
         'also blame non-newline whitespace characters'),
    )
    for flag, description in boolean_flags:
        parser.add_argument(flag, action='store_true', help=description)
    args = parser.parse_args(argv)

    def tokenization_method(text):
        # Bind the tokenizer options chosen on the command line.
        return tokenize_data(text, args.tokenize_by_char,
                             args.tokenize_whitespace)

    data, blame = uberblame(args.file, args.revision, tokenization_method)
    html = create_visualization(data, blame)
    if args.skip_visualization:
        return 0
    show_visualization(html)
    return 0
				656
				657
# Script entry point: pass the command-line arguments (without the program
# name) to main() and use its return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))