Blame - upload_to_google_storage.py - chromium/tools/depot_tools.git

blob: 031214d6af9e644d05dd990a95c5b4424b445eef [file] [log] [blame]

[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	1	#!/usr/bin/env python
				2	# Copyright (c) 2012 The Chromium Authors. All rights reserved.
				3	# Use of this source code is governed by a BSD-style license that can be
				4	# found in the LICENSE file.
				5
				6	"""Uploads files to Google Storage content addressed."""
				7
				8	import hashlib
				9	import optparse
				10	import os
				11	import Queue
				12	import re
[email protected]	ba63bcb	2013-10-28 19:55:48	[diff] [blame]	13	import stat
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	14	import sys
[email protected]	92cd7b0	2015-08-18 05:53:55	[diff] [blame]	15	import tarfile
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	16	import threading
				17	import time
				18
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	19	from download_from_google_storage import get_sha1
				20	from download_from_google_storage import Gsutil
Nico Weber	87677e9	2017-10-09 19:42:44	[diff] [blame]	21	from download_from_google_storage import PrinterThread
[email protected]	199bc5f	2014-12-17 02:17:14	[diff] [blame]	22	from download_from_google_storage import GSUTIL_DEFAULT_PATH
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	23
				24	USAGE_STRING = """%prog [options] target [target2 ...].
				25	Target is the file intended to be uploaded to Google Storage.
				26	If target is "-", then a list of files will be taken from standard input
				27
				28	This script will generate a file (original filename).sha1 containing the
				29	sha1 sum of the uploaded file.
				30	It is recommended that the .sha1 file is checked into the repository,
				31	the original file removed from the repository, and a hook added to the
				32	DEPS file to call download_from_google_storage.py.
				33
				34	Example usages
				35	--------------
				36
				37	Scan the current directory and upload all files larger than 1MB:
				38	find . -name .svn -prune -o -size +1000k -type f -print0 \| %prog -0 -b bkt -
				39	(Replace "bkt" with the name of a writable bucket.)
				40	"""
				41
				42
				43	def get_md5(filename):
				44	md5_calculator = hashlib.md5()
				45	with open(filename, 'rb') as f:
				46	while True:
				47	chunk = f.read(1024*1024)
				48	if not chunk:
				49	break
				50	md5_calculator.update(chunk)
				51	return md5_calculator.hexdigest()
				52
				53
				54	def get_md5_cached(filename):
				55	"""Don't calculate the MD5 if we can find a .md5 file."""
				56	# See if we can find an existing MD5 sum stored in a file.
				57	if os.path.exists('%s.md5' % filename):
				58	with open('%s.md5' % filename, 'rb') as f:
				59	md5_match = re.search('([a-z0-9]{32})', f.read())
				60	if md5_match:
				61	return md5_match.group(1)
				62	else:
				63	md5_hash = get_md5(filename)
				64	with open('%s.md5' % filename, 'wb') as f:
				65	f.write(md5_hash)
				66	return md5_hash
				67
				68
				69	def _upload_worker(
				70	thread_num, upload_queue, base_url, gsutil, md5_lock, force,
[email protected]	364876e	2015-04-03 14:14:18	[diff] [blame]	71	use_md5, stdout_queue, ret_codes, gzip):
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	72	while True:
				73	filename, sha1_sum = upload_queue.get()
				74	if not filename:
				75	break
				76	file_url = '%s/%s' % (base_url, sha1_sum)
				77	if gsutil.check_call('ls', file_url)[0] == 0 and not force:
				78	# File exists, check MD5 hash.
[email protected]	b180ded	2016-03-29 03:27:41	[diff] [blame]	79	_, out, _ = gsutil.check_call_with_retries('ls', '-L', file_url)
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	80	etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)
				81	if etag_match:
				82	remote_md5 = etag_match.group(1)
				83	# Calculate the MD5 checksum to match it to Google Storage's ETag.
				84	with md5_lock:
				85	if use_md5:
				86	local_md5 = get_md5_cached(filename)
				87	else:
				88	local_md5 = get_md5(filename)
				89	if local_md5 == remote_md5:
				90	stdout_queue.put(
				91	'%d> File %s already exists and MD5 matches, upload skipped' %
				92	(thread_num, filename))
				93	continue
				94	stdout_queue.put('%d> Uploading %s...' % (
				95	thread_num, filename))
[email protected]	364876e	2015-04-03 14:14:18	[diff] [blame]	96	gsutil_args = ['cp']
				97	if gzip:
				98	gsutil_args.extend(['-z', gzip])
				99	gsutil_args.extend([filename, file_url])
[email protected]	b180ded	2016-03-29 03:27:41	[diff] [blame]	100	code, _, err = gsutil.check_call_with_retries(*gsutil_args)
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	101	if code != 0:
				102	ret_codes.put(
				103	(code,
				104	'Encountered error on uploading %s to %s\n%s' %
				105	(filename, file_url, err)))
				106	continue
				107
[email protected]	ba63bcb	2013-10-28 19:55:48	[diff] [blame]	108	# Mark executable files with the header "x-goog-meta-executable: 1" which
				109	# the download script will check for to preserve the executable bit.
				110	if not sys.platform.startswith('win'):
				111	if os.stat(filename).st_mode & stat.S_IEXEC:
[email protected]	b180ded	2016-03-29 03:27:41	[diff] [blame]	112	code, _, err = gsutil.check_call_with_retries(
				113	'setmeta', '-h', 'x-goog-meta-executable:1', file_url)
				114	if not code:
[email protected]	ba63bcb	2013-10-28 19:55:48	[diff] [blame]	115	ret_codes.put(
				116	(code,
				117	'Encountered error on setting metadata on %s\n%s' %
				118	(file_url, err)))
				119
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	120
				121	def get_targets(args, parser, use_null_terminator):
				122	if not args:
				123	parser.error('Missing target.')
				124
				125	if len(args) == 1 and args[0] == '-':
qyearsley	12fa6ff	2016-08-24 16:18:40	[diff] [blame]	126	# Take stdin as a newline or null separated list of files.
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	127	if use_null_terminator:
				128	return sys.stdin.read().split('\0')
				129	else:
				130	return sys.stdin.read().splitlines()
				131	else:
				132	return args
				133
				134
				135	def upload_to_google_storage(
				136	input_filenames, base_url, gsutil, force,
[email protected]	364876e	2015-04-03 14:14:18	[diff] [blame]	137	use_md5, num_threads, skip_hashing, gzip):
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	138	# We only want one MD5 calculation happening at a time to avoid HD thrashing.
				139	md5_lock = threading.Lock()
				140
				141	# Start up all the worker threads plus the printer thread.
				142	all_threads = []
				143	ret_codes = Queue.Queue()
				144	ret_codes.put((0, None))
				145	upload_queue = Queue.Queue()
				146	upload_timer = time.time()
				147	stdout_queue = Queue.Queue()
Nico Weber	87677e9	2017-10-09 19:42:44	[diff] [blame]	148	printer_thread = PrinterThread(stdout_queue)
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	149	printer_thread.daemon = True
				150	printer_thread.start()
				151	for thread_num in range(num_threads):
				152	t = threading.Thread(
				153	target=_upload_worker,
				154	args=[thread_num, upload_queue, base_url, gsutil, md5_lock,
[email protected]	364876e	2015-04-03 14:14:18	[diff] [blame]	155	force, use_md5, stdout_queue, ret_codes, gzip])
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	156	t.daemon = True
				157	t.start()
				158	all_threads.append(t)
				159
				160	# We want to hash everything in a single thread since its faster.
				161	# The bottleneck is in disk IO, not CPU.
				162	hashing_start = time.time()
				163	for filename in input_filenames:
				164	if not os.path.exists(filename):
				165	stdout_queue.put('Main> Error: %s not found, skipping.' % filename)
				166	continue
				167	if os.path.exists('%s.sha1' % filename) and skip_hashing:
				168	stdout_queue.put(
				169	'Main> Found hash for %s, sha1 calculation skipped.' % filename)
				170	with open(filename + '.sha1', 'rb') as f:
				171	sha1_file = f.read(1024)
				172	if not re.match('^([a-z0-9]{40})$', sha1_file):
				173	print >> sys.stderr, 'Invalid sha1 hash file %s.sha1' % filename
				174	return 1
				175	upload_queue.put((filename, sha1_file))
				176	continue
				177	stdout_queue.put('Main> Calculating hash for %s...' % filename)
				178	sha1_sum = get_sha1(filename)
				179	with open(filename + '.sha1', 'wb') as f:
				180	f.write(sha1_sum)
				181	stdout_queue.put('Main> Done calculating hash for %s.' % filename)
				182	upload_queue.put((filename, sha1_sum))
				183	hashing_duration = time.time() - hashing_start
				184
				185	# Wait for everything to finish.
				186	for _ in all_threads:
				187	upload_queue.put((None, None)) # To mark the end of the work queue.
				188	for t in all_threads:
				189	t.join()
				190	stdout_queue.put(None)
				191	printer_thread.join()
				192
				193	# Print timing information.
				194	print 'Hashing %s files took %1f seconds' % (
				195	len(input_filenames), hashing_duration)
				196	print 'Uploading took %1f seconds' % (time.time() - upload_timer)
				197
				198	# See if we ran into any errors.
				199	max_ret_code = 0
				200	for ret_code, message in ret_codes.queue:
				201	max_ret_code = max(ret_code, max_ret_code)
				202	if message:
				203	print >> sys.stderr, message
				204
				205	if not max_ret_code:
				206	print 'Success!'
				207
				208	return max_ret_code
				209
				210
def create_archives(dirs):
  """Packs every directory in |dirs| into a gzip-compressed tarball.

  Each directory "<name>" is written to "<name>.tar.gz". Returns the archive
  file names in the same order as |dirs|.
  """
  def _pack(directory):
    # Build one archive and hand back the path it was written to.
    archive_path = '%s.tar.gz' % directory
    with tarfile.open(archive_path, 'w:gz') as archive:
      archive.add(directory)
    return archive_path

  return [_pack(directory) for directory in dirs]
				219
				220
				221	def validate_archive_dirs(dirs):
				222	# We don't allow .. in paths in our archives.
				223	if any(map(lambda x: '..' in x, dirs)):
				224	return False
				225	# We only allow dirs.
				226	if any(map(lambda x: not os.path.isdir(x), dirs)):
				227	return False
				228	# We don't allow sym links in our archives.
				229	if any(map(os.path.islink, dirs)):
				230	return False
				231	# We required that the subdirectories we are archiving are all just below
				232	# cwd.
				233	return not any(map(lambda x: x not in next(os.walk('.'))[1], dirs))
				234
				235
def main():
  """Parses the command line and uploads the requested targets.

  Returns:
    The exit code produced by upload_to_google_storage(). Invalid usage is
    reported through parser.error(), which exits the process.
  """
  parser = optparse.OptionParser(USAGE_STRING)
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to upload to.')
  parser.add_option('-e', '--boto', help='Specify a custom boto file.')
  parser.add_option('-a', '--archive', action='store_true',
                    help='Archive directory as a tar.gz file')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force upload even if remote file exists.')
  parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
                    help='Path to the gsutil script.')
  parser.add_option('-m', '--use_md5', action='store_true',
                    help='Generate MD5 files when scanning, and don\'t check '
                    'the MD5 checksum if a .md5 file is found.')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of uploader threads to run.')
  parser.add_option('-s', '--skip_hashing', action='store_true',
                    help='Skip hashing if .sha1 file exists.')
  parser.add_option('-0', '--use_null_terminator', action='store_true',
                    help='Use \\0 instead of \\n when parsing '
                    'the file list from stdin.  This is useful if the input '
                    'is coming from "find ... -print0".')
  parser.add_option('-z', '--gzip', metavar='ext',
                    help='Gzip files which end in ext. '
                    'ext is a comma-separated list')
  (options, args) = parser.parse_args()

  # Enumerate our inputs.
  input_filenames = get_targets(args, parser, options.use_null_terminator)

  # Without a bucket the destination would silently become "gs://None".
  if not options.bucket:
    parser.error('Missing bucket.  Specify bucket with --bucket.')

  if options.archive:
    if not validate_archive_dirs(input_filenames):
      # parser.error() prints the message and exits.
      parser.error('Only directories just below cwd are valid entries when '
                   'using the --archive argument. Entries can not contain .. '
                   ' and entries can not be symlinks. Entries was %s' %
                   input_filenames)
    input_filenames = create_archives(input_filenames)

  # Make sure we can find a working instance of gsutil. Honor --gsutil_path
  # (which defaults to GSUTIL_DEFAULT_PATH) before falling back to PATH.
  gsutil = None
  if os.path.exists(options.gsutil_path):
    gsutil = Gsutil(options.gsutil_path, boto_path=options.boto)
  else:
    for path in os.environ.get('PATH', '').split(os.pathsep):
      if not path:
        continue
      candidate = os.path.join(path, 'gsutil')
      # isfile() never raises, unlike listing a PATH entry that is unreadable
      # or not a directory.
      if os.path.isfile(candidate):
        gsutil = Gsutil(candidate, boto_path=options.boto)
        # The first match on PATH wins, as it would in a shell.
        break
  if not gsutil:
    parser.error('gsutil not found in %s, bad depot_tools checkout?' %
                 options.gsutil_path)

  base_url = 'gs://%s' % options.bucket

  return upload_to_google_storage(
      input_filenames, base_url, gsutil, options.force, options.use_md5,
      options.num_threads, options.skip_hashing, options.gzip)
[email protected]	867e5b5	2013-03-13 21:43:51	[diff] [blame]	292
				293
				294	if __name__ == '__main__':
[email protected]	013731e	2015-02-26 18:28:43	[diff] [blame]	295	try:
				296	sys.exit(main())
				297	except KeyboardInterrupt:
				298	sys.stderr.write('interrupted\n')
				299	sys.exit(1)