[email protected]867e5b52013-03-13 21:43:511#!/usr/bin/env python
2# Copyright (c) 2012 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Uploads files to Google Storage content addressed."""
7
8import hashlib
9import optparse
10import os
11import Queue
12import re
[email protected]ba63bcb2013-10-28 19:55:4813import stat
[email protected]867e5b52013-03-13 21:43:5114import sys
[email protected]92cd7b02015-08-18 05:53:5515import tarfile
[email protected]867e5b52013-03-13 21:43:5116import threading
17import time
18
[email protected]867e5b52013-03-13 21:43:5119from download_from_google_storage import get_sha1
20from download_from_google_storage import Gsutil
Nico Weber87677e92017-10-09 19:42:4421from download_from_google_storage import PrinterThread
[email protected]199bc5f2014-12-17 02:17:1422from download_from_google_storage import GSUTIL_DEFAULT_PATH
[email protected]867e5b52013-03-13 21:43:5123
USAGE_STRING = """%prog [options] target [target2 ...].
Target is the file intended to be uploaded to Google Storage.
If target is "-", a list of files will be taken from standard input.

This script will generate a file (original filename).sha1 containing the
sha1 sum of the uploaded file.
It is recommended that the .sha1 file be checked into the repository,
the original file removed from the repository, and a hook added to the
DEPS file to call download_from_google_storage.py.

Example usage
-------------

Scan the current directory and upload all files larger than 1MB:
find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -b bkt -
(Replace "bkt" with the name of a writable bucket.)
"""


def get_md5(filename):
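  """Returns the MD5 hex digest of a file, read in 1 MB chunks."""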
  md5_calculator = hashlib.md5()
  with open(filename, 'rb') as f:
    while True:
      chunk = f.read(1024*1024)
      if not chunk:
        break
      md5_calculator.update(chunk)
  return md5_calculator.hexdigest()


def get_md5_cached(filename):
  """Don't calculate the MD5 if we can find a .md5 file."""
  # See if we can find an existing MD5 sum stored in a file.
  if os.path.exists('%s.md5' % filename):
    with open('%s.md5' % filename, 'rb') as f:
      md5_match = re.search('([a-z0-9]{32})', f.read())
      if md5_match:
        return md5_match.group(1)
  # No cached sum was found (or the .md5 file was invalid), so calculate
  # the hash and cache it alongside the file.
  md5_hash = get_md5(filename)
  with open('%s.md5' % filename, 'wb') as f:
    f.write(md5_hash)
  return md5_hash


def _upload_worker(
    thread_num, upload_queue, base_url, gsutil, md5_lock, force,
    use_md5, stdout_queue, ret_codes, gzip):
[email protected]867e5b52013-03-13 21:43:5172 while True:
73 filename, sha1_sum = upload_queue.get()
74 if not filename:
75 break
76 file_url = '%s/%s' % (base_url, sha1_sum)
77 if gsutil.check_call('ls', file_url)[0] == 0 and not force:
78 # File exists, check MD5 hash.
[email protected]b180ded2016-03-29 03:27:4179 _, out, _ = gsutil.check_call_with_retries('ls', '-L', file_url)
[email protected]867e5b52013-03-13 21:43:5180 etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)
81 if etag_match:
82 remote_md5 = etag_match.group(1)
83 # Calculate the MD5 checksum to match it to Google Storage's ETag.
84 with md5_lock:
85 if use_md5:
86 local_md5 = get_md5_cached(filename)
87 else:
88 local_md5 = get_md5(filename)
89 if local_md5 == remote_md5:
90 stdout_queue.put(
91 '%d> File %s already exists and MD5 matches, upload skipped' %
92 (thread_num, filename))
93 continue
94 stdout_queue.put('%d> Uploading %s...' % (
95 thread_num, filename))
[email protected]364876e2015-04-03 14:14:1896 gsutil_args = ['cp']
97 if gzip:
98 gsutil_args.extend(['-z', gzip])
99 gsutil_args.extend([filename, file_url])
[email protected]b180ded2016-03-29 03:27:41100 code, _, err = gsutil.check_call_with_retries(*gsutil_args)
[email protected]867e5b52013-03-13 21:43:51101 if code != 0:
102 ret_codes.put(
103 (code,
104 'Encountered error on uploading %s to %s\n%s' %
105 (filename, file_url, err)))
106 continue
107
[email protected]ba63bcb2013-10-28 19:55:48108 # Mark executable files with the header "x-goog-meta-executable: 1" which
109 # the download script will check for to preserve the executable bit.
110 if not sys.platform.startswith('win'):
111 if os.stat(filename).st_mode & stat.S_IEXEC:
[email protected]b180ded2016-03-29 03:27:41112 code, _, err = gsutil.check_call_with_retries(
113 'setmeta', '-h', 'x-goog-meta-executable:1', file_url)
114 if not code:
[email protected]ba63bcb2013-10-28 19:55:48115 ret_codes.put(
116 (code,
117 'Encountered error on setting metadata on %s\n%s' %
118 (file_url, err)))
119
[email protected]867e5b52013-03-13 21:43:51120
def get_targets(args, parser, use_null_terminator):
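  """Returns the target list from args, or from stdin when the sole arg is '-'."""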
  if not args:
    parser.error('Missing target.')

  if len(args) == 1 and args[0] == '-':
    # Take stdin as a newline- or null-separated list of files.
    if use_null_terminator:
      return sys.stdin.read().split('\0')
    else:
      return sys.stdin.read().splitlines()
  else:
    return args


def upload_to_google_storage(
    input_filenames, base_url, gsutil, force,
    use_md5, num_threads, skip_hashing, gzip):
[email protected]867e5b52013-03-13 21:43:51138 # We only want one MD5 calculation happening at a time to avoid HD thrashing.
139 md5_lock = threading.Lock()
140
141 # Start up all the worker threads plus the printer thread.
142 all_threads = []
143 ret_codes = Queue.Queue()
144 ret_codes.put((0, None))
145 upload_queue = Queue.Queue()
146 upload_timer = time.time()
147 stdout_queue = Queue.Queue()
Nico Weber87677e92017-10-09 19:42:44148 printer_thread = PrinterThread(stdout_queue)
[email protected]867e5b52013-03-13 21:43:51149 printer_thread.daemon = True
150 printer_thread.start()
151 for thread_num in range(num_threads):
152 t = threading.Thread(
153 target=_upload_worker,
154 args=[thread_num, upload_queue, base_url, gsutil, md5_lock,
[email protected]364876e2015-04-03 14:14:18155 force, use_md5, stdout_queue, ret_codes, gzip])
[email protected]867e5b52013-03-13 21:43:51156 t.daemon = True
157 t.start()
158 all_threads.append(t)
159
160 # We want to hash everything in a single thread since its faster.
161 # The bottleneck is in disk IO, not CPU.
162 hashing_start = time.time()
163 for filename in input_filenames:
164 if not os.path.exists(filename):
165 stdout_queue.put('Main> Error: %s not found, skipping.' % filename)
166 continue
167 if os.path.exists('%s.sha1' % filename) and skip_hashing:
168 stdout_queue.put(
169 'Main> Found hash for %s, sha1 calculation skipped.' % filename)
170 with open(filename + '.sha1', 'rb') as f:
171 sha1_file = f.read(1024)
172 if not re.match('^([a-z0-9]{40})$', sha1_file):
173 print >> sys.stderr, 'Invalid sha1 hash file %s.sha1' % filename
174 return 1
175 upload_queue.put((filename, sha1_file))
176 continue
177 stdout_queue.put('Main> Calculating hash for %s...' % filename)
178 sha1_sum = get_sha1(filename)
179 with open(filename + '.sha1', 'wb') as f:
180 f.write(sha1_sum)
181 stdout_queue.put('Main> Done calculating hash for %s.' % filename)
182 upload_queue.put((filename, sha1_sum))
183 hashing_duration = time.time() - hashing_start
184
185 # Wait for everything to finish.
186 for _ in all_threads:
187 upload_queue.put((None, None)) # To mark the end of the work queue.
188 for t in all_threads:
189 t.join()
190 stdout_queue.put(None)
191 printer_thread.join()
192
193 # Print timing information.
194 print 'Hashing %s files took %1f seconds' % (
195 len(input_filenames), hashing_duration)
196 print 'Uploading took %1f seconds' % (time.time() - upload_timer)
197
198 # See if we ran into any errors.
199 max_ret_code = 0
200 for ret_code, message in ret_codes.queue:
201 max_ret_code = max(ret_code, max_ret_code)
202 if message:
203 print >> sys.stderr, message
204
205 if not max_ret_code:
206 print 'Success!'
207
208 return max_ret_code
209
210
[email protected]92cd7b02015-08-18 05:53:55211def create_archives(dirs):
  archive_names = []
  for name in dirs:
    tarname = '%s.tar.gz' % name
    with tarfile.open(tarname, 'w:gz') as tar:
      tar.add(name)
    archive_names.append(tarname)
  return archive_names


def validate_archive_dirs(dirs):
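  """Returns True iff each entry is a non-symlink directory just below cwd."""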
  # We don't allow .. in paths in our archives.
  if any('..' in x for x in dirs):
    return False
  # We only allow dirs.
  if any(not os.path.isdir(x) for x in dirs):
    return False
  # We don't allow sym links in our archives.
  if any(map(os.path.islink, dirs)):
    return False
  # We require that the subdirectories we are archiving are all just below
  # cwd.
  return not any(x not in next(os.walk('.'))[1] for x in dirs)


[email protected]013731e2015-02-26 18:28:43236def main():
[email protected]867e5b52013-03-13 21:43:51237 parser = optparse.OptionParser(USAGE_STRING)
238 parser.add_option('-b', '--bucket',
239 help='Google Storage bucket to upload to.')
240 parser.add_option('-e', '--boto', help='Specify a custom boto file.')
[email protected]92cd7b02015-08-18 05:53:55241 parser.add_option('-a', '--archive', action='store_true',
242 help='Archive directory as a tar.gz file')
[email protected]867e5b52013-03-13 21:43:51243 parser.add_option('-f', '--force', action='store_true',
244 help='Force upload even if remote file exists.')
245 parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
246 help='Path to the gsutil script.')
247 parser.add_option('-m', '--use_md5', action='store_true',
248 help='Generate MD5 files when scanning, and don\'t check '
249 'the MD5 checksum if a .md5 file is found.')
250 parser.add_option('-t', '--num_threads', default=1, type='int',
251 help='Number of uploader threads to run.')
252 parser.add_option('-s', '--skip_hashing', action='store_true',
253 help='Skip hashing if .sha1 file exists.')
254 parser.add_option('-0', '--use_null_terminator', action='store_true',
255 help='Use \\0 instead of \\n when parsing '
256 'the file list from stdin. This is useful if the input '
257 'is coming from "find ... -print0".')
[email protected]364876e2015-04-03 14:14:18258 parser.add_option('-z', '--gzip', metavar='ext',
259 help='Gzip files which end in ext. '
260 'ext is a comma-separated list')
[email protected]867e5b52013-03-13 21:43:51261 (options, args) = parser.parse_args()
262
263 # Enumerate our inputs.
264 input_filenames = get_targets(args, parser, options.use_null_terminator)
265
[email protected]92cd7b02015-08-18 05:53:55266 if options.archive:
267 if not validate_archive_dirs(input_filenames):
268 parser.error('Only directories just below cwd are valid entries when '
269 'using the --archive argument. Entries can not contain .. '
270 ' and entries can not be symlinks. Entries was %s' %
271 input_filenames)
272 return 1
273 input_filenames = create_archives(input_filenames)
274
[email protected]867e5b52013-03-13 21:43:51275 # Make sure we can find a working instance of gsutil.
276 if os.path.exists(GSUTIL_DEFAULT_PATH):
[email protected]199bc5f2014-12-17 02:17:14277 gsutil = Gsutil(GSUTIL_DEFAULT_PATH, boto_path=options.boto)
[email protected]867e5b52013-03-13 21:43:51278 else:
279 gsutil = None
280 for path in os.environ["PATH"].split(os.pathsep):
281 if os.path.exists(path) and 'gsutil' in os.listdir(path):
[email protected]e4d906f2013-06-28 00:24:52282 gsutil = Gsutil(os.path.join(path, 'gsutil'), boto_path=options.boto)
[email protected]867e5b52013-03-13 21:43:51283 if not gsutil:
284 parser.error('gsutil not found in %s, bad depot_tools checkout?' %
285 GSUTIL_DEFAULT_PATH)
286
[email protected]51259522014-12-05 00:03:01287 base_url = 'gs://%s' % options.bucket
288
[email protected]867e5b52013-03-13 21:43:51289 return upload_to_google_storage(
290 input_filenames, base_url, gsutil, options.force, options.use_md5,
[email protected]364876e2015-04-03 14:14:18291 options.num_threads, options.skip_hashing, options.gzip)
[email protected]867e5b52013-03-13 21:43:51292
293
294if __name__ == '__main__':
[email protected]013731e2015-02-26 18:28:43295 try:
296 sys.exit(main())
297 except KeyboardInterrupt:
298 sys.stderr.write('interrupted\n')
299 sys.exit(1)