[email protected] | 867e5b5 | 2013-03-13 21:43:51 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | """Download files from Google Storage based on SHA1 sums.""" |
| 7 | |
| 8 | |
| 9 | import hashlib |
| 10 | import optparse |
| 11 | import os |
| 12 | import Queue |
| 13 | import re |
[email protected] | ba63bcb | 2013-10-28 19:55:48 | [diff] [blame] | 14 | import stat |
[email protected] | 867e5b5 | 2013-03-13 21:43:51 | [diff] [blame] | 15 | import sys |
| 16 | import threading |
| 17 | import time |
| 18 | |
| 19 | import subprocess2 |
| 20 | |
| 21 | |
# Default gsutil location: third_party/gsutil/gsutil, resolved relative to the
# directory that contains this script.
GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.abspath(os.path.dirname(__file__)),
    'third_party',
    'gsutil',
    'gsutil')
| 25 | |
| 26 | |
class FileNotFoundError(IOError):
  """Raised when gsutil or a requested input file does not exist on disk."""
| 29 | |
| 30 | |
class InvalidFileError(IOError):
  """Raised when a .sha1 file does not contain a usable SHA1 sum."""
| 33 | |
| 34 | |
def GetNormalizedPlatform():
  """Return sys.platform, with "cygwin" reported as "win32".

  This makes a cygwin Python answer the same way a native Windows Python
  would; every other platform string is passed through untouched.
  """
  platform = sys.platform
  return 'win32' if platform == 'cygwin' else platform
| 41 | |
| 42 | |
[email protected] | 867e5b5 | 2013-03-13 21:43:51 | [diff] [blame] | 43 | # Common utilities |
class Gsutil(object):
  """Runs gsutil with a fixed script path, boto configuration and timeout.

  The settings are captured once in the constructor and reused for every
  invocation; none of the methods below modify them.
  """

  def __init__(self, path, boto_path, timeout=None):
    """Remember the gsutil settings.

    Args:
      path: Location of the gsutil script; must exist.
      boto_path: Boto config file to use. os.devnull blanks the credential
          variables, a false value falls back to the environment.
      timeout: Passed through to subprocess2 for each invocation.

    Raises:
      FileNotFoundError: if |path| does not exist.
    """
    if not os.path.exists(path):
      raise FileNotFoundError('GSUtil not found in %s' % path)
    self.path, self.boto_path, self.timeout = path, boto_path, timeout

  def get_sub_env(self):
    """Return a copy of os.environ adjusted for the configured boto file."""
    sub_env = os.environ.copy()

    if self.boto_path == os.devnull:
      # Explicitly blank both variables so no credentials are picked up.
      sub_env.update(AWS_CREDENTIAL_FILE='', BOTO_CONFIG='')
      return sub_env

    if self.boto_path:
      sub_env.update(AWS_CREDENTIAL_FILE=self.boto_path,
                     BOTO_CONFIG=self.boto_path)
      return sub_env

    # No boto file given: prefer a "<credential file>.depot_tools" sibling of
    # the current credential file (default ~/.boto) when one exists.
    fallback = os.path.expanduser(
        sub_env.get('AWS_CREDENTIAL_FILE', '~/.boto') + '.depot_tools')
    if os.path.exists(fallback):
      sub_env['AWS_CREDENTIAL_FILE'] = fallback
    return sub_env

  def call(self, *args):
    """Run gsutil with |args| and return whatever subprocess2.call returns."""
    command = (sys.executable, self.path) + args
    return subprocess2.call(command,
                            env=self.get_sub_env(),
                            timeout=self.timeout)

  def check_call(self, *args):
    """Run gsutil with |args|, capturing its output.

    Returns:
      A (code, stdout, stderr) tuple. |code| is taken from a "status=NNN"
      marker in stderr when present, is 403 or 404 when stderr contains the
      corresponding well-known gsutil message, and otherwise is the code
      reported by subprocess2.communicate.
    """
    command = (sys.executable, self.path) + args
    (out, err), code = subprocess2.communicate(
        command,
        stdout=subprocess2.PIPE,
        stderr=subprocess2.PIPE,
        env=self.get_sub_env(),
        timeout=self.timeout)

    # Derive a status code from the text gsutil wrote to stderr.
    status = re.search('status=([0-9]+)', err)
    if status:
      code = int(status.group(1))
    elif ('You are attempting to access protected data with '
          'no configured credentials.' in err):
      code = 403
    elif 'No such object' in err:
      code = 404
    return (code, out, err)
| 93 | |
| 94 | |
def check_bucket_permissions(bucket, gsutil):
  """Check that |bucket| can be listed with the given gsutil wrapper.

  Args:
    bucket: Bucket name without the gs:// prefix.
    gsutil: Object exposing check_call(*args) -> (code, stdout, stderr).

  Returns:
    A (base_url, code) tuple. |code| is 0 when the listing succeeded and the
    code reported by gsutil.check_call otherwise. When |bucket| is empty the
    result is (None, 1) and gsutil is not invoked.
  """
  if not bucket:
    # The previous message carried a stray, never-substituted "%s".
    sys.stderr.write('Missing bucket.\n')
    return (None, 1)
  base_url = 'gs://%s' % bucket

  code, _, ls_err = gsutil.check_call('ls', base_url)
  if code != 0:
    # Surface gsutil's own diagnostics before the friendlier hints below.
    sys.stderr.write('%s\n' % (ls_err,))
  if code == 403:
    sys.stderr.write(
        'Got error 403 while authenticating to %s.\n' % base_url)
    sys.stderr.write(
        'Try running "download_from_google_storage --config".\n')
  elif code == 404:
    sys.stderr.write('%s not found.\n' % base_url)
  return (base_url, code)
| 110 | |
| 111 | |
def get_sha1(filename):
  """Return the hexadecimal SHA1 digest of the file at |filename|."""
  digest = hashlib.sha1()
  with open(filename, 'rb') as stream:
    # Feed the hash 1 MiB at a time so the whole file never has to sit in
    # memory at once; read() returns an empty string at end of file.
    for block in iter(lambda: stream.read(1024 * 1024), b''):
      digest.update(block)
  return digest.hexdigest()
| 122 | |
| 123 | |
| 124 | # Download-specific code starts here |
| 125 | |
def _read_sha1_from_file(path):
  """Return the SHA1 sum stored in |path|, or None if there is no valid one."""
  with open(path, 'rb') as f:
    contents = f.read(1024).rstrip()
  if not isinstance(contents, str):
    # Binary reads yield bytes on Python 3; the pattern below is textual.
    contents = contents.decode('ascii', 'replace')
  match = re.match('^([A-Za-z0-9]{40})$', contents)
  return match.group(1) if match else None


def _strip_sha1_suffix(path):
  """Return |path| without its trailing '.sha1' extension."""
  if path.endswith('.sha1'):
    # Only the extension goes; a '.sha1' inside a directory name is kept.
    return path[:-len('.sha1')]
  # Legacy behaviour for names that do not end in '.sha1'.
  return path.replace('.sha1', '')


def enumerate_work_queue(input_filename, work_queue, directory,
                         recursive, ignore_errors, output, sha1_file):
  """Queue (sha1, output_path) download jobs and return how many were queued.

  Args:
    input_filename: A SHA1 sum, a .sha1 file, or a directory to scan,
        depending on |sha1_file| and |directory|.
    work_queue: Object with a put() method that receives the job tuples.
    directory: True if |input_filename| is a directory to scan for .sha1
        files.
    recursive: With |directory|, descend into subdirectories (skipping .svn
        and .git).
    ignore_errors: Report problems on stderr instead of raising.
    output: Destination path for single-target modes. In sha1_file mode a
        false value means "the .sha1 file name minus its extension".
    sha1_file: True if |input_filename| is a file containing a SHA1 sum.

  Returns:
    The number of jobs put on |work_queue|.

  Raises:
    FileNotFoundError: the .sha1 file is missing and |ignore_errors| is false.
    InvalidFileError: a .sha1 file holds no SHA1 sum and |ignore_errors| is
        false.
  """
  if sha1_file:
    if not os.path.exists(input_filename):
      if not ignore_errors:
        raise FileNotFoundError('%s not found.' % input_filename)
      sys.stderr.write('%s not found.\n' % input_filename)
      # Nothing to read; previously this fell through to open() and raised
      # despite ignore_errors.
      return 0
    sha1 = _read_sha1_from_file(input_filename)
    if sha1:
      # Honour an explicit output path; otherwise derive it from the input.
      work_queue.put((sha1, output or _strip_sha1_suffix(input_filename)))
      return 1
    if not ignore_errors:
      raise InvalidFileError('No sha1 sum found in %s.' % input_filename)
    sys.stderr.write('No sha1 sum found in %s.\n' % input_filename)
    return 0

  if not directory:
    # The target itself is the SHA1 sum.
    work_queue.put((input_filename, output))
    return 1

  work_queue_size = 0
  for root, dirs, files in os.walk(input_filename):
    # Editing |dirs| in place controls where os.walk descends next.
    if not recursive:
      del dirs[:]
    else:
      dirs[:] = [d for d in dirs if d not in ('.svn', '.git')]
    for filename in files:
      if not filename.endswith('.sha1'):
        continue
      full_path = os.path.join(root, filename)
      sha1 = _read_sha1_from_file(full_path)
      if sha1:
        work_queue.put((sha1, _strip_sha1_suffix(full_path)))
        work_queue_size += 1
        continue
      # Report the full path so the offending file can actually be located.
      if not ignore_errors:
        raise InvalidFileError('No sha1 sum found in %s.' % full_path)
      sys.stderr.write('No sha1 sum found in %s.\n' % full_path)
  return work_queue_size
| 171 | |
| 172 | |
| 173 | def _downloader_worker_thread(thread_num, q, force, base_url, |
[email protected] | ff7ea00 | 2013-11-25 19:28:54 | [diff] [blame] | 174 | gsutil, out_q, ret_codes, verbose): |
[email protected] | 867e5b5 | 2013-03-13 21:43:51 | [diff] [blame] | 175 | while True: |
| 176 | input_sha1_sum, output_filename = q.get() |
| 177 | if input_sha1_sum is None: |
| 178 | return |
| 179 | if os.path.exists(output_filename) and not force: |
| 180 | if get_sha1(output_filename) == input_sha1_sum: |
[email protected] | ff7ea00 | 2013-11-25 19:28:54 | [diff] [blame] | 181 | if verbose: |
| 182 | out_q.put( |
| 183 | '%d> File %s exists and SHA1 matches. Skipping.' % ( |
| 184 | thread_num, output_filename)) |
[email protected] | 867e5b5 | 2013-03-13 21:43:51 | [diff] [blame] | 185 | continue |
| 186 | # Check if file exists. |
| 187 | file_url = '%s/%s' % (base_url, input_sha1_sum) |
| 188 | if gsutil.check_call('ls', file_url)[0] != 0: |
| 189 | out_q.put('%d> File %s for %s does not exist, skipping.' % ( |
| 190 | thread_num, file_url, output_filename)) |
| 191 | ret_codes.put((1, 'File %s for %s does not exist.' % ( |
| 192 | file_url, output_filename))) |
| 193 | continue |
| 194 | # Fetch the file. |
| 195 | out_q.put('%d> Downloading %s...' % ( |
| 196 | thread_num, output_filename)) |
| 197 | code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename) |
| 198 | if code != 0: |
| 199 | out_q.put('%d> %s' % (thread_num, err)) |
| 200 | ret_codes.put((code, err)) |
| 201 | |
[email protected] | 25a33d3 | 2013-12-05 22:34:27 | [diff] [blame] | 202 | # Set executable bit. |
| 203 | if sys.platform == 'cygwin': |
| 204 | # Under cygwin, mark all files as executable. The executable flag in |
| 205 | # Google Storage will not be set when uploading from Windows, so if |
| 206 | # this script is running under cygwin and we're downloading an |
| 207 | # executable, it will be unrunnable from inside cygwin without this. |
| 208 | st = os.stat(output_filename) |
| 209 | os.chmod(output_filename, st.st_mode | stat.S_IEXEC) |
| 210 | elif sys.platform != 'win32': |
| 211 | # On non-Windows platforms, key off of the custom header |
| 212 | # "x-goog-meta-executable". |
| 213 | # |
| 214 | # TODO(hinoka): It is supposedly faster to use "gsutil stat" but that |
| 215 | # doesn't appear to be supported by the gsutil currently in our tree. When |
| 216 | # we update, this code should use that instead of "gsutil ls -L". |
[email protected] | ba63bcb | 2013-10-28 19:55:48 | [diff] [blame] | 217 | code, out, _ = gsutil.check_call('ls', '-L', file_url) |
| 218 | if code != 0: |
| 219 | out_q.put('%d> %s' % (thread_num, err)) |
| 220 | ret_codes.put((code, err)) |
| 221 | elif re.search('x-goog-meta-executable:', out): |
| 222 | st = os.stat(output_filename) |
| 223 | os.chmod(output_filename, st.st_mode | stat.S_IEXEC) |
[email protected] | 867e5b5 | 2013-03-13 21:43:51 | [diff] [blame] | 224 | |
def printer_worker(output_queue):
  """Print every line taken from |output_queue| until None is received.

  None, rather than an empty string, is the stop marker because empty lines
  are legitimate output.
  """
  for line in iter(output_queue.get, None):
    print(line)
| 232 | |
| 233 | |
def download_from_google_storage(
    input_filename, base_url, gsutil, num_threads, directory, recursive,
    force, output, ignore_errors, sha1_file, verbose):
  """Download the requested file(s) using a pool of worker threads.

  Starts |num_threads| downloader threads plus one printer thread, fills the
  job queue via enumerate_work_queue(), waits for all workers to finish and
  then reports any errors they recorded.

  Returns:
    The largest return code recorded by any worker, or 0 if there were none.
  """
  started_at = time.time()
  printer_queue = Queue.Queue()
  job_queue = Queue.Queue()
  ret_codes = Queue.Queue()
  # Seed with a success entry so there is always at least one result.
  ret_codes.put((0, None))

  # Spin up the download workers; they block on the job queue until fed.
  workers = []
  for worker_index in range(num_threads):
    worker = threading.Thread(
        target=_downloader_worker_thread,
        args=[worker_index, job_queue, force, base_url,
              gsutil, printer_queue, ret_codes, verbose])
    worker.daemon = True
    worker.start()
    workers.append(worker)

  # A single thread owns stdout so worker messages do not interleave.
  printer = threading.Thread(target=printer_worker, args=[printer_queue])
  printer.daemon = True
  printer.start()

  # Fill the job queue, then add one stop marker per worker.
  job_count = enumerate_work_queue(
      input_filename, job_queue, directory, recursive,
      ignore_errors, output, sha1_file)
  for _ in workers:
    job_queue.put((None, None))

  # Drain: wait for the workers first, then let the printer finish.
  for worker in workers:
    worker.join()
  printer_queue.put(None)
  printer.join()

  # Collect the worst return code and echo every recorded error message.
  worst_ret_code = 0
  for ret_code, message in ret_codes.queue:
    worst_ret_code = max(ret_code, worst_ret_code)
    if message:
      print >> sys.stderr, message

  if verbose:
    if not worst_ret_code:
      print('Success!')
    print('Downloading %d files took %1f second(s)' % (
        job_count, time.time() - started_at))
  return worst_ret_code
| 282 | |
| 283 | |
def _create_option_parser():
  """Build the optparse parser describing this tool's command line."""
  usage = ('usage: %prog [options] target\n'
           'Target must be:\n'
           ' (default) a sha1 sum ([A-Za-z0-9]{40}).\n'
           ' (-s or --sha1_file) a .sha1 file, containing a sha1 sum on '
           'the first line.\n'
           ' (-d or --directory) A directory to scan for .sha1 files.')
  parser = optparse.OptionParser(usage)
  parser.add_option('-o', '--output',
                    help='Specify the output file name. Defaults to: '
                         '(a) Given a SHA1 hash, the name is the SHA1 hash. '
                         '(b) Given a .sha1 file or directory, the name will '
                         'match (.*).sha1.')
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to fetch from.')
  parser.add_option('-e', '--boto',
                    help='Specify a custom boto file.')
  parser.add_option('-c', '--no_resume', action='store_true',
                    help='Resume download if file is partially downloaded.')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force download even if local file exists.')
  parser.add_option('-i', '--ignore_errors', action='store_true',
                    help='Don\'t throw error if we find an invalid .sha1 file.')
  parser.add_option('-r', '--recursive', action='store_true',
                    help='Scan folders recursively for .sha1 files. '
                         'Must be used with -d/--directory')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of downloader threads to run.')
  parser.add_option('-d', '--directory', action='store_true',
                    help='The target is a directory. '
                         'Cannot be used with -s/--sha1_file.')
  parser.add_option('-s', '--sha1_file', action='store_true',
                    help='The target is a file containing a sha1 sum. '
                         'Cannot be used with -d/--directory.')
  parser.add_option('-g', '--config', action='store_true',
                    help='Alias for "gsutil config". Run this if you want '
                         'to initialize your saved Google Storage '
                         'credentials.')
  parser.add_option('-n', '--no_auth', action='store_true',
                    help='Skip auth checking. Use if it\'s known that the '
                         'target bucket is a public bucket.')
  parser.add_option('-p', '--platform',
                    help='A regular expression that is compared against '
                         'Python\'s sys.platform. If this option is specified, '
                         'the download will happen only if there is a match.')
  parser.add_option('-v', '--verbose', action='store_true',
                    help='Output extra diagnostic and progress information.')
  return parser


def _find_gsutil(boto_path):
  """Return a Gsutil for the bundled copy or the first one on PATH, else None."""
  if os.path.exists(GSUTIL_DEFAULT_PATH):
    return Gsutil(GSUTIL_DEFAULT_PATH, boto_path=boto_path)
  # Probe each PATH entry directly rather than listing it: listing raises on
  # entries that are not directories, and a missing PATH must not be fatal.
  for directory in os.environ.get('PATH', '').split(os.pathsep):
    if not directory:
      continue
    candidate = os.path.join(directory, 'gsutil')
    if os.path.exists(candidate):
      # First hit wins, matching normal PATH lookup order.
      return Gsutil(candidate, boto_path=boto_path)
  return None


def main(args):
  """Command-line entry point; returns the process exit code.

  Args:
    args: Accepted for interface compatibility. The command line is parsed
        by optparse from sys.argv[1:], so this value is not consulted.

  Returns:
    0 on success (or when --platform does not match), otherwise the code
    reported by gsutil or by the download workers. Invalid command lines
    exit through parser.error().
  """
  parser = _create_option_parser()
  (options, args) = parser.parse_args()

  # Make sure we should run at all based on platform matching.
  if options.platform:
    if not re.match(options.platform, GetNormalizedPlatform()):
      if options.verbose:
        print('The current platform doesn\'t match "%s", skipping.' %
              options.platform)
      return 0

  # Set the boto file to /dev/null if we don't need auth.
  if options.no_auth:
    options.boto = os.devnull

  # Make sure we can find a working instance of gsutil.
  gsutil = _find_gsutil(options.boto)
  if not gsutil:
    parser.error('gsutil not found in %s, bad depot_tools checkout?' %
                 GSUTIL_DEFAULT_PATH)

  # Passing in -g/--config will run our copy of GSUtil, then quit.
  if options.config:
    return gsutil.call('config')

  if not args:
    parser.error('Missing target.')
  if len(args) > 1:
    parser.error('Too many targets.')
  if not options.bucket:
    parser.error('Missing bucket. Specify bucket with --bucket.')
  if options.sha1_file and options.directory:
    parser.error('Both --directory and --sha1_file are specified, '
                 'can only specify one.')
  if options.recursive and not options.directory:
    parser.error('--recursive specified but --directory not specified.')
  if options.output and options.directory:
    parser.error('--directory is specified, so --output has no effect.')
  input_filename = args[0]

  # Set output filename if not specified.
  if not options.output and not options.directory:
    if not options.sha1_file:
      # Target is a sha1 sum, so output filename would also be the sha1 sum.
      options.output = input_filename
    else:
      # Target is a .sha1 file.
      if not input_filename.endswith('.sha1'):
        parser.error('--sha1_file is specified, but the input filename '
                     'does not end with .sha1, and no --output is specified. '
                     'Either make sure the input filename has a .sha1 '
                     'extension, or specify --output.')
      options.output = input_filename[:-5]

  # Check if output file already exists. The condition is kept exactly as it
  # was because existing callers rely on it; only the message is corrected,
  # since it used to claim --no_resume was specified precisely when it was not.
  if not options.directory and not options.force and not options.no_resume:
    if os.path.exists(options.output):
      parser.error('Output file %s exists and neither --force nor '
                   '--no_resume is specified.' % options.output)

  # Check we have a valid bucket with valid permissions.
  base_url, code = check_bucket_permissions(options.bucket, gsutil)
  if code:
    return code

  return download_from_google_storage(
      input_filename, base_url, gsutil, options.num_threads, options.directory,
      options.recursive, options.force, options.output, options.ignore_errors,
      options.sha1_file, options.verbose)
[email protected] | 867e5b5 | 2013-03-13 21:43:51 | [diff] [blame] | 408 | |
| 409 | |
# Script entry point: run main() and use its result as the exit status.
if __name__ == '__main__':
  exit_status = main(sys.argv)
  sys.exit(exit_status)