blob: cb8723cf27e4d278aea45ad8fca536a3d4d1edab [file] [log] [blame]
#!/usr/bin/env python
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Custom swarming triggering script.
This script does custom swarming triggering logic, to allow one bot to
conceptually span multiple Swarming configurations, while lumping all trigger
calls under one logical step.
The reason this script is needed is to allow seamless upgrades of the GPU, OS
version, or graphics driver. Most Chromium tests, GPU tests in particular, are
triggered with precise values for all of these Swarming dimensions. This ensures
that if a machine is added to the Swarming pool with a slightly different
configuration, tests don't fail for unexpected reasons.
During an upgrade of the fleet, it's not feasible to take half of the machines
offline. Some experience was gained with this during a recent upgrade of the
GPUs in Chromium's main Windows and Linux NVIDIA bots. In the middle of the
upgrade, only 50% of the capacity was available, and CQ jobs started to time
out. Once the hurdle had been passed in the middle of the upgrade, capacity was
sufficient, but it's crucial that this process remain seamless.
This script receives multiple machine configurations on the command line in the
form of quoted strings. These strings are JSON dictionaries that represent
entries in the "dimensions" array of the "swarming" dictionary in the
src/testing/buildbot JSON files. The script queries the Swarming pool for the
number of machines of each configuration, and distributes work (shards) among
them using the following algorithm:
1. If any configuration has machines available (online, not busy at the time
of the query) then distribute shards to those machines first.
2. Compute the relative fractions of all of the live (online, not quarantined,
not dead) machines of all configurations.
3. Distribute the remaining shards probabilistically among these configurations.
The use of random numbers attempts to avoid the pathology where one
configuration only has a couple of machines, and work is never distributed to it
once all machines are busy.
This script must have roughly the same command line interface as swarming.py
trigger. It modifies it in the following ways:
* Intercepts the dump-json argument, and creates its own by combining the
results from each trigger call.
* Scans through the multiple-trigger-configs dictionaries. For any key found,
deletes that dimension from the originally triggered task's dimensions. This
is what allows the Swarming dimensions to be replaced.
* On a per-shard basis, adds the Swarming dimensions chosen from the
multiple-trigger-configs list to the dimensions for the shard.
This script is normally called from the swarming recipe module in tools/build.
"""
import argparse
import copy
import json
import os
import random
import subprocess
import sys
import tempfile
import urllib
# Absolute path of this script; SRC_DIR is the directory three levels above it
# (presumably the checkout's src/ root -- confirm against the script location).
_THIS_FILE = os.path.abspath(__file__)
SRC_DIR = os.path.dirname(os.path.dirname(os.path.dirname(_THIS_FILE)))
# Path of the swarming.py client script that this script shells out to.
SWARMING_PY = os.path.join(SRC_DIR, 'tools', 'swarming_client', 'swarming.py')
def strip_unicode(obj):
  """Returns |obj| with every unicode string re-encoded as a utf-8 byte string.

  Lists and dictionaries (both keys and values) are walked recursively and
  rebuilt; a dictionary is rebuilt using its own type. Characters that cannot
  be encoded are handled with the 'replace' error policy. Any other kind of
  object is returned unchanged.
  """
  if isinstance(obj, dict):
    converted_items = [
        (strip_unicode(key), strip_unicode(value))
        for key, value in obj.items()]
    return type(obj)(converted_items)
  if isinstance(obj, list):
    return [strip_unicode(item) for item in obj]
  if isinstance(obj, unicode):
    return obj.encode('utf-8', 'replace')
  return obj
class MultiDimensionTestTriggerer(object):
  """Triggers Swarming shards across several alternative bot configurations.

  Typical use is a single call to trigger_tasks(), which parses the bot
  configurations, asks Swarming how many bots match each one, and then
  triggers every shard on a configuration chosen by pick_bot_configuration().
  """

  def __init__(self):
    # List of dimension dictionaries, filled in by parse_bot_configs().
    self._bot_configs = None
    # One {'total': int, 'available': int} dictionary per bot config, in the
    # same order as _bot_configs.
    self._bot_statuses = []
    # Sum of the 'total' fields of _bot_statuses.
    self._total_bots = 0

  def modify_args(self, all_args, bot_index, shard_index, total_shards,
                  temp_file):
    """Returns a copy of the given argument list with extra arguments inserted.

    Specifically, it does the following:
      * Adds a --dump-json argument, to read in the results of the
        individual trigger command.
      * If the number of shards is greater than one, adds --env
        arguments to set the GTEST_SHARD_INDEX and GTEST_TOTAL_SHARDS
        environment variables to _shard_index_ and _total_shards_,
        respectively.
      * Adds the dimensions associated with the bot config at the given index,
        sorted by dimension name.

    The arguments are structured like this:
      <args to swarming.py trigger> -- <args to bot running isolate>
    Everything added here is inserted immediately before the '--' separator,
    i.e. it is passed to the trigger command.

    Args:
      all_args: Argument list; must contain a '--' separator.
      bot_index: Index into the parsed bot configs.
      shard_index: Index of the shard being triggered.
      total_shards: Total number of shards.
      temp_file: Path passed as the value of --dump-json.

    Returns:
      A new list; |all_args| is not modified.
    """
    assert '--' in all_args, (
        'Malformed trigger command; -- argument expected but not found')
    dash_ind = all_args.index('--')
    bot_args = ['--dump-json', temp_file]
    if total_shards > 1:
      bot_args.extend(['--env', 'GTEST_SHARD_INDEX', str(shard_index)])
      bot_args.extend(['--env', 'GTEST_TOTAL_SHARDS', str(total_shards)])
    for key, val in sorted(self._bot_configs[bot_index].items()):
      bot_args.extend(['--dimension', key, val])
    return all_args[:dash_ind] + bot_args + all_args[dash_ind:]

  def parse_bot_configs(self, args):
    """Parses and validates args.multiple_trigger_configs into _bot_configs.

    Raises:
      ValueError: if the string is not valid JSON, or is not a non-empty list
        whose entries are all dictionaries.
    """
    try:
      self._bot_configs = strip_unicode(json.loads(
          args.multiple_trigger_configs))
    except Exception as e:
      raise ValueError('Error while parsing JSON from bot config string %s: %s'
                       % (args.multiple_trigger_configs, str(e)))
    # Validate the input.
    if not isinstance(self._bot_configs, list):
      raise ValueError('Bot configurations must be a list, were: %s' %
                       args.multiple_trigger_configs)
    if len(self._bot_configs) < 1:
      raise ValueError('Bot configuration list must have at least one entry')
    if not all(isinstance(entry, dict) for entry in self._bot_configs):
      raise ValueError('Bot configurations must all be dictionaries')

  def query_swarming_for_bot_configs(self, verbose):
    """Queries Swarming for the bot counts of every parsed bot config.

    Runs one "swarming.py query ... bots/count?..." call per config and stores
    the results in _bot_statuses and _total_bots.

    Raises:
      Exception: if a swarming.py invocation returns a non-zero exit code.
    """
    # Start from a clean slate so that the statuses always line up index by
    # index with _bot_configs, even if this method is called more than once.
    self._bot_statuses = []
    # Query Swarming to figure out which bots are available.
    for config in self._bot_configs:
      values = []
      for key, value in sorted(config.items()):
        values.append(('dimensions', '%s:%s' % (key, value)))
      # Ignore dead and quarantined bots.
      values.append(('is_dead', 'FALSE'))
      values.append(('quarantined', 'FALSE'))
      query_arg = urllib.urlencode(values)
      temp_file = self.make_temp_file(prefix='trigger_multiple_dimensions',
                                      suffix='.json')
      try:
        ret = self.run_swarming(['query',
                                 '-S',
                                 'chromium-swarm.appspot.com',
                                 '--limit',
                                 '0',
                                 '--json',
                                 temp_file,
                                 ('bots/count?%s' % query_arg)],
                                verbose)
        if ret:
          raise Exception('Error running swarming.py')
        query_result = strip_unicode(self.read_json_from_temp_file(temp_file))
        # Summarize number of available bots per configuration.
        count = int(query_result['count'])
        # Be robust against errors in computation.
        available = max(0, count - int(query_result['busy']))
        self._bot_statuses.append({'total': count, 'available': available})
        if verbose:
          idx = len(self._bot_statuses) - 1
          print('Bot config %d: %s' % (idx, str(self._bot_statuses[idx])))
      finally:
        self.delete_temp_file(temp_file)
    # Sum up the total count of all bots.
    self._total_bots = sum(x['total'] for x in self._bot_statuses)
    if verbose:
      print('Total bots: %d' % (self._total_bots))

  def remove_swarming_dimension(self, args, dimension):
    """Removes the first "--dimension <dimension> <value>" triple from |args|.

    Returns:
      A new list without the triple, or |args| itself if no match was found.
    """
    # Stop one short of the end: a trailing '--dimension' has no key after it
    # and therefore cannot match (and must not be indexed past).
    for i in range(len(args) - 1):
      if args[i] == '--dimension' and args[i + 1] == dimension:
        return args[:i] + args[i + 3:]
    return args

  def choose_random_int(self, max_num):
    """Returns a random integer N such that 1 <= N <= max_num."""
    return random.randint(1, max_num)

  def pick_bot_configuration(self, verbose):
    """Returns the index into _bot_configs to use for the next shard.

    These are the rules used:
      1. If any configuration has bots available, pick the configuration with
         the most bots available (the first one wins ties).
      2. If no configuration has bots available, pick a random configuration
         based on the total number of bots in each configuration.

    This method updates _bot_statuses in case (1).

    Raises:
      ValueError: in case (2), if no configuration has any bots at all.
    """
    statuses = self._bot_statuses
    if any(status['available'] > 0 for status in statuses):
      # Case 1. max() returns the first maximal element, so ties go to the
      # lowest index.
      max_index = max(range(len(statuses)),
                      key=lambda i: statuses[i]['available'])
      statuses[max_index]['available'] -= 1
      assert statuses[max_index]['available'] >= 0
      if verbose:
        print('Chose bot config %d because bots were available' % (max_index))
      return max_index
    # Case 2.
    # We want to choose a bot uniformly at random from all of the bots specified
    # in the bot configs. To do this, we conceptually group the bots into
    # buckets, pick a random number between 1 and the total number of bots, and
    # figure out which bucket of bots it landed in.
    if self._total_bots < 1:
      # random.randint(1, 0) would fail with an unhelpful "empty range" error.
      raise ValueError(
          'No live bots were found for any of the bot configurations')
    r = self.choose_random_int(self._total_bots)
    for i, status in enumerate(statuses):
      if r <= status['total']:
        if verbose:
          print('Chose bot config %d stochastically' % (i))
        return i
      r -= status['total']
    raise Exception('Should not reach here')

  def make_temp_file(self, prefix=None, suffix=None):
    """Creates an empty temporary file and returns its path."""
    # This trick of closing the file handle is needed on Windows in order to
    # make the file writeable.
    h, temp_file = tempfile.mkstemp(prefix=prefix, suffix=suffix)
    os.close(h)
    return temp_file

  def delete_temp_file(self, temp_file):
    """Deletes the file at |temp_file|."""
    os.remove(temp_file)

  def read_json_from_temp_file(self, temp_file):
    """Returns the parsed JSON contents of |temp_file|."""
    with open(temp_file) as f:
      return json.load(f)

  def write_json_to_file(self, merged_json, output_file):
    """Serializes |merged_json| as JSON into |output_file|."""
    with open(output_file, 'w') as f:
      json.dump(merged_json, f)

  def run_swarming(self, args, verbose):
    """Runs swarming.py with |args| and returns its exit code."""
    if verbose:
      print('Running Swarming with args:')
      print(str(args))
    return subprocess.call([sys.executable, SWARMING_PY] + args)

  def trigger_tasks(self, args, remaining):
    """Triggers tasks for each bot.

    Args:
      args: Parsed arguments which we need to use.
      remaining: The remainder of the arguments, which should be passed to
                 swarming.py calls.

    Returns:
      Exit code for the script: 0 on success, otherwise the exit code of the
      first swarming.py trigger call that failed (in which case the merged
      JSON file is not written).
    """
    verbose = args.multiple_dimension_script_verbose
    self.parse_bot_configs(args)
    self.query_swarming_for_bot_configs(verbose)
    # In the remaining arguments, find the Swarming dimensions that are
    # specified by the bot configs and remove them, because for each shard,
    # we're going to select one of the bot configs and put all of its Swarming
    # dimensions on the command line.
    filtered_remaining_args = copy.deepcopy(remaining)
    for config in self._bot_configs:
      for key in config:
        filtered_remaining_args = self.remove_swarming_dimension(
            filtered_remaining_args, key)
    merged_json = {}
    for i in range(args.shards):
      # For each shard that we're going to distribute, do the following:
      # 1. Pick which bot configuration to use.
      # 2. Insert that bot configuration's dimensions as command line
      #    arguments, and invoke "swarming.py trigger".
      bot_index = self.pick_bot_configuration(verbose)
      # Holds the results of the swarming.py trigger call. Created before the
      # try block so that the cleanup below only ever sees a file that this
      # iteration actually created.
      json_temp = self.make_temp_file(prefix='trigger_multiple_dimensions',
                                      suffix='.json')
      try:
        args_to_pass = self.modify_args(filtered_remaining_args, bot_index, i,
                                        args.shards, json_temp)
        ret = self.run_swarming(args_to_pass, verbose)
        if ret:
          sys.stderr.write('Failed to trigger a task, aborting\n')
          return ret
        result_json = self.read_json_from_temp_file(json_temp)
        if i == 0:
          # Copy the entire JSON -- in particular, the "request"
          # dictionary -- from shard 0. "swarming.py collect" uses
          # some keys from this dictionary, in particular related to
          # expiration. It also contains useful debugging information.
          merged_json = copy.deepcopy(result_json)
          # However, reset the "tasks" entry to an empty dictionary,
          # which will be handled specially.
          merged_json['tasks'] = {}
        for k, v in result_json['tasks'].items():
          v['shard_index'] = i
          # Suffix the task name with "<shard index>:<total shards>" so the
          # entries of different shards cannot collide.
          merged_json['tasks'][k + ':%d:%d' % (i, args.shards)] = v
      finally:
        self.delete_temp_file(json_temp)
    self.write_json_to_file(merged_json, args.dump_json)
    return 0
def _parse_bool_flag(value):
  """Converts a command-line string into a bool.

  argparse's type=bool treats every non-empty string -- including "False" and
  "0" -- as True. This converter maps the usual spellings of "false" (and the
  empty string) to False; every other value enables the flag, so values that
  previously turned the flag on still do.
  """
  return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')


def main():
  """Parses this script's own flags and triggers the requested shards.

  Flags that are not recognized here are collected by parse_known_args() and
  handed to MultiDimensionTestTriggerer.trigger_tasks() along with the parsed
  ones.

  Returns:
    The value returned by trigger_tasks().
  """
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument('--multiple-trigger-configs', type=str, required=True,
                      help='The Swarming configurations to trigger tasks on, '
                      'in the form of a JSON array of dictionaries (these are '
                      'Swarming dimension_sets). At least one entry in this '
                      'dictionary is required.')
  parser.add_argument('--multiple-dimension-script-verbose',
                      type=_parse_bool_flag,
                      default=False, help='Turn on verbose logging')
  parser.add_argument('--dump-json', required=True,
                      help='(Swarming Trigger Script API) Where to dump the'
                      ' resulting json which indicates which tasks were'
                      ' triggered for which shards.')
  parser.add_argument('--shards', type=int, default=1,
                      help='How many shards to trigger. Duplicated from the'
                      ' `swarming.py trigger` command.')
  args, remaining = parser.parse_known_args()
  return MultiDimensionTestTriggerer().trigger_tasks(args, remaining)
# Run main() only when this file is executed as a script, and use its return
# value as the process exit status.
if __name__ == '__main__':
  exit_status = main()
  sys.exit(exit_status)