Congbin Guo | 3afae6c | 2019-08-13 23:29:42 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- |
| 2 | # Copyright 2019 The Chromium OS Authors. All rights reserved. |
| 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | """A cherrypy application to check devserver health status.""" |
| 6 | |
| 7 | from __future__ import absolute_import |
| 8 | from __future__ import division |
| 9 | from __future__ import print_function |
| 10 | |
import functools
import json
import os
import subprocess
import threading
import time
| 16 | |
Amin Hassani | d4e3539 | 2019-10-03 18:02:44 | [diff] [blame] | 17 | import cherrypy # pylint: disable=import-error |
Vaibhav Rustagi | d10bd8d | 2019-11-20 19:32:25 | [diff] [blame] | 18 | |
| 19 | try: |
| 20 | import psutil |
| 21 | except ImportError: |
  # Tolerate a missing psutil. lakitu doesn't have psutil installed, and for
  # the auto-update test lakitu copies the devserver code and uses that copy
  # to run the devserver. A hard import failure here would make the devserver
  # fail and, with it, the auto-update test.
| 26 | psutil = None |
Congbin Guo | 3afae6c | 2019-08-13 23:29:42 | [diff] [blame] | 27 | |
Achuith Bhandarkar | 662fb72 | 2019-10-31 23:12:49 | [diff] [blame] | 28 | import setup_chromite # pylint: disable=unused-import |
Amin Hassani | e427e21 | 2019-10-28 18:04:27 | [diff] [blame] | 29 | from chromite.lib import cros_update_progress |
Achuith Bhandarkar | 662fb72 | 2019-10-31 23:12:49 | [diff] [blame] | 30 | from chromite.lib.xbuddy import cherrypy_log_util |
Congbin Guo | 3afae6c | 2019-08-13 23:29:42 | [diff] [blame] | 31 | |
Congbin Guo | 3afae6c | 2019-08-13 23:29:42 | [diff] [blame] | 32 | |
def _Log(message, *args):
  """Log a message under this module's 'HEALTHCHECKER' tag.

  Args:
    message: The message handed to cherrypy_log_util.LogWithTag.
    *args: Extra positional arguments forwarded unchanged to LogWithTag.

  Returns:
    Whatever cherrypy_log_util.LogWithTag returns.
  """
  tag = 'HEALTHCHECKER'
  return cherrypy_log_util.LogWithTag(tag, message, *args)
| 36 | |
# Interval, in seconds, between two successive collections of the disk and
# network IO counters.
STATS_INTERVAL = 10.0
# Number of bytes in one (decimal) gigabyte; used to convert bytes to GB.
_1G = 10 ** 9
| 40 | |
| 41 | |
def require_psutil():
  """Decorator factory for functions that require psutil to run.

  Returns:
    A decorator. The function it produces calls the decorated function and
    returns its result when the module-level psutil is available; otherwise
    it logs a message, skips the call and returns None.
  """
  def deco_require_psutil(func):
    """Wrap func so that it only runs when psutil is available.

    Args:
      func: function to be called.

    Returns:
      The wrapping function, carrying func's name and docstring.
    """
    # functools.wraps copies func's metadata (__name__, __doc__, ...) onto the
    # wrapper so decorated functions stay identifiable in logs and tracebacks.
    @functools.wraps(func)
    def func_require_psutil(*args, **kwargs):
      # If psutil is not installed, skip calling the function.
      if psutil:
        return func(*args, **kwargs)

      _Log('Python module psutil is not installed. Function call %s is '
           'skipped.' % func)
      # Callers must be ready for None when the call is skipped.
      return None
    return func_require_psutil
  return deco_require_psutil
| 66 | |
| 67 | |
def _get_process_count(process_cmd_pattern):
  """Get the count of processes that match the given command pattern.

  Runs `pgrep -fc <pattern>` and parses the count printed on stdout.

  Args:
    process_cmd_pattern: The regex pattern of process command to match.

  Returns:
    The count of processes that match the given command pattern, or 0 when
    pgrep cannot be executed or its output is not an integer.
  """
  try:
    # Use Popen instead of check_output since the latter cannot run with old
    # python version (less than 2.7)
    proc = subprocess.Popen(
        ['pgrep', '-fc', process_cmd_pattern],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    cmd_output, cmd_error = proc.communicate()
  except OSError as e:
    # Popen never raises CalledProcessError; a command that cannot be started
    # (e.g. pgrep is not installed) surfaces as OSError instead.
    _Log('Failed to run pgrep when getting process count: %s' % e)
    return 0

  if cmd_error:
    _Log('Error happened when getting process count: %s' % cmd_error)

  try:
    return int(cmd_output)
  except ValueError:
    # pgrep did not print a count, so there is nothing to parse.
    _Log('Unexpected pgrep output when getting process count: %r'
         % cmd_output)
    return 0
| 92 | |
| 93 | |
def get_config():
  """Build the cherrypy configuration for this application.

  Returns:
    A new dict mapping the root path '/' to its settings dict.
  """
  root_settings = {
      # The trailing_slash tool automatically adds a trailing slash, i.e.
      # /check_health -> /check_health/. It is switched off here.
      'tools.trailing_slash.on': False,
  }
  return {'/': root_settings}
| 103 | |
| 104 | |
class Root(object):
  """Cherrypy Root class of the application.

  index() serves the devserver health status as JSON. When psutil is
  available, a daemon thread keeps the cached disk and network IO rates
  up to date.
  """

  def __init__(self, devserver, static_dir):
    """Store the inputs and start the IO stats thread.

    Args:
      devserver: object whose staging_thread_count attribute is reported by
        index().
      static_dir: path handed to os.statvfs() to measure free disk space.
    """
    self._static_dir = static_dir
    self._devserver = devserver

    # Cache of disk IO stats, in bytes per second. A thread refreshes the
    # stats every STATS_INTERVAL seconds. No lock is used for these variables
    # as the only thread that writes to them is _refresh_io_stats.
    self.disk_read_bytes_per_sec = 0
    self.disk_write_bytes_per_sec = 0
    # Cache of network IO stats, in bytes per second.
    self.network_sent_bytes_per_sec = 0
    self.network_recv_bytes_per_sec = 0
    self._start_io_stat_thread()

  @require_psutil()
  def _get_io_stats(self):
    """Get the IO stats as a dictionary.

    Returns:
      A dictionary with the cached disk and network rates (each direction and
      their totals, in bytes per second) and the current value of
      psutil.cpu_percent() under 'cpu_percent'.
    """
    # Read each cached rate exactly once so that every total matches the two
    # parts reported next to it even if the refresh thread writes meanwhile.
    disk_read = self.disk_read_bytes_per_sec
    disk_write = self.disk_write_bytes_per_sec
    network_sent = self.network_sent_bytes_per_sec
    network_recv = self.network_recv_bytes_per_sec
    return {
        'disk_read_bytes_per_second': disk_read,
        'disk_write_bytes_per_second': disk_write,
        'disk_total_bytes_per_second': disk_read + disk_write,
        'network_sent_bytes_per_second': network_sent,
        'network_recv_bytes_per_second': network_recv,
        'network_total_bytes_per_second': network_sent + network_recv,
        'cpu_percent': psutil.cpu_percent(),
    }

  @require_psutil()
  def _refresh_io_stats(self):
    """A call running in a thread to update IO stats periodically.

    Loops forever: sleeps STATS_INTERVAL seconds, reads the psutil disk and
    network counters, and stores the per-second rates since the previous
    reading on self.
    """
    prev_disk_io_counters = psutil.disk_io_counters()
    prev_network_io_counters = psutil.net_io_counters()
    prev_read_time = time.time()
    while True:
      time.sleep(STATS_INTERVAL)
      # Divide by the measured elapsed time rather than STATS_INTERVAL, as the
      # actual gap between two readings is what the counter deltas cover.
      now = time.time()
      interval = now - prev_read_time
      prev_read_time = now
      # Disk IO is for all disks.
      disk_io_counters = psutil.disk_io_counters()
      network_io_counters = psutil.net_io_counters()

      self.disk_read_bytes_per_sec = (
          disk_io_counters.read_bytes -
          prev_disk_io_counters.read_bytes) / interval
      self.disk_write_bytes_per_sec = (
          disk_io_counters.write_bytes -
          prev_disk_io_counters.write_bytes) / interval
      prev_disk_io_counters = disk_io_counters

      self.network_sent_bytes_per_sec = (
          network_io_counters.bytes_sent -
          prev_network_io_counters.bytes_sent) / interval
      self.network_recv_bytes_per_sec = (
          network_io_counters.bytes_recv -
          prev_network_io_counters.bytes_recv) / interval
      prev_network_io_counters = network_io_counters

  @require_psutil()
  def _start_io_stat_thread(self):
    """Start the thread to collect IO stats."""
    thread = threading.Thread(target=self._refresh_io_stats)
    # Daemon thread: its endless loop must not keep the process alive.
    thread.daemon = True
    thread.start()

  @cherrypy.expose
  def index(self):
    """Collect the health status of devserver to see if it's ready for staging.

    Returns:
      A JSON dictionary containing all or some of the following fields:
      free_disk (float): free disk space in GB (1 GB = 10^9 bytes).
      staging_thread_count (int): number of devserver threads currently staging
        an image.
      apache_client_count (int): count of Apache processes.
      telemetry_test_count (int): count of telemetry tests.
      gsutil_count (int): count of gsutil processes.
      au_process_count (int): number of entries returned by
        cros_update_progress.GetAllRunningAUProcess().
      The fields produced by _get_io_stats() are added as well, unless that
      call returns nothing.
    """
    # Get free disk space. f_bavail is a number of blocks counted in units of
    # f_frsize (the fundamental block size), not f_bsize.
    stat = os.statvfs(self._static_dir)
    free_disk = stat.f_frsize * stat.f_bavail / _1G
    apache_client_count = _get_process_count('bin/apache2? -k start')
    telemetry_test_count = _get_process_count('python.*telemetry')
    gsutil_count = _get_process_count('gsutil')
    au_process_count = len(cros_update_progress.GetAllRunningAUProcess())

    health_data = {
        'free_disk': free_disk,
        'staging_thread_count': self._devserver.staging_thread_count,
        'apache_client_count': apache_client_count,
        'telemetry_test_count': telemetry_test_count,
        'gsutil_count': gsutil_count,
        'au_process_count': au_process_count,
    }
    # _get_io_stats() yields None when the call is skipped (no psutil).
    health_data.update(self._get_io_stats() or {})

    return json.dumps(health_data)