[email protected] | 623c0bd | 2011-03-12 01:00:41 | [diff] [blame] | 1 | // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #if defined(OS_WIN) |
| 6 | #include <windows.h> |
| 7 | #endif |
| 8 | |
[email protected] | 623c0bd | 2011-03-12 01:00:41 | [diff] [blame] | 9 | #include "content/gpu/gpu_watchdog_thread.h" |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 10 | |
| 11 | #include "base/compiler_specific.h" |
[email protected] | f9a7e08f | 2011-08-18 21:20:16 | [diff] [blame] | 12 | #include "base/process_util.h" |
| 13 | #include "base/process.h" |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 14 | #include "build/build_config.h" |
[email protected] | f9a7e08f | 2011-08-18 21:20:16 | [diff] [blame] | 15 | #include "content/common/result_codes.h" |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 16 | |
| 17 | namespace { |
| 18 | const int64 kCheckPeriod = 2000; |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 19 | |
| 20 | void DoNothing() { |
| 21 | } |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 22 | } |
| 23 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 24 | GpuWatchdogThread::GpuWatchdogThread(int timeout) |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 25 | : base::Thread("Watchdog"), |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 26 | watched_message_loop_(MessageLoop::current()), |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 27 | timeout_(timeout), |
| 28 | armed_(false), |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 29 | #if defined(OS_WIN) |
| 30 | watched_thread_handle_(0), |
[email protected] | 995a7f1 | 2011-02-11 23:07:17 | [diff] [blame] | 31 | arm_cpu_time_(0), |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 32 | #endif |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 33 | ALLOW_THIS_IN_INITIALIZER_LIST(task_observer_(this)) { |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 34 | DCHECK(timeout >= 0); |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 35 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 36 | #if defined(OS_WIN) |
| 37 | // GetCurrentThread returns a pseudo-handle that cannot be used by one thread |
| 38 | // to identify another. DuplicateHandle creates a "real" handle that can be |
| 39 | // used for this purpose. |
| 40 | BOOL result = DuplicateHandle(GetCurrentProcess(), |
| 41 | GetCurrentThread(), |
| 42 | GetCurrentProcess(), |
| 43 | &watched_thread_handle_, |
| 44 | THREAD_QUERY_INFORMATION, |
| 45 | FALSE, |
| 46 | 0); |
| 47 | DCHECK(result); |
| 48 | #endif |
| 49 | |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 50 | watched_message_loop_->AddTaskObserver(&task_observer_); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 51 | } |
| 52 | |
| 53 | GpuWatchdogThread::~GpuWatchdogThread() { |
| 54 | // Verify that the thread was explicitly stopped. If the thread is stopped |
| 55 | // implicitly by the destructor, CleanUp() will not be called. |
| 56 | DCHECK(!method_factory_.get()); |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 57 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 58 | #if defined(OS_WIN) |
| 59 | CloseHandle(watched_thread_handle_); |
| 60 | #endif |
| 61 | |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 62 | watched_message_loop_->RemoveTaskObserver(&task_observer_); |
| 63 | } |
| 64 | |
| 65 | void GpuWatchdogThread::PostAcknowledge() { |
| 66 | // Called on the monitored thread. Responds with OnAcknowledge. Cannot use |
| 67 | // the method factory. Rely on reference counting instead. |
| 68 | message_loop()->PostTask( |
| 69 | FROM_HERE, |
| 70 | NewRunnableMethod(this, &GpuWatchdogThread::OnAcknowledge)); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 71 | } |
| 72 | |
| 73 | void GpuWatchdogThread::Init() { |
| 74 | // The method factory must be created on the watchdog thread. |
| 75 | method_factory_.reset(new MethodFactory(this)); |
| 76 | |
| 77 | // Schedule the first check. |
| 78 | OnCheck(); |
| 79 | } |
| 80 | |
| 81 | void GpuWatchdogThread::CleanUp() { |
| 82 | // The method factory must be destroyed on the watchdog thread. |
| 83 | method_factory_->RevokeAll(); |
| 84 | method_factory_.reset(); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 85 | } |
| 86 | |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 87 | GpuWatchdogThread::GpuWatchdogTaskObserver::GpuWatchdogTaskObserver( |
| 88 | GpuWatchdogThread* watchdog) |
[email protected] | b224f79 | 2011-04-20 16:02:23 | [diff] [blame] | 89 | : watchdog_(watchdog) { |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 90 | } |
| 91 | |
| 92 | GpuWatchdogThread::GpuWatchdogTaskObserver::~GpuWatchdogTaskObserver() { |
| 93 | } |
| 94 | |
| 95 | void GpuWatchdogThread::GpuWatchdogTaskObserver::WillProcessTask( |
[email protected] | b224f79 | 2011-04-20 16:02:23 | [diff] [blame] | 96 | base::TimeTicks time_posted) { |
[email protected] | 808f7fe7 | 2011-03-23 03:49:02 | [diff] [blame] | 97 | watchdog_->CheckArmed(); |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 98 | } |
| 99 | |
| 100 | void GpuWatchdogThread::GpuWatchdogTaskObserver::DidProcessTask( |
[email protected] | b224f79 | 2011-04-20 16:02:23 | [diff] [blame] | 101 | base::TimeTicks time_posted) { |
[email protected] | 808f7fe7 | 2011-03-23 03:49:02 | [diff] [blame] | 102 | watchdog_->CheckArmed(); |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 103 | } |
| 104 | |
[email protected] | b224f79 | 2011-04-20 16:02:23 | [diff] [blame] | 105 | void GpuWatchdogThread::CheckArmed() { |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 106 | // Acknowledge the watchdog if it has armed itself. The watchdog will not |
| 107 | // change its armed state until it is acknowledged. |
[email protected] | 808f7fe7 | 2011-03-23 03:49:02 | [diff] [blame] | 108 | if (armed()) { |
| 109 | PostAcknowledge(); |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 110 | } |
| 111 | } |
| 112 | |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 113 | void GpuWatchdogThread::OnAcknowledge() { |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 114 | // The check has already been acknowledged and another has already been |
| 115 | // scheduled by a previous call to OnAcknowledge. It is normal for a |
| 116 | // watched thread to see armed_ being true multiple times before |
| 117 | // the OnAcknowledge task is run on the watchdog thread. |
| 118 | if (!armed_) |
| 119 | return; |
| 120 | |
[email protected] | cff2ac8e | 2011-02-25 22:08:49 | [diff] [blame] | 121 | // Revoke any pending hang termination. |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 122 | method_factory_->RevokeAll(); |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 123 | armed_ = false; |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 124 | |
| 125 | // The monitored thread has responded. Post a task to check it again. |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 126 | message_loop()->PostDelayedTask( |
| 127 | FROM_HERE, |
| 128 | method_factory_->NewRunnableMethod(&GpuWatchdogThread::OnCheck), |
| 129 | kCheckPeriod); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 130 | } |
| 131 | |
#if defined(OS_WIN)
// Returns the CPU time consumed so far by the watched thread, kernel and user
// time combined, converted to milliseconds. If the query fails the result is
// 0 (in builds where the DCHECK below is compiled out).
int64 GpuWatchdogThread::GetWatchedThreadTime() {
  // Zero-initialise the outputs so that a failed GetThreadTimes call produces
  // a well-defined value instead of reading indeterminate memory.
  FILETIME creation_time = {0};
  FILETIME exit_time = {0};
  FILETIME user_time = {0};
  FILETIME kernel_time = {0};
  BOOL result = GetThreadTimes(watched_thread_handle_,
                               &creation_time,
                               &exit_time,
                               &kernel_time,
                               &user_time);
  DCHECK(result);

  ULARGE_INTEGER user_time64;
  user_time64.HighPart = user_time.dwHighDateTime;
  user_time64.LowPart = user_time.dwLowDateTime;

  ULARGE_INTEGER kernel_time64;
  kernel_time64.HighPart = kernel_time.dwHighDateTime;
  kernel_time64.LowPart = kernel_time.dwLowDateTime;

  // Time is reported in units of 100 nanoseconds. Kernel and user time are
  // summed to deal with two kinds of hangs. One is where the GPU process is
  // stuck in user level, never calling into the kernel and kernel time is
  // not increasing. The other is where either the kernel hangs and never
  // returns to user level or where user level code calls into kernel level
  // repeatedly, giving up its quanta before it is tracked, for example a loop
  // that repeatedly Sleeps.
  //
  // Dividing by 10000 converts 100 ns units to milliseconds.
  return static_cast<int64>(
      (user_time64.QuadPart + kernel_time64.QuadPart) / 10000);
}
#endif
| 164 | |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 165 | void GpuWatchdogThread::OnCheck() { |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 166 | if (armed_) |
| 167 | return; |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 168 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 169 | // Must set armed before posting the task. This task might be the only task |
| 170 | // that will activate the TaskObserver on the watched thread and it must not |
| 171 | // miss the false -> true transition. |
| 172 | armed_ = true; |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 173 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 174 | #if defined(OS_WIN) |
[email protected] | 995a7f1 | 2011-02-11 23:07:17 | [diff] [blame] | 175 | arm_cpu_time_ = GetWatchedThreadTime(); |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 176 | #endif |
| 177 | |
[email protected] | 995a7f1 | 2011-02-11 23:07:17 | [diff] [blame] | 178 | arm_absolute_time_ = base::Time::Now(); |
| 179 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 180 | // Post a task to the monitored thread that does nothing but wake up the |
| 181 | // TaskObserver. Any other tasks that are pending on the watched thread will |
| 182 | // also wake up the observer. This simply ensures there is at least one. |
| 183 | watched_message_loop_->PostTask( |
| 184 | FROM_HERE, |
| 185 | NewRunnableFunction(DoNothing)); |
| 186 | |
| 187 | // Post a task to the watchdog thread to exit if the monitored thread does |
| 188 | // not respond in time. |
| 189 | message_loop()->PostDelayedTask( |
| 190 | FROM_HERE, |
[email protected] | cff2ac8e | 2011-02-25 22:08:49 | [diff] [blame] | 191 | method_factory_->NewRunnableMethod( |
[email protected] | f9a7e08f | 2011-08-18 21:20:16 | [diff] [blame] | 192 | &GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang), |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 193 | timeout_); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 194 | } |
| 195 | |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 196 | // Use the --disable-gpu-watchdog command line switch to disable this. |
[email protected] | f9a7e08f | 2011-08-18 21:20:16 | [diff] [blame] | 197 | void GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang() { |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 198 | #if defined(OS_WIN) |
[email protected] | 995a7f1 | 2011-02-11 23:07:17 | [diff] [blame] | 199 | // Defer termination until a certain amount of CPU time has elapsed on the |
| 200 | // watched thread. |
| 201 | int64 time_since_arm = GetWatchedThreadTime() - arm_cpu_time_; |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 202 | if (time_since_arm < timeout_) { |
| 203 | message_loop()->PostDelayedTask( |
| 204 | FROM_HERE, |
[email protected] | cff2ac8e | 2011-02-25 22:08:49 | [diff] [blame] | 205 | method_factory_->NewRunnableMethod( |
[email protected] | f9a7e08f | 2011-08-18 21:20:16 | [diff] [blame] | 206 | &GpuWatchdogThread::DeliberatelyTerminateToRecoverFromHang), |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 207 | timeout_ - time_since_arm); |
| 208 | return; |
| 209 | } |
| 210 | #endif |
| 211 | |
[email protected] | 995a7f1 | 2011-02-11 23:07:17 | [diff] [blame] | 212 | // If the watchdog woke up significantly behind schedule, disarm and reset |
| 213 | // the watchdog check. This is to prevent the watchdog thread from terminating |
| 214 | // when a machine wakes up from sleep or hibernation, which would otherwise |
| 215 | // appear to be a hang. |
| 216 | if ((base::Time::Now() - arm_absolute_time_).InMilliseconds() > |
| 217 | timeout_ * 2) { |
| 218 | armed_ = false; |
| 219 | OnCheck(); |
| 220 | return; |
| 221 | } |
| 222 | |
[email protected] | f9a7e08f | 2011-08-18 21:20:16 | [diff] [blame] | 223 | // For minimal developer annoyance, don't keep terminating. You need to skip |
| 224 | // the call to base::Process::Terminate below in a debugger for this to be |
| 225 | // useful. |
| 226 | static bool terminated = false; |
| 227 | if (terminated) |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 228 | return; |
| 229 | |
| 230 | #if defined(OS_WIN) |
| 231 | if (IsDebuggerPresent()) |
| 232 | return; |
| 233 | #endif |
| 234 | |
[email protected] | e8ea65a | 2011-01-19 01:24:49 | [diff] [blame] | 235 | LOG(ERROR) << "The GPU process hung. Terminating after " |
| 236 | << timeout_ << " ms."; |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 237 | |
[email protected] | f9a7e08f | 2011-08-18 21:20:16 | [diff] [blame] | 238 | base::Process current_process(base::GetCurrentProcessHandle()); |
| 239 | current_process.Terminate(content::RESULT_CODE_HUNG); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 240 | |
[email protected] | f9a7e08f | 2011-08-18 21:20:16 | [diff] [blame] | 241 | terminated = true; |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 242 | } |