[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 1 | // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #if defined(OS_WIN) |
| 6 | #include <windows.h> |
| 7 | #endif |
| 8 | |
| 9 | #include "chrome/gpu/gpu_watchdog_thread.h" |
| 10 | |
| 11 | #include "base/compiler_specific.h" |
| 12 | #include "build/build_config.h" |
| 13 | |
| 14 | namespace { |
| 15 | const int64 kCheckPeriod = 2000; |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 16 | |
| 17 | void DoNothing() { |
| 18 | } |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 19 | } |
| 20 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 21 | GpuWatchdogThread::GpuWatchdogThread(int timeout) |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 22 | : base::Thread("Watchdog"), |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 23 | watched_message_loop_(MessageLoop::current()), |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 24 | timeout_(timeout), |
| 25 | armed_(false), |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 26 | #if defined(OS_WIN) |
| 27 | watched_thread_handle_(0), |
[email protected] | 995a7f1 | 2011-02-11 23:07:17 | [diff] [blame] | 28 | arm_cpu_time_(0), |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 29 | #endif |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 30 | ALLOW_THIS_IN_INITIALIZER_LIST(task_observer_(this)) { |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 31 | DCHECK(timeout >= 0); |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 32 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 33 | #if defined(OS_WIN) |
| 34 | // GetCurrentThread returns a pseudo-handle that cannot be used by one thread |
| 35 | // to identify another. DuplicateHandle creates a "real" handle that can be |
| 36 | // used for this purpose. |
| 37 | BOOL result = DuplicateHandle(GetCurrentProcess(), |
| 38 | GetCurrentThread(), |
| 39 | GetCurrentProcess(), |
| 40 | &watched_thread_handle_, |
| 41 | THREAD_QUERY_INFORMATION, |
| 42 | FALSE, |
| 43 | 0); |
| 44 | DCHECK(result); |
| 45 | #endif |
| 46 | |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 47 | watched_message_loop_->AddTaskObserver(&task_observer_); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 48 | } |
| 49 | |
| 50 | GpuWatchdogThread::~GpuWatchdogThread() { |
| 51 | // Verify that the thread was explicitly stopped. If the thread is stopped |
| 52 | // implicitly by the destructor, CleanUp() will not be called. |
| 53 | DCHECK(!method_factory_.get()); |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 54 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 55 | #if defined(OS_WIN) |
| 56 | CloseHandle(watched_thread_handle_); |
| 57 | #endif |
| 58 | |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 59 | watched_message_loop_->RemoveTaskObserver(&task_observer_); |
| 60 | } |
| 61 | |
| 62 | void GpuWatchdogThread::PostAcknowledge() { |
| 63 | // Called on the monitored thread. Responds with OnAcknowledge. Cannot use |
| 64 | // the method factory. Rely on reference counting instead. |
| 65 | message_loop()->PostTask( |
| 66 | FROM_HERE, |
| 67 | NewRunnableMethod(this, &GpuWatchdogThread::OnAcknowledge)); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 68 | } |
| 69 | |
| 70 | void GpuWatchdogThread::Init() { |
| 71 | // The method factory must be created on the watchdog thread. |
| 72 | method_factory_.reset(new MethodFactory(this)); |
| 73 | |
| 74 | // Schedule the first check. |
| 75 | OnCheck(); |
| 76 | } |
| 77 | |
| 78 | void GpuWatchdogThread::CleanUp() { |
| 79 | // The method factory must be destroyed on the watchdog thread. |
| 80 | method_factory_->RevokeAll(); |
| 81 | method_factory_.reset(); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 82 | } |
| 83 | |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 84 | GpuWatchdogThread::GpuWatchdogTaskObserver::GpuWatchdogTaskObserver( |
| 85 | GpuWatchdogThread* watchdog) |
| 86 | : watchdog_(watchdog) { |
| 87 | } |
| 88 | |
| 89 | GpuWatchdogThread::GpuWatchdogTaskObserver::~GpuWatchdogTaskObserver() { |
| 90 | } |
| 91 | |
| 92 | void GpuWatchdogThread::GpuWatchdogTaskObserver::WillProcessTask( |
| 93 | const Task* task) |
| 94 | { |
| 95 | CheckArmed(); |
| 96 | } |
| 97 | |
| 98 | void GpuWatchdogThread::GpuWatchdogTaskObserver::DidProcessTask( |
| 99 | const Task* task) |
| 100 | { |
| 101 | CheckArmed(); |
| 102 | } |
| 103 | |
| 104 | void GpuWatchdogThread::GpuWatchdogTaskObserver::CheckArmed() |
| 105 | { |
| 106 | // Acknowledge the watchdog if it has armed itself. The watchdog will not |
| 107 | // change its armed state until it is acknowledged. |
| 108 | if (watchdog_->armed()) { |
| 109 | watchdog_->PostAcknowledge(); |
| 110 | } |
| 111 | } |
| 112 | |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 113 | void GpuWatchdogThread::OnAcknowledge() { |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 114 | // The check has already been acknowledged and another has already been |
| 115 | // scheduled by a previous call to OnAcknowledge. It is normal for a |
| 116 | // watched thread to see armed_ being true multiple times before |
| 117 | // the OnAcknowledge task is run on the watchdog thread. |
| 118 | if (!armed_) |
| 119 | return; |
| 120 | |
[email protected] | cff2ac8e | 2011-02-25 22:08:49 | [diff] [blame^] | 121 | // Revoke any pending hang termination. |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 122 | method_factory_->RevokeAll(); |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 123 | armed_ = false; |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 124 | |
| 125 | // The monitored thread has responded. Post a task to check it again. |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 126 | message_loop()->PostDelayedTask( |
| 127 | FROM_HERE, |
| 128 | method_factory_->NewRunnableMethod(&GpuWatchdogThread::OnCheck), |
| 129 | kCheckPeriod); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 130 | } |
| 131 | |
#if defined(OS_WIN)
// Returns the CPU time (kernel + user) consumed so far by the watched thread,
// in milliseconds. Returns 0 if the thread times cannot be queried, for
// example because the watched thread handle is invalid.
int64 GpuWatchdogThread::GetWatchedThreadTime() {
  // Zero-initialized so that no indeterminate value can ever be read, even if
  // the query below fails.
  FILETIME creation_time = {0};
  FILETIME exit_time = {0};
  FILETIME user_time = {0};
  FILETIME kernel_time = {0};
  BOOL result = GetThreadTimes(watched_thread_handle_,
                               &creation_time,
                               &exit_time,
                               &kernel_time,
                               &user_time);
  DCHECK(result);

  // DCHECK compiles away in release builds, so handle the failure explicitly
  // instead of computing a time from output parameters that were never
  // written.
  if (!result)
    return 0;

  ULARGE_INTEGER user_time64;
  user_time64.HighPart = user_time.dwHighDateTime;
  user_time64.LowPart = user_time.dwLowDateTime;

  ULARGE_INTEGER kernel_time64;
  kernel_time64.HighPart = kernel_time.dwHighDateTime;
  kernel_time64.LowPart = kernel_time.dwLowDateTime;

  // Time is reported in units of 100 nanoseconds, so dividing by 10000 yields
  // milliseconds. Kernel and user time are summed to deal with two kinds of
  // hangs. One is where the GPU process is stuck in user level, never calling
  // into the kernel, so kernel time is not increasing. The other is where
  // either the kernel hangs and never returns to user level, or where user
  // level code calls into kernel level repeatedly, giving up its quanta
  // before it is tracked, for example a loop that repeatedly Sleeps.
  return static_cast<int64>(
      (user_time64.QuadPart + kernel_time64.QuadPart) / 10000);
}
#endif
| 164 | |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 165 | void GpuWatchdogThread::OnCheck() { |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 166 | if (armed_) |
| 167 | return; |
[email protected] | 49eab48 | 2010-11-24 00:07:43 | [diff] [blame] | 168 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 169 | // Must set armed before posting the task. This task might be the only task |
| 170 | // that will activate the TaskObserver on the watched thread and it must not |
| 171 | // miss the false -> true transition. |
| 172 | armed_ = true; |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 173 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 174 | #if defined(OS_WIN) |
[email protected] | 995a7f1 | 2011-02-11 23:07:17 | [diff] [blame] | 175 | arm_cpu_time_ = GetWatchedThreadTime(); |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 176 | #endif |
| 177 | |
[email protected] | 995a7f1 | 2011-02-11 23:07:17 | [diff] [blame] | 178 | arm_absolute_time_ = base::Time::Now(); |
| 179 | |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 180 | // Post a task to the monitored thread that does nothing but wake up the |
| 181 | // TaskObserver. Any other tasks that are pending on the watched thread will |
| 182 | // also wake up the observer. This simply ensures there is at least one. |
| 183 | watched_message_loop_->PostTask( |
| 184 | FROM_HERE, |
| 185 | NewRunnableFunction(DoNothing)); |
| 186 | |
| 187 | // Post a task to the watchdog thread to exit if the monitored thread does |
| 188 | // not respond in time. |
| 189 | message_loop()->PostDelayedTask( |
| 190 | FROM_HERE, |
[email protected] | cff2ac8e | 2011-02-25 22:08:49 | [diff] [blame^] | 191 | method_factory_->NewRunnableMethod( |
| 192 | &GpuWatchdogThread::DeliberatelyCrashingToRecoverFromHang), |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 193 | timeout_); |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 194 | } |
| 195 | |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 196 | // Use the --disable-gpu-watchdog command line switch to disable this. |
[email protected] | cff2ac8e | 2011-02-25 22:08:49 | [diff] [blame^] | 197 | void GpuWatchdogThread::DeliberatelyCrashingToRecoverFromHang() { |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 198 | #if defined(OS_WIN) |
[email protected] | 995a7f1 | 2011-02-11 23:07:17 | [diff] [blame] | 199 | // Defer termination until a certain amount of CPU time has elapsed on the |
| 200 | // watched thread. |
| 201 | int64 time_since_arm = GetWatchedThreadTime() - arm_cpu_time_; |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 202 | if (time_since_arm < timeout_) { |
| 203 | message_loop()->PostDelayedTask( |
| 204 | FROM_HERE, |
[email protected] | cff2ac8e | 2011-02-25 22:08:49 | [diff] [blame^] | 205 | method_factory_->NewRunnableMethod( |
| 206 | &GpuWatchdogThread::DeliberatelyCrashingToRecoverFromHang), |
[email protected] | 981c1c5 | 2010-12-01 20:09:24 | [diff] [blame] | 207 | timeout_ - time_since_arm); |
| 208 | return; |
| 209 | } |
| 210 | #endif |
| 211 | |
[email protected] | 995a7f1 | 2011-02-11 23:07:17 | [diff] [blame] | 212 | // If the watchdog woke up significantly behind schedule, disarm and reset |
| 213 | // the watchdog check. This is to prevent the watchdog thread from terminating |
| 214 | // when a machine wakes up from sleep or hibernation, which would otherwise |
| 215 | // appear to be a hang. |
| 216 | if ((base::Time::Now() - arm_absolute_time_).InMilliseconds() > |
| 217 | timeout_ * 2) { |
| 218 | armed_ = false; |
| 219 | OnCheck(); |
| 220 | return; |
| 221 | } |
| 222 | |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 223 | // Make sure the timeout period is on the stack before crashing. |
| 224 | volatile int timeout = timeout_; |
| 225 | |
| 226 | // For minimal developer annoyance, don't keep crashing. |
| 227 | static bool crashed = false; |
| 228 | if (crashed) |
| 229 | return; |
| 230 | |
| 231 | #if defined(OS_WIN) |
| 232 | if (IsDebuggerPresent()) |
| 233 | return; |
| 234 | #endif |
| 235 | |
[email protected] | e8ea65a | 2011-01-19 01:24:49 | [diff] [blame] | 236 | LOG(ERROR) << "The GPU process hung. Terminating after " |
| 237 | << timeout_ << " ms."; |
[email protected] | e09cee4 | 2010-11-09 01:50:08 | [diff] [blame] | 238 | |
| 239 | volatile int* null_pointer = NULL; |
| 240 | *null_pointer = timeout; |
| 241 | |
| 242 | crashed = true; |
| 243 | } |