watchdog是什么
Watchdog是SystemServer的一個(gè)線程(mThread = new Thread(this::run, "watchdog");)瓦盛,檢測(cè)system server重要線程的鎖狀態(tài)和Handler消息是否阻塞京腥,假如有線程block了60s那么就會(huì)觸發(fā)watchdog timeout flow, 觸發(fā)Android重啟來(lái)使系統(tǒng)恢復(fù);block 30s會(huì)有線程堆棧log打印,并觸發(fā)/data/anr/trace.txt文件生成船老。
watchdog怎么用
以AMS為例
public class ActivityManagerService extends IActivityManager.Stub
implements Watchdog.Monitor, BatteryStatsImpl.BatteryCallback, ActivityManagerGlobalLock {
public ActivityManagerService(Context systemContext, ActivityTaskManagerService atm) {
Watchdog.getInstance().addMonitor(this);
Watchdog.getInstance().addThread(mHandler);
}
public void monitor() {
synchronized (this) { }
}
watchdog源碼分析
com/android/server/Watchdog.java
private final HandlerChecker mMonitorChecker; //專門用于監(jiān)控是否死鎖
public void addMonitor(Monitor monitor) {
synchronized (mLock) {
mMonitorChecker.addMonitorLocked(monitor);
}
}
/* This handler will be used to post message back onto the main thread */
private final ArrayList<HandlerCheckerAndTimeout> mHandlerCheckers = new ArrayList<>();//需要監(jiān)控的looper
public void addThread(Handler thread) {
synchronized (mLock) {
final String name = thread.getLooper().getThread().getName();
mHandlerCheckers.add(withDefaultTimeout(new HandlerChecker(thread, name)));
}
}
private Watchdog() {
mThread = new Thread(this::run, "watchdog");//本質(zhì)上是一個(gè)線程
// Initialize handler checkers for each common thread we want to check. Note
// that we are not currently checking the background thread, since it can
// potentially hold longer running operations with no guarantees about the timeliness
// of operations there.
//
// The shared foreground thread is the main checker. It is where we
// will also dispatch monitor checks and do other work.
mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
"foreground thread");
mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker));
// Add checker for main thread. We only do a quick check since there
// can be UI running on the thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(new Handler(Looper.getMainLooper()), "main thread")));
// Add checker for shared UI thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(UiThread.getHandler(), "ui thread")));
// And also check IO thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(IoThread.getHandler(), "i/o thread")));
// And the display thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(DisplayThread.getHandler(), "display thread")));
// And the animation thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(AnimationThread.getHandler(), "animation thread")));
// And the surface animation thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(SurfaceAnimationThread.getHandler(),
"surface animation thread")));
// Initialize monitor for Binder threads.
addMonitor(new BinderThreadMonitor());//檢測(cè)是否binder線程耗盡
mInterestingJavaPids.add(Process.myPid());
// See the notes on DEFAULT_TIMEOUT.
assert DB ||
DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
mTraceErrorLogger = new TraceErrorLogger();
}
public void start() {
mThread.start();
}
private void run() {
boolean waitedHalf = false;
while (true) {
List<HandlerChecker> blockedCheckers = Collections.emptyList();
String subject = "";
boolean allowRestart = true;
int debuggerWasConnected = 0;
boolean doWaitedHalfDump = false;
// The value of mWatchdogTimeoutMillis might change while we are executing the loop.
// We store the current value to use a consistent value for all handlers.
final long watchdogTimeoutMillis = mWatchdogTimeoutMillis;
final long checkIntervalMillis = watchdogTimeoutMillis / 2;
final ArrayList<Integer> pids;
synchronized (mLock) {
long timeout = checkIntervalMillis;
// Make sure we (re)spin the checkers that have become idle within
// this wait-and-check interval
for (int i=0; i<mHandlerCheckers.size(); i++) {
HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i);
// We pick the watchdog to apply every time we reschedule the checkers. The
// default timeout might have changed since the last run.
hc.checker().scheduleCheckLocked(hc.customTimeoutMillis()
.orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER));//往被監(jiān)測(cè)的looper線程發(fā)消息
}
if (debuggerWasConnected > 0) {
debuggerWasConnected--;
}
// NOTE: We use uptimeMillis() here because we do not want to increment the time we
// wait while asleep. If the device is asleep then the thing that we are waiting
// to timeout on is asleep as well and won't have a chance to run, causing a false
// positive on when to kill things.
long start = SystemClock.uptimeMillis();
while (timeout > 0) {
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
try {
mLock.wait(timeout);//等30s
// Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
} catch (InterruptedException e) {
Log.wtf(TAG, e);
}
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start);
}
final int waitState = evaluateCheckerCompletionLocked();//判斷是否有阻塞
if (waitState == COMPLETED) {
// The monitors have returned; reset
waitedHalf = false;
continue;
} else if (waitState == WAITING) {
// still waiting but within their configured intervals; back off and recheck
continue;
} else if (waitState == WAITED_HALF) {
if (!waitedHalf) {
Slog.i(TAG, "WAITED_HALF");
waitedHalf = true;
// We've waited half, but we'd need to do the stack trace dump w/o the lock.
blockedCheckers = getCheckersWithStateLocked(WAITED_HALF);//找到被阻塞的looper
subject = describeCheckersLocked(blockedCheckers);
pids = new ArrayList<>(mInterestingJavaPids);
doWaitedHalfDump = true;
} else {
continue;
}
} else {
// something is overdue!
blockedCheckers = getCheckersWithStateLocked(OVERDUE);
subject = describeCheckersLocked(blockedCheckers);
allowRestart = mAllowRestart;
pids = new ArrayList<>(mInterestingJavaPids);
}
} // END synchronized (mLock)
// If we got here, that means that the system is most likely hung.
//
// First collect stack traces from all threads of the system process.
//
// Then, if we reached the full timeout, kill this process so that the system will
// restart. If we reached half of the timeout, just log some information and continue.
logWatchog(doWaitedHalfDump, subject, pids);//打日志世曾,觸發(fā)生成trace.txt
if (doWaitedHalfDump) {
// We have waited for only half of the timeout, we continue to wait for the duration
// of the full timeout before killing the process.
continue;
}
IActivityController controller;
synchronized (mLock) {
controller = mController;
}
if (controller != null) {
Slog.i(TAG, "Reporting stuck state to activity controller");
try {
Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
// 1 = keep waiting, -1 = kill system
int res = controller.systemNotResponding(subject);
if (res >= 0) {
Slog.i(TAG, "Activity controller requested to coninue to wait");
waitedHalf = false;
continue;
}
} catch (RemoteException e) {
}
}
// Only kill the process if the debugger is not attached.
if (Debug.isDebuggerConnected()) {//判斷當(dāng)前是否處于debugger調(diào)試裕循,排除debugger調(diào)試引起的超時(shí)
debuggerWasConnected = 2;
}
if (debuggerWasConnected >= 2) {
Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
} else if (debuggerWasConnected > 0) {
Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
Slog.w(TAG, "*** GOODBYE!");
if (!Build.IS_USER && isCrashLoopFound()
&& !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
breakCrashLoop();
}
Process.killProcess(Process.myPid());//重啟system_server
System.exit(10);
}
waitedHalf = false;
}
}
public final class HandlerChecker implements Runnable {
public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
mWaitMax = handlerCheckerTimeoutMillis;
if (mCompleted) {
// Safe to update monitors in queue, Handler is not in the middle of work
mMonitors.addAll(mMonitorQueue);
mMonitorQueue.clear();
}
if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
|| (mPauseCount > 0)) {
// Don't schedule until after resume OR
// If the target looper has recently been polling, then
// there is no reason to enqueue our checker on it since that
// is as good as it not being deadlocked. This avoid having
// to do a context switch to check the thread. Note that we
// only do this if we have no monitors since those would need to
// be executed at this point.
mCompleted = true;
return;
}
if (!mCompleted) {
// we already have a check in flight, so no need
return;
}
mCompleted = false;
mCurrentMonitor = null;
mStartTime = SystemClock.uptimeMillis();
mHandler.postAtFrontOfQueue(this);//發(fā)消息
}
public int getCompletionStateLocked() {//判斷是否阻塞
if (mCompleted) {
return COMPLETED;
} else {
long latency = SystemClock.uptimeMillis() - mStartTime;
if (latency < mWaitMax/2) {
return WAITING;
} else if (latency < mWaitMax) {
return WAITED_HALF;
}
}
return OVERDUE;
}
public void run() {
// Once we get here, we ensure that mMonitors does not change even if we call
// #addMonitorLocked because we first add the new monitors to mMonitorQueue and
// move them to mMonitors on the next schedule when mCompleted is true, at which
// point we have completed execution of this method.
final int size = mMonitors.size();
for (int i = 0 ; i < size ; i++) {
synchronized (mLock) {
mCurrentMonitor = mMonitors.get(i);
}
mCurrentMonitor.monitor();//判斷是否死鎖
}
synchronized (mLock) {
mCompleted = true;
mCurrentMonitor = null;
}
}
}
總結(jié):
Watchdog的主要流程是:開啟一個(gè)死循環(huán)时迫,不斷給指定線程發(fā)送一條消息椎扬,然后休眠30秒惫搏,休眠結(jié)束后判斷是否收到消息的回調(diào),如果有蚕涤,則正常進(jìn)行下次循環(huán)筐赔,如果沒(méi)收到,判斷從發(fā)消息到現(xiàn)在的時(shí)機(jī)小于30秒不處理揖铜,大于30秒小于60秒收集信息茴丰,大于60秒收集信息并重啟。
問(wèn)題分析解決思路
跟anr解決的思路一致天吓,看log贿肩,看trace.txt,注意cpu龄寞、io汰规、鎖、binder調(diào)用甚至是Thread.sleep等常見的因素物邑×锵可以參考https://blog.csdn.net/wd229047557/article/details/108059481