Android SSWD(system server Watchdog)工作原理
阿新 • • 發佈:2019-02-14
簡介
一直在想如何介紹Watchdog,思來想去還是原始碼中的註釋比較給力:“This class calls its monitor every minute. Killing this process if they don't return”,簡單粗暴。Android系統為了保證系統的穩定性,搞了這麼個Watchdog,專門負責監控Android系統的一些核心服務和執行緒,在這些服務和執行緒發生異常或者block時進行重啟,並儲存問題發生時的現場。Watchdog分為兩種:檢測硬體的hardware watchdog,以及檢測system server關鍵服務和執行緒的system server watchdog(下面簡稱為sswd)。本文主要結合Android P的程式碼分析後者的原理。
SSWD檢測的物件是什麼?
使用gdb工具從coredump解析出了系統watchdog執行緒中mHandlerCheckers集合的資料,便可以獲取sswd檢測的服務和執行緒
Watchdog監聽的系統關鍵執行緒
[000] = 0x184cbaa0 Lcom/android/server/Watchdog$HandlerChecker; foreground thread
[001] = 0x184cbf70 Lcom/android/server/Watchdog$HandlerChecker; main thread
[002] = 0x184cbfa0 Lcom/android/server/Watchdog$HandlerChecker; ui thread
[003] = 0x184cbfd0 Lcom/android/server/Watchdog$HandlerChecker; i/o thread
[004] = 0x184cc000 Lcom/android/server/Watchdog$HandlerChecker; display thread
[005] = 0x184cc030 Lcom/android/server/Watchdog$HandlerChecker; ActivityManager
[006] = 0x184cc060 Lcom/android/server/Watchdog$HandlerChecker; PowerManagerService
[007] = 0x184cc090 Lcom/android/server/Watchdog$HandlerChecker; main //同main thread
[008] = 0x184cc0c0 Lcom/android/server/Watchdog$HandlerChecker; PackageManager
[009] = 0x184cc0f0 Lcom/android/server/Watchdog$HandlerChecker; PackageManager //同上
fg->mMonitors(deadlock監聽)核心服務
[000] = 0x184cbf30 Lcom/android/server/Watchdog$BinderThreadMonitor;
[001] = 0x15b00a80 Lcom/android/server/am/ActivityManagerService;
[002] = 0x15b1f770 Lcom/android/server/power/PowerManagerService;
[003] = 0x172759f0 Lcom/sonymobile/server/mirrorpowersave/LcdPowerSaveService;
[004] = 0x15b02220 Lcom/android/server/wm/WindowManagerService;
[005] = 0x15e4ee58 Lcom/android/server/input/InputManagerService;
[006] = 0x15e78220 Lcom/android/server/NetworkManagementService;
[007] = 0x18028bf8 Lcom/android/server/media/MediaSessionService;
[008] = 0x1726a8b0 Lcom/android/server/media/MediaRouterService;
[009] = 0x13f0d010 Lcom/android/server/media/projection/MediaProjectionManagerService;
SSWD的工作原理
設定檢測超時時間為60s,通過四種狀態判定系統服務和執行緒的工作狀態,自旋修改自身的狀態
- COMPLETED:狀態很好,無block
- WAITING:檢測仍在進行中,且等待時間尚未超過30s
- WAITED_HALF:已等待超過30s但仍在60s內,此時會先dump一次當前程序(system_server)的stack trace,並更新cpu使用資訊
- OVERDUE:超時,儲存超時現場,執行重啟
核心程式碼解釋
檢測演算法
/**
 * Main loop of the watchdog thread.
 *
 * Each iteration schedules a check on every HandlerChecker, waits for
 * CHECK_INTERVAL of uptime, and then evaluates the checkers' completion state.
 * While the state is COMPLETED, WAITING or WAITED_HALF the loop just continues
 * (on the first WAITED_HALF it also dumps stack traces and updates the CPU
 * tracker). Otherwise, or when the open-FD monitor reports a trigger, it
 * collects the blocked checkers and, unless a debugger is attached or restarts
 * are disallowed, kills the current process.
 *
 * NOTE(review): debuggerWasConnected is declared inside the loop and reset to 0
 * on every iteration, so within this excerpt it can only be 0 or 2 when tested;
 * the decrement and the "Debugger was connected" branch look unreachable —
 * confirm whether the declaration was meant to live outside the loop.
 * NOTE(review): initialStack is assigned below but never read in this excerpt.
 */
@Override
public void run() {
boolean waitedHalf = false;
File initialStack = null;
final ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(true);
processCpuTracker.init();
while (true) {
final List<HandlerChecker> blockedCheckers;// checkers found blocked in this iteration
final String subject;
final boolean allowRestart;
int debuggerWasConnected = 0;
synchronized (this) {
long timeout = CHECK_INTERVAL;// remaining wait before the checkers are evaluated
// Make sure we (re)spin the checkers that have become idle within
// this wait-and-check interval
for (int i=0; i<mHandlerCheckers.size(); i++) {
HandlerChecker hc = mHandlerCheckers.get(i);
hc.scheduleCheckLocked();// schedule a check on this checker
}
if (debuggerWasConnected > 0) {
debuggerWasConnected--;
}
// NOTE: We use uptimeMillis() here because we do not want to increment the time we
// wait while asleep. If the device is asleep then the thing that we are waiting
// to timeout on is asleep as well and won't have a chance to run, causing a false
// positive on when to kill things.
long start = SystemClock.uptimeMillis();// start of this wait interval
while (timeout > 0) {
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
try {
wait(timeout);// block for the rest of the interval (30s according to the surrounding article; CHECK_INTERVAL's value is not shown here)
} catch (InterruptedException e) {
Log.wtf(TAG, e);
}
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
}// interval elapsed; go on to evaluate the checkers
boolean fdLimitTriggered = false;
if (mOpenFdMonitor != null) {
fdLimitTriggered = mOpenFdMonitor.monitor();
}
// Core detection logic.
// The allowed wait is split into two halves; the evaluated state is handled as one of
// four cases: COMPLETED, WAITING, WAITED_HALF, or (falling through) overdue.
if (!fdLimitTriggered) {
final int waitState = evaluateCheckerCompletionLocked();// current completion state of the checkers
if (waitState == COMPLETED) {// everything finished; start the next round
// The monitors have returned; reset
waitedHalf = false;
continue;
} else if (waitState == WAITING) {// a check is still in progress
// still waiting but within their configured intervals; back off and recheck
continue;
} else if (waitState == WAITED_HALF) {// waited past the first half of the allowed time
if (!waitedHalf) {// first time in this state: dump stack traces and update the CPU tracker
// We've waited half the deadlock-detection interval. Pull a stack
// trace and wait another half.
ArrayList<Integer> pids = new ArrayList<Integer>();
pids.add(Process.myPid());
initialStack = ActivityManagerService.dumpStackTraces(true, pids,
null, null, getInterestingNativePids());
waitedHalf = true;
processCpuTracker.update();
}
continue;
}
// something is overdue! Collect the blocked checkers and build a description of them.
blockedCheckers = getBlockedCheckersLocked();
subject = describeCheckersLocked(blockedCheckers);
} else {
blockedCheckers = Collections.emptyList();
subject = "Open FD high water mark reached";
}
allowRestart = mAllowRestart;
}
// Only kill the process if the debugger is not attached.
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
if (debuggerWasConnected >= 2) {
Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
} else if (debuggerWasConnected > 0) {
Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
Slog.w(TAG, "*** GOODBYE!");
// Check if we should do system dump or not
if (errorHandlingInfo.mSystemDump) {
mActivity.forceCrashDump(errorHandlingInfo);
}
Process.killProcess(Process.myPid());
System.exit(10);// exit this process with status 10
}
waitedHalf = false;
}
}
檢測關鍵類HandlerChecker
/**
 * Checks whether one handler thread is still making progress, and optionally
 * runs a list of {@link Monitor}s on that thread.
 *
 * <p>A check is started by posting this object to the front of the handler's
 * queue; when {@link #run()} finishes, the check is marked completed. Methods
 * whose names end in {@code Locked} are expected to be called with the
 * enclosing Watchdog's lock held (the methods themselves do not acquire it).
 */
public final class HandlerChecker implements Runnable {
    /** Handler of the thread being checked. */
    private final Handler mHandler;
    /** Human-readable name used in blocked-state descriptions. */
    private final String mName;
    /** Maximum time a check may stay in flight, in uptime milliseconds. */
    private final long mWaitMax;
    /**
     * Monitors executed, in order, each time the check runs.
     * NOTE(review): per the surrounding article only the foreground-thread
     * checker has monitors registered — not verifiable from this class alone.
     */
    private final ArrayList<Monitor> mMonitors = new ArrayList<>();

    /** True when no check is currently in flight. */
    private boolean mCompleted;
    /** Monitor currently being executed by {@link #run()}, or null. */
    private Monitor mCurrentMonitor;
    /** Uptime timestamp at which the in-flight check was scheduled. */
    private long mStartTime;

    /**
     * @param handler handler of the thread to check
     * @param name name reported for this checker
     * @param waitMaxMillis maximum allowed duration of one check, in milliseconds
     */
    HandlerChecker(Handler handler, String name, long waitMaxMillis) {
        mHandler = handler;
        mName = name;
        mWaitMax = waitMaxMillis;
        mCompleted = true;
    }

    /** Registers a monitor to be run as part of every check. */
    public void addMonitor(Monitor monitor) {
        mMonitors.add(monitor);
    }

    /**
     * Starts a new check unless one is already in flight.
     *
     * <p>When there are no monitors and the target looper's queue reports that
     * it is polling, the check is considered completed immediately and nothing
     * is posted, which avoids a context switch to the checked thread.
     */
    public void scheduleCheckLocked() {
        final boolean hasNoMonitors = mMonitors.isEmpty();
        if (hasNoMonitors && mHandler.getLooper().getQueue().isPolling()) {
            mCompleted = true;
            return;
        }

        // Only start a new check when the previous one has finished.
        if (mCompleted) {
            mCompleted = false;
            mCurrentMonitor = null;
            mStartTime = SystemClock.uptimeMillis();
            // Post at the front of the target thread's message queue.
            mHandler.postAtFrontOfQueue(this);
        }
    }

    /**
     * @return true when a check is in flight and has been running for longer
     *         than the configured maximum
     */
    public boolean isOverdueLocked() {
        if (mCompleted) {
            return false;
        }
        final long deadline = mStartTime + mWaitMax;
        return SystemClock.uptimeMillis() > deadline;
    }

    /**
     * Classifies the in-flight check by how long it has been running.
     *
     * @return COMPLETED when no check is in flight; WAITING while under half of
     *         the maximum wait; WAITED_HALF while under the maximum wait;
     *         OVERDUE otherwise
     */
    public int getCompletionStateLocked() {
        if (mCompleted) {
            return COMPLETED;
        }
        final long elapsed = SystemClock.uptimeMillis() - mStartTime;
        if (elapsed < mWaitMax / 2) {
            return WAITING;
        }
        if (elapsed < mWaitMax) {
            return WAITED_HALF;
        }
        return OVERDUE;
    }

    /** @return the thread that owns the checked handler's looper */
    public Thread getThread() {
        return mHandler.getLooper().getThread();
    }

    /** @return the name given to this checker at construction */
    public String getName() {
        return mName;
    }

    /**
     * @return a description of where the check is stuck: in a specific monitor
     *         when one is recorded as current, otherwise in the handler itself
     */
    public String describeBlockedStateLocked() {
        if (mCurrentMonitor != null) {
            return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
                    + " on " + mName + " (" + getThread().getName() + ")";
        }
        return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
    }

    /**
     * Body of the check, executed on the checked thread.
     *
     * <p>Runs every registered monitor in order, recording the current one
     * under the Watchdog lock before invoking it, then marks the check as
     * completed. Reaching the end means the posted message was processed and
     * every {@code monitor()} call returned.
     * NOTE(review): per the surrounding article, {@code monitor()} tries to
     * acquire the monitored service's lock — not shown in this class.
     */
    @Override
    public void run() {
        // Phase 1: run the monitors (the count is captured once up front).
        final int monitorCount = mMonitors.size();
        int index = 0;
        while (index < monitorCount) {
            synchronized (Watchdog.this) {
                mCurrentMonitor = mMonitors.get(index);
            }
            mCurrentMonitor.monitor();
            index++;
        }

        // Phase 2: nothing blocked; mark the check completed and clear the record.
        synchronized (Watchdog.this) {
            mCompleted = true;
            mCurrentMonitor = null;
        }
    }
}
總結
當我們理解了SSWD的原理,會發現其實也並沒有什麼。總結一句話:SSWD會每間隔30s檢測一下系統關鍵的服務和執行緒,當出現60s超時時,重啟SystemServer程序。在實際的開發測試中,遇到SSWD的問題時,我們還是需要依據log資訊準確地分析原因。我們知道有兩種SSWD問題,一種是死鎖,一種是執行緒block;有些時候雖然爆出的問題看似是死鎖或者執行緒block,但實際上是由於一些其他原因導致的。