一、Xcrash簡介
xcrash是愛奇藝在2019年4月開源在GitHub上的穩定性日志收集框架，它能為android收集java crash、native crash、anr日志。不需要root權限和系統權限。支持 Android 4.0 - 10(API level 14 - 29)，支持 armeabi，armeabi-v7a，arm64-v8a，x86 和 x86_64。
二、Xcrash架構
三、Xcrash類圖
xcrash作為門面模式的入口，client調用通過配置InitParameters來進行初始化。Xcrash分別關聯三種類型Handler來處理對應的崩潰監聽和日志收集，通過FileManager和TombstoneManager對崩潰日志進行tombstone文件管理。client調用TombstoneParser來解析本地生成的對應tombstone文件，獲取數據。
四、捕獲Java崩潰
Java層的崩潰可以直接交給JVM的崩潰捕獲機制去處理。這個非常簡單，不贅述。
// Install this object as the default handler for uncaught Java exceptions.
Thread.setDefaultUncaughtExceptionHandler(this);
如果有java crash發生，會回調uncaughtException，執行handleException收集相關log信息。
/**
 * Handles an uncaught Java exception (excerpt; elided parts are marked with "..." / "…").
 *
 * Visible steps: notify the native and ANR handlers that a Java crash happened,
 * create a log file, write the collected sections to it, and finally invoke the
 * client's ICrashCallback.
 *
 * @param thread    the thread the exception was raised on; passed to getOtherThreadsInfo
 * @param throwable the uncaught exception (its use is in the elided part)
 */
private void handleException(Thread thread, Throwable throwable) {
...
// notify the other handlers that a Java crash happened
NativeHandler.getInstance().notifyJavaCrashed();
AnrHandler.getInstance().notifyJavaCrashed();
// create the log file (per the original note: under data/data/packageName/files/tombstones)
logFile = FileManager.getInstance().createLogFile(logPath);
...
// write info to log file
if (logFile != null) {
…
// write the emergency text (the java stacktrace, per the original comment)
raf.write(emergency.getBytes("UTF-8"));
// write logcat output for the main, system and events buffers
raf.write(Util.getLogcat(logcatMainLines, logcatSystemLines, logcatEventsLines).getBytes("UTF-8"));
// write fds
raf.write(Util.getFds().getBytes("UTF-8"));
// write network info
raf.write(Util.getNetworkInfo().getBytes("UTF-8"));
// write memory info
raf.write(Util.getMemoryInfo().getBytes("UTF-8"));
// write whether the application is in the foreground ("yes" / "no")
raf.write(("foreground:\n" + (ActivityMonitor.getInstance().isApplicationForeground() ? "yes" : "no") + "\n\n").getBytes("UTF-8"));
// write other threads info (only when dumpAllThreads is set)
if (dumpAllThreads) {
raf.write(getOtherThreadsInfo(thread).getBytes("UTF-8"));
}
}
// invoke the client's ICrashCallback.onCrash; the log path is null when no file was created
if (callback != null) {
try {
callback.onCrash(logFile == null ? null : logFile.getAbsolutePath(), emergency);
} catch (Exception ignored) {
// exceptions thrown by the client callback are deliberately ignored here
}
}
}
五、捕獲Native崩潰
Crash.java
// Initialization entry point (excerpt): among the elided steps, it initializes the NativeHandler singleton.
public static synchronized int init(Context ctx, InitParameters params) {
…
NativeHandler.getInstance().initialize(...)
...
}
NativeHandler.java
// NativeHandler initialization (excerpt): loads the "xcrash" native library,
// then calls the native method nativeInit. Arguments and error handling are elided.
int initialize(...) {
//load lib
System.loadLibrary("xcrash");
...
//init native lib
try {
int r = nativeInit(...);
}
...
}
NativeHandler在Xcrash init時會執行initialize方法進行初始化，初始化過程首先通過System.loadLibrary("xcrash")注冊native函數，其次就是調用nativeInit。
執行System.loadLibrary("xcrash")，JNI_OnLoad會被回調，這里是動態注冊玩法。
xc_jni.c
// JNI load hook (excerpt): registers the xc_jni_methods table on class `cls`.
// Returns -1 when RegisterNatives reports a failure, otherwise XC_JNI_VERSION.
JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *vm, void *reserved)
{
...
// the method count is derived from the array size, so the table can grow without edits here
if((*env)->RegisterNatives(env, cls, xc_jni_methods, sizeof(xc_jni_methods) / sizeof(xc_jni_methods[0]))) return -1;
...
return XC_JNI_VERSION;
}
數組0元素對應:
// Table of native methods registered from JNI_OnLoad (excerpt; later entries elided).
// Entry 0 binds the Java method "nativeInit" to the C function xc_jni_init.
// The JNI signature string below is split one parameter per line:
// I = int, Z = boolean, Ljava/lang/String; = String, [Ljava/lang/String; = String[].
static JNINativeMethod xc_jni_methods[] = {
{
"nativeInit",
"("
"I"
"Ljava/lang/String;"
"Ljava/lang/String;"
"Ljava/lang/String;"
"Ljava/lang/String;"
"Ljava/lang/String;"
"Ljava/lang/String;"
"Ljava/lang/String;"
"Ljava/lang/String;"
"Ljava/lang/String;"
"Ljava/lang/String;"
"Z"
"Z"
"I"
"I"
"I"
"Z"
"Z"
"Z"
"Z"
"Z"
"I"
"[Ljava/lang/String;"
"Z"
"Z"
"I"
"I"
"I"
"Z"
"Z"
")"
"I", // return type: int
(void *)xc_jni_init
},
…
}
java層調用nativeInit，native xc_jni_init會被調用。
接著看nativeInit邏輯
xc_jni.c
// Native side of nativeInit (excerpt): runs the common, crash and trace initializers.
// Returns 0 only when both xc_crash_init and xc_trace_init returned 0, otherwise XCC_ERRNO_JNI.
static jint xc_jni_init(...)
{
...
//common init
xc_common_init(…);// common information setup: system info, application info, process info, etc.
...
//crash init: capture crash logs
r_crash = xc_crash_init(…);
...
//trace init: capture ANR logs
r_trace = xc_trace_init(...);
}
...
return (0 == r_crash && 0 == r_trace) ? 0 : XCC_ERRNO_JNI;
}
先看xc_crash_init
// Crash-capture initialization (excerpt): prepares the JNI callback, then registers
// xc_crash_signal_handler and returns the result of that registration.
int xc_crash_init(){
…
//init for JNI callback
xc_crash_init_callback(env);// (1) set up the callback from the native signal path to Java via JNI
…
//register signal handler
return xcc_signal_crash_register(xc_crash_signal_handler);// (2) register the signal handler so the corresponding signals are delivered to it
}
1)設置callback:
xc_crash_init_callback最終回調的是NativeHandler的crashCallback
/**
 * Java-side callback for a native crash (excerpt; elided parts marked "...").
 *
 * When a log path is given, appends extra sections to that tombstone file, then
 * forwards the event to the ICrashCallback held by NativeHandler, if any.
 *
 * @param logPath   path of the tombstone file; sections are appended only when non-empty
 * @param emergency text passed through unchanged to ICrashCallback.onCrash
 */
private static void crashCallback(String logPath, String emergency, boolean dumpJavaStacktrace, boolean isMainThread, String threadName) {
if (!TextUtils.isEmpty(logPath)) {
//append java stacktrace (the `stacktrace` value is produced in the elided part)
TombstoneManager.appendSection(logPath, "java stacktrace", stacktrace);
...
//append memory info
TombstoneManager.appendSection(logPath, "memory info", Util.getProcessMemoryInfo());
//append background / foreground
TombstoneManager.appendSection(logPath, "foreground", ActivityMonitor.getInstance().isApplicationForeground() ? "yes" : "no");
}
//finally call back into the ICrashCallback.onCrash registered by the client
ICrashCallback callback = NativeHandler.getInstance().crashCallback;
if (callback != null) {
callback.onCrash(logPath, emergency);
}
...
}
2)信號注冊:
// Signals treated as native crashes. Only signum is initialized here; the oldact
// field of each entry is filled in by xcc_signal_crash_register.
static xcc_signal_crash_info_t xcc_signal_crash_info[] =
{
{.signum = SIGABRT},// raised by abort(); indicates abnormal program termination
{.signum = SIGBUS},// bad address, including memory alignment errors
{.signum = SIGFPE},// arithmetic error, e.g. division by zero or overflow
{.signum = SIGILL},// illegal instruction
{.signum = SIGSEGV},// invalid memory access
{.signum = SIGTRAP},// raised on breakpoints; used by debuggers
{.signum = SIGSYS},// invalid system call
{.signum = SIGSTKFLT}// stack fault on coprocessor
};
/*
 * Installs `handler` for every signal listed in xcc_signal_crash_info.
 *
 * An alternate signal stack of XCC_SIGNAL_CRASH_STACK_SIZE bytes is set up first and
 * the handler is installed with SA_ONSTACK. All signals are blocked while the handler
 * runs (sa_mask is filled). The previous disposition of each signal is stored in the
 * matching entry's oldact field.
 *
 * Returns 0 on success, XCC_ERRNO_NOMEM if the stack buffer cannot be allocated,
 * or XCC_ERRNO_SYS if sigaltstack() or sigaction() fails.
 */
int xcc_signal_crash_register(void (*handler)(int, siginfo_t *, void *))
{
    stack_t          ss;
    struct sigaction act;
    size_t           i;

    if(NULL == (ss.ss_sp = calloc(1, XCC_SIGNAL_CRASH_STACK_SIZE))) return XCC_ERRNO_NOMEM;
    ss.ss_size  = XCC_SIGNAL_CRASH_STACK_SIZE;
    ss.ss_flags = 0;
    if(0 != sigaltstack(&ss, NULL))
    {
        //the buffer was not installed as the alternate stack, so release it instead of leaking it
        free(ss.ss_sp);
        return XCC_ERRNO_SYS;
    }

    memset(&act, 0, sizeof(act));
    sigfillset(&act.sa_mask);
    act.sa_sigaction = handler; //the callback invoked for each registered signal
    act.sa_flags = SA_RESTART | SA_SIGINFO | SA_ONSTACK;

    //register the handler for every signal in the table via sigaction
    for(i = 0; i < sizeof(xcc_signal_crash_info) / sizeof(xcc_signal_crash_info[0]); i++)
    {
        if(0 != sigaction(xcc_signal_crash_info[i].signum, &act, &(xcc_signal_crash_info[i].oldact)))
            return XCC_ERRNO_SYS;
    }

    return 0;
}
注冊的是函數指針：xc_crash_signal_handler，追過去看看:
// Signal handler for native crashes (excerpt; elided parts marked "...").
// Visible steps: open the crash log file, fork a dumper process, then run the JNI callback.
// Fix: the parameter list was missing its closing parenthesis.
static void xc_crash_signal_handler(int sig, siginfo_t *si, void *uc)
{
    ...
    //create and open log file; bail out to `end` (in the elided part) on failure
    if((xc_crash_log_fd = xc_common_open_crash_log(xc_crash_log_pathname, sizeof(xc_crash_log_pathname), &xc_crash_log_from_placeholder)) < 0) goto end;
    ...
    //spawn crash dumper process: a separate process performs the dump
    pid_t dumper_pid = xc_crash_fork(xc_crash_exec_dumper);
    ...
    //JNI callback: afterwards, call back from native code into Java
    xc_crash_callback();
    ...
}
進入xc_crash_exec_dumper這個函數指針，看看進程dump操作:
// Entry function of the forked dumper process (excerpt): replaces the process image
// with the dumper executable at xc_crash_dumper_pathname.
static int xc_crash_exec_dumper(void *arg)
{
…
//argv[0] is XCC_UTIL_XCRASH_DUMPER_FILENAME, i.e. #define XCC_UTIL_XCRASH_DUMPER_FILENAME "libxcrash_dumper.so"
execl(xc_crash_dumper_pathname, XCC_UTIL_XCRASH_DUMPER_FILENAME, NULL);
//execl only returns on failure; report errno offset by 100
return 100 + errno;
}
這個部分是做各種數據的dump。簡單找下main方法:
xcd_core.c
// Entry point of the dumper executable (excerpt; elided parts marked "...").
// Each failing step exits with its own code (1..6) so the failing stage can be identified.
// Fix: removed the unbalanced extra ')' in the xcd_process_create() check.
int main(int argc, char** argv)
{
    ...
    //read args from stdin
    if(0 != xcd_core_read_args()) exit(1);

    //open log file
    if(0 > (xcd_core_log_fd = XCC_UTIL_TEMP_FAILURE_RETRY(open(xcd_core_log_pathname, O_WRONLY | O_CLOEXEC)))) exit(2);

    //register signal handler for catching self-crashing
    xcc_unwind_init(xcd_core_spot.api_level);
    xcc_signal_crash_register(xcd_core_signal_handler);

    //create process object
    if(0 != xcd_process_create()) exit(3);

    //suspend all threads in the process
    xcd_process_suspend_threads(xcd_core_proc);

    //load process info
    if(0 != xcd_process_load_info(xcd_core_proc)) exit(4);

    //record system info
    if(0 != xcd_sys_record(...)) exit(5);

    //record process info
    if(0 != xcd_process_record(...)) exit(6);

    //resume all threads in the process
    xcd_process_resume_threads(xcd_core_proc);
    ...
}
不細看了，整個過程先是掛起crash進程的所有線程，然后收集相關log，最后resume所有線程。
xc_trace_init部分不分析了，與xc_jni_init分析方法一致。這里也就簡單分析了個大脈絡。
Native崩潰處理步驟總結:
- 注冊信號處理函數(signal handler)。
- 崩潰發生時創建子進程收集信息(避免在崩潰進程調用函數的系統限制)。
- suspend崩潰進程中所有的線程，暫停logcat輸出，收集logcat。
- 收集backtrace等信息。
- 收集內存數據。
- 完成后恢復線程。
六、捕獲ANR
同樣在Xcrash init時初始化
Crash.java
// Initialization entry point (excerpt, ANR part): the Java AnrHandler is only
// initialized when it is enabled in params and the device API level is below 21.
public static synchronized int init(Context ctx, InitParameters params) {
//init ANR handler (API level < 21)
if (params.enableAnrHandler && Build.VERSION.SDK_INT < 21) {
AnrHandler.getInstance().initialize(...);
}
}
這里有個限制，是sdk <21的版本才抓取。
AnrHandler.java
/**
 * Sets up ANR detection by watching the /data/anr/ directory (excerpt; elided parts marked "...").
 *
 * Does nothing on API level 21 and above. Otherwise creates a FileObserver for
 * CLOSE_WRITE events and calls handleAnr for every reported path containing "trace".
 * If starting the observer throws, the observer is dropped and the error is logged.
 */
void initialize(Context ctx, int pid, String processName, String appId, String appVersion, String logDir,
boolean checkProcessState, int logcatSystemLines, int logcatEventsLines, int logcatMainLines,
boolean dumpFds, boolean dumpNetworkInfo, ICrashCallback callback) {
//check API level
if (Build.VERSION.SDK_INT >= 21) {
return;
}
...
//FileObserver monitors the file system; here it watches /data/anr/ for trace files being written
fileObserver = new FileObserver("/data/anr/", CLOSE_WRITE) {
public void onEvent(int event, String path) {
try {
if (path != null) {
String filepath = "/data/anr/" + path;
if (filepath.contains("trace")) {
//observer callback: handle the ANR
handleAnr(filepath);
}
}
} catch (Exception e) {
XCrash.getLogger().e(Util.TAG, "AnrHandler fileObserver onEvent failed", e);
}
}
};
try {
//start watching
fileObserver.startWatching();
} catch (Exception e) {
fileObserver = null;
XCrash.getLogger().e(Util.TAG, "AnrHandler fileObserver startWatching failed", e);
}
}
高版本系統已經沒有讀取/data/anr/的權限了，因此FileObserver監聽/data/anr/的方案只能支持<21的版本，而對>=21的版本，這種方式無法獲取anr日志。
然后看看handleAnr收集了哪些數據:
/**
 * Handles one ANR notification (excerpt; declarations of logFile, raf and emergency are not shown).
 *
 * Returns early when the previous ANR was handled less than anrTimeoutMs ago, or when
 * process-state checking is enabled and Util.checkProcessAnrState returns false.
 * Otherwise it creates a log file, writes the collected sections, and invokes the callback.
 *
 * @param filepath path of the trace file that triggered the notification
 */
private void handleAnr(String filepath) {
Date anrTime = new Date();
//check ANR time interval
if (anrTime.getTime() - lastTime < anrTimeoutMs) {
return;
}
//check process error state
if (this.checkProcessState) {
if (!Util.checkProcessAnrState(this.ctx, anrTimeoutMs)) {
return;
}
}
//create log file
logFile = FileManager.getInstance().createLogFile(logPath);
//write info to log file
//write emergency info
raf.write(emergency.getBytes("UTF-8"));
//write logcat
raf.write(Util.getLogcat(logcatMainLines, logcatSystemLines, logcatEventsLines).getBytes("UTF-8"));
//write fds
raf.write(Util.getFds().getBytes("UTF-8"));
//write network info
raf.write(Util.getNetworkInfo().getBytes("UTF-8"));
//write memory info
raf.write(Util.getMemoryInfo().getBytes("UTF-8"));
//callback; the log path is null when no file was created
if (callback != null) {
try {
callback.onCrash(logFile == null ? null : logFile.getAbsolutePath(), emergency);
} catch (Exception ignored) {
//exceptions thrown by the client callback are deliberately ignored here
}
}
}
這里重點關注checkProcessAnrState，它調用的getProcessesInErrorState是AMS對外暴露的api，從AMS的mLruProcesses中過濾出crash和anr異常的進程，返回對應的錯誤信息。補充cause reason部分，也就是ANR in。
/**
 * Polls the ActivityManager to find out whether the current process is flagged as not responding.
 *
 * The error-state list is queried once per iteration, followed by a 500 ms sleep,
 * for timeoutMs / 500 iterations.
 *
 * @param ctx       context used to obtain the ActivityManager service
 * @param timeoutMs overall polling budget in milliseconds
 * @return true as soon as an entry with this process's pid and condition NOT_RESPONDING
 *         is seen; false if the service is unavailable or no such entry appears in time
 */
static boolean checkProcessAnrState(Context ctx, long timeoutMs) {
    final ActivityManager activityManager =
            (ActivityManager) ctx.getSystemService(Context.ACTIVITY_SERVICE);
    if (activityManager == null) {
        return false;
    }

    final int myPid = android.os.Process.myPid();
    final long attempts = timeoutMs / 500;

    int attempt = 0;
    while (attempt < attempts) {
        List<ActivityManager.ProcessErrorStateInfo> errorStates =
                activityManager.getProcessesInErrorState();
        if (errorStates != null) {
            for (ActivityManager.ProcessErrorStateInfo info : errorStates) {
                if (info.pid != myPid) {
                    continue;
                }
                if (info.condition == ActivityManager.ProcessErrorStateInfo.NOT_RESPONDING) {
                    return true;
                }
            }
        }

        // wait before the next poll; a failed sleep is ignored and polling simply continues
        try {
            Thread.sleep(500);
        } catch (Exception ignored) {
        }
        attempt++;
    }

    return false;
}
那么>=21版本的anr如何抓取？
//init native crash handler / ANR handler (API level >= 21)
//the native handler is initialized when native crash capture is enabled, or when
//ANR capture is enabled on API level 21+; otherwise r keeps the value Errno.OK
int r = Errno.OK;
if (params.enableNativeCrashHandler || (params.enableAnrHandler && Build.VERSION.SDK_INT >= 21)) {
r = NativeHandler.getInstance().initialize(...);
}
是通過nativeHandler來抓的。也就是前面提到的
//trace init: capture ANR logs
r_trace = xc_trace_init(...);
它是native 注冊 SIGQUIT 信號，ANR發生時接收回調去收集ANR信息。
// Trace (ANR) capture initialization (excerpt; the err1/err2 cleanup labels are in the elided part).
// Returns 0 immediately when xc_common_api_level is below 21. Otherwise it prepares the
// JNI callback, creates the eventfd xc_trace_notifier, registers xc_trace_handler,
// and starts the xc_trace_dumper thread.
// Returns XCC_ERRNO_SYS if the eventfd cannot be created.
int xc_trace_init(...)
{
int r;
pthread_t thd;
//capture SIGQUIT only for ART
if(xc_common_api_level < 21) return 0;
...
//init for JNI callback
xc_trace_init_callback(env);
//create event FD
if(0 > (xc_trace_notifier = eventfd(0, EFD_CLOEXEC))) return XCC_ERRNO_SYS;
//register signal handler
if(0 != (r = xcc_signal_trace_register(xc_trace_handler))) goto err2;
//create thread for dump trace
if(0 != (r = pthread_create(&thd, NULL, xc_trace_dumper, NULL))) goto err1;
...
return r;
}
這里xc_trace_notifier是一個eventfd，在handler接收信號回調時被寫
// Signal handler registered by xc_trace_init: it only posts a value of 1 to the
// xc_trace_notifier eventfd (when that descriptor is valid). None of the signal
// arguments are used.
static void xc_trace_handler(int sig, siginfo_t *si, void *uc)
{
    (void)sig;
    (void)si;
    (void)uc;

    //nothing to notify when the eventfd is not available
    if(xc_trace_notifier < 0) return;

    uint64_t one = 1;
    XCC_UTIL_TEMP_FAILURE_RETRY(write(xc_trace_notifier, &one, sizeof(one)));
}
然后xc_trace_dumper線程會解除阻塞狀態開始執行dump任務。
/*
 * Thread routine that produces a trace log each time xc_trace_notifier is signalled.
 *
 * The thread detaches itself, attaches to the Java VM, then loops: it blocks reading
 * the eventfd, removes old trace logs, opens a new log file, writes a header, asks the
 * ART runtime to dump into the file (with stderr temporarily redirected to it), appends
 * logcat / fds / network / memory info, closes the file, optionally re-sends SIGQUIT,
 * and finally calls the Java trace callback with the log path.
 *
 * The loop ends when the process has already crashed (native or Java) or when
 * gettimeofday() fails.
 *
 * Fix: on exit the eventfd is now really closed. The previous code overwrote
 * xc_trace_notifier with -1 before passing it to close(), so close(-1) was called
 * and the descriptor leaked.
 *
 * arg is unused. Always returns NULL.
 */
static void *xc_trace_dumper(void *arg)
{
    JNIEnv         *env = NULL;
    uint64_t        data;
    uint64_t        trace_time;
    int             fd;
    int             notifier_fd;
    struct timeval  tv;
    char            pathname[1024];
    jstring         j_pathname;

    (void)arg;

    pthread_detach(pthread_self());

    JavaVMAttachArgs attach_args = {
        .version = XC_JNI_VERSION,
        .name    = "xcrash_trace_dp",
        .group   = NULL
    };
    if(JNI_OK != (*xc_common_vm)->AttachCurrentThread(xc_common_vm, &env, &attach_args)) goto exit;

    while(1)
    {
        //block here, waiting for sigquit
        XCC_UTIL_TEMP_FAILURE_RETRY(read(xc_trace_notifier, &data, sizeof(data)));

        //check if process already crashed
        if(xc_common_native_crashed || xc_common_java_crashed) break;

        //trace time (microseconds)
        if(0 != gettimeofday(&tv, NULL)) break;
        trace_time = (uint64_t)(tv.tv_sec) * 1000 * 1000 + (uint64_t)tv.tv_usec;

        //Keep only one current trace.
        if(0 != xc_trace_logs_clean()) continue;

        //create and open log file
        if((fd = xc_common_open_trace_log(pathname, sizeof(pathname), trace_time)) < 0) continue;

        //write header info
        if(0 != xc_trace_write_header(fd, trace_time)) goto end;

        //write trace info from ART runtime
        if(0 != xcc_util_write_format(fd, XCC_UTIL_THREAD_SEP"Cmd line: %s\n", xc_common_process_name)) goto end;
        if(0 != xcc_util_write_str(fd, "Mode: ART DumpForSigQuit\n")) goto end;
        if(0 != xc_trace_load_symbols())
        {
            if(0 != xcc_util_write_str(fd, "Failed to load symbols.\n")) goto end;
            goto skip;
        }
        if(0 != xc_trace_check_address_valid())
        {
            if(0 != xcc_util_write_str(fd, "Failed to check runtime address.\n")) goto end;
            goto skip;
        }
        //redirect stderr to the log file for the duration of the runtime dump
        if(dup2(fd, STDERR_FILENO) < 0)
        {
            if(0 != xcc_util_write_str(fd, "Failed to duplicate FD.\n")) goto end;
            goto skip;
        }

        xc_trace_dump_status = XC_TRACE_DUMP_ON_GOING;
        if(sigsetjmp(jmpenv, 1) == 0)
        {
            if(xc_trace_is_lollipop)
                xc_trace_libart_dbg_suspend();
            xc_trace_libart_runtime_dump(*xc_trace_libart_runtime_instance, xc_trace_libcpp_cerr);
            if(xc_trace_is_lollipop)
                xc_trace_libart_dbg_resume();
        }
        else
        {
            //reached through a longjmp back to jmpenv: the runtime dump was abandoned
            fflush(NULL);
            XCD_LOG_WARN("longjmp to skip dumping trace\n");
        }

        //stop sending stderr to the log file
        dup2(xc_common_fd_null, STDERR_FILENO);

    skip:
        if(0 != xcc_util_write_str(fd, "\n"XCC_UTIL_THREAD_END"\n")) goto end;

        //write other info
        if(0 != xcc_util_record_logcat(fd, xc_common_process_id, xc_common_api_level, xc_trace_logcat_system_lines, xc_trace_logcat_events_lines, xc_trace_logcat_main_lines)) goto end;
        if(xc_trace_dump_fds)
            if(0 != xcc_util_record_fds(fd, xc_common_process_id)) goto end;
        if(xc_trace_dump_network_info)
            if(0 != xcc_util_record_network_info(fd, xc_common_process_id, xc_common_api_level)) goto end;
        if(0 != xcc_meminfo_record(fd, xc_common_process_id)) goto end;

    end:
        //close log file
        xc_common_close_trace_log(fd);

        //rethrow SIGQUIT to ART Signal Catcher
        if(xc_trace_rethrow && (XC_TRACE_DUMP_ART_CRASH != xc_trace_dump_status)) xc_trace_send_sigquit();
        xc_trace_dump_status = XC_TRACE_DUMP_END;

        //JNI callback
        //Do we need to implement an emergency buffer for disk exhausted?
        if(NULL == xc_trace_cb_method) continue;
        if(NULL == (j_pathname = (*env)->NewStringUTF(env, pathname))) continue;
        (*env)->CallStaticVoidMethod(env, xc_common_cb_class, xc_trace_cb_method, j_pathname, NULL);
        XC_JNI_IGNORE_PENDING_EXCEPTION();
        (*env)->DeleteLocalRef(env, j_pathname);
    }

    (*xc_common_vm)->DetachCurrentThread(xc_common_vm);

 exit:
    //publish -1 first so xc_trace_handler stops writing, then close the saved descriptor
    notifier_fd = xc_trace_notifier;
    xc_trace_notifier = -1;
    if(notifier_fd >= 0) close(notifier_fd);
    return NULL;
}
本篇文章簡單分析了下xcrash2.5.7源碼，結合之前java crash處理分析和native crash處理分析，對app收集崩潰日志的整個過程有了個全面了解。