调试没有core文件的coredump
阅读原文时间:2023年07月12日阅读:2

  对coredump的分析依赖于core文件,core文件中几乎包含了程序当前的所有状态(堆栈、内存、寄存器等)。然而在实际的线上环境中,由于core文件太大、保存core文件耗时太久,出于线上系统的稳定性与快速恢复考虑,我们往往不会保留core文件。同时,在程序堆栈被破坏的情况下,即使保留了core文件,也无法准确获取程序崩溃时的上下文信息。

  在不保留core文件的情况下,如何获取程序崩溃时的上下文信息(主要是函数调用栈)?

当程序发生内存越界访问等行为时,会触发OS的保护机制,此时OS会产生一个信号(signal)发送给对应的进程。当进程从内核态切换回用户态时,该进程会处理这个信号。此类信号(比如SEGV)的默认处理行为是生成一个coredump文件。

这里会涉及以下几个问题:

1. 保存的core文件在什么地方?

2. core文件,具体会把进程地址空间的哪些内容保存下来?

3. 如何控制core文件的大小?

4. 如果在处理信号的时候,又产生了新的同类信号,该如何处理?

5. 处理信号的代码,是运行在用户态还是内核态?

6. 在一个多线程的程序中,是由哪个线程在处理这个信号?

`/proc/sys/kernel/core_uses_pid` 取值是0或者1,表示是否在core文件名字后面加上进程号

`/proc/$pid/coredump_filter` 设置哪些内存会被dump出来

- (bit 0) anonymous private memory
- (bit 1) anonymous shared memory
- (bit 2) file-backed private memory
- (bit 3) file-backed shared memory
- (bit 4) ELF header pages in file-backed private memory areas (it is effective only if the bit 2 is cleared)
- (bit 5) hugetlb private memory
- (bit 6) hugetlb shared memory
- (bit 7) DAX private memory
- (bit 8) DAX shared memory

`ulimit -c` 决定保存的core文件的大小限制

  需要在自定义的信号处理函数中打印出程序崩溃时候的活跃函数堆栈信息。

这里我们有两种方式:

1.使用backtrace等方法,读取进程堆栈上的信息;

2.在函数调用的同时,用户自己维护一套数据结构,用于保存函数调用链,在信号处理函数中,将这个函数调用链打印出来

eg:

/**/
int bugreportsignal(const int sig)
{
struct sigaction action;
memset(&action, 0, sizeof(action));
action.sa_sigaction = signal_core_bugreport;
action.sa_flags = SA_SIGINFO;
return (-1 != sigaction(sig, &action, NULL));
}

/* Counter consulted by signal_core_bugreport before it writes the stack
 * report. atomic_int is the C11 type from <stdatomic.h>. */
static atomic_int first = 0;
/*
 * Minimal signal handler: prints the number of the received signal on stderr.
 */
static void signal_func_def(int sig)
{
    const int received = sig;

    fprintf(stderr, "recv signal %d \n", received);
}
/* Helpers defined further down in this file; declared here so that they are
 * visible before their first use. */
static inline int write_stack_msg(const int sig, siginfo_t *info,
                                  void *ptr, const char *logfile_suffix);
static inline void bugreport_def_return(const int sig);

/**
 * SA_SIGINFO handler that dispatches on the signal number.
 *
 * - SIGSEGV/SIGABRT/SIGFPE/SIGILL/SIGBUS: write the stack report (guarded by
 *   the `first` counter), then restore the default action and re-send the
 *   signal to this process.
 * - SIGTERM/SIGINT/SIGQUIT: hand over to bugreport_def_return().
 * - SIGUSR1/SIGUSR2: write a stack report and return.
 *
 * @param sig   signal being handled
 * @param info  siginfo_t supplied by the kernel
 * @param ptr   ucontext_t pointer passed as void *
 */
inline static void signal_core_bugreport(const int sig, siginfo_t *info, void *ptr)
{
    switch (sig) {
    case SIGSEGV:
    case SIGABRT:
    case SIGFPE:
    case SIGILL:
    case SIGBUS:
        /* Swap in the plain logging handler while the report is written. */
        signal(sig, signal_func_def);

        /* NOTE(review): atomic_add() is not defined in this listing; the
         * comparison assumes it returns the updated value - confirm. */
        if (atomic_add(first, 1) == 1) {
            write_stack_msg(sig, info, ptr, "txt");
        }

        /* Back to the default action, then re-send the signal so that the
         * core file is still generated. */
        signal(sig, SIG_DFL);
        kill(getpid(), sig);
        break;

    case SIGTERM:
    case SIGINT:
    case SIGQUIT:
        bugreport_def_return(sig);
        break;

    case SIGUSR1:
        write_stack_msg(sig, info, ptr, "sigusr1");
        break;

    case SIGUSR2:
        write_stack_msg(sig, info, ptr, "sigusr2");
        break;

    default:
        break;
    }
}

/**
 * Log a "normal exit" line to stderr and terminate the process with status 0.
 *
 * @param sig  signal that triggered the exit; when it is SIGTERM the default
 *             SIGTERM action is restored before exiting, as
 *             bugreport_def_term() does.
 */
static inline void bugreport_def_return(const int sig)
{
    /* fprintf, not snprintf: stderr is a FILE *, not a character buffer. */
    fprintf(stderr, " normally exit , pid:%d, sig:%d\n", (int)getpid(), sig);

    if (SIGTERM == sig) {
        signal(SIGTERM, SIG_DFL);
    }
    exit(0);
}

static inline void bugreport_save_task(int fd)
{
#define TASK_CMD "ps -aux | grep %s | grep -v grep"
FILE *stream;
char tmp_buf[1024] = {0};

snprintf(tmp\_buf, 1024, TASK\_CMD, bugreport\_process);  
stream = popen(tmp\_buf,  "r");  
if(stream < 0) {  
    return;  
}  
fprintf(fd, "ps -aux res:\\n");  
while (fgets(tmp\_buf, sizeof(tmp\_buf), stream)) {  
    fprintf(fd, "%s", tmp\_buf);  
}  
pclose(stream);  

}

static inline int write_stack_msg(const int sig, siginfo_t * info,
void * ptr,const char* logfile_suffix)
{
static const char * si_codes[3] = {"", "SEGV_MAPERR", "SEGV_ACCERR"};

size\_t i = 0;  
ucontext\_t \* ucontext = (ucontext\_t \*)ptr;

unsigned long stack\_start = 0;  
unsigned long stack\_end = 0;

snprintf(logpath, "xxxxxxxxxxxxxxxx", bugreport\_logpath);

foreach\_stack\_rang(gettid(), "/proc/getpid()/maps",&stack\_start, &stack\_end);

umask(0);

if (0 != mkdir(bugreport\_logpath, 0755))  
{  
}

snprintf(logfile, xxxx, "%s/%s\_time().txt", bugreport\_logpath, bugreport\_process\_name);

int fd = open(logfile, O\_RDWR | O\_CREAT | O\_APPEND, 0777);

if (-1 == fd)  
{  
    printf(stderr,"%s\\n", errmsg);  
    return 0;  
}

time\_t now;  
now = time(0);  
snprintf(stackinfo,""xxxxx""  
         "time:%ld sig:%d{%s} pid:%d-xxxx--tpid:%ld\\n-----xxx-----\\n",  
         now,  
         sig,  
         bugreport\_signals\[sig\],  
        getpid(),  
        gettid());  
write(fd, stackinfo, strlen(stackinfo));

int f = 0;  
Dl\_info dl\_info;  
void \*\* bp = 0;  
void \* ip = 0;

if (info->si\_code >= 0 && info->si\_code < 3) {  
    snprintf(stackinfo, xxxxxx, "Segmentation Fault!\\n"  
         "info.si\_signo = %d\\n"  
         "info.si\_errno = %d\\n"  
         "info.si\_code = %d (%s)\\n"  
         "info.si\_pid = %d\\n"  
         "info.si\_addr = %p\\n",  
         sig,  
         info->si\_errno,  
         info->si\_code,  
         si\_codes\[info->si\_code\],  
         info->si\_pid,  
         info->si\_addr  
        );  
} else {  
    snprintf(stackinfo,xxxxx, "Segmentation Fault!\\n"  
         "info.si\_signo = %d\\n"  
         "info.si\_errno = %d\\n"  
         "info.si\_code = %d\\n"  
         "info.si\_pid = %d\\n"  
         "info.si\_addr = %p\\n",  
         sig,  
         info->si\_errno,  
         info->si\_code,  
         info->si\_pid,  
         info->si\_addr  
        );  
}  
write(fd, stackinfo, strlen(stackinfo));

ip = (void \*)ucontext->uc\_mcontext.arm\_pc;  
bp = (void \*\*)ucontext->uc\_mcontext.arm\_fp;

write(fd, "REG:\\n", strlen("REG:\\n"));

for (i = 0; i < sizeof(ucontext->uc\_mcontext)/sizeof(unsigned long); i++) {  
    fprintf(fd, "\\t%s: 0x%08lx", rname\_index\[i\],  
            ((unsigned long\*)&ucontext->uc\_mcontext)\[i\]);  
    if (i % 4 == 3)  
        fprintf(fd, "\\n");  
}

write(fd, "\\nStack trace:\\n\\n", strlen("Stack trace:\\n\\n"));

while (bp && ip)  
{  
    if (!dladdr(ip, &dl\_info))  
    {  
        bugreporteak;  
    }

    const char \* symname = dl\_info.dli\_sname;

    fprintf(fd, "stack #%02d: bp:%p %s \[%p->%p\] <%s+%ld>\\n",  
             ++f,bp,  
             dl\_info.dli\_fname,  
             ip,  
             (void\*)((intptr\_t)ip - (intptr\_t)dl\_info.dli\_fbase),  
             symname,  
             (intptr\_t)ip - (intptr\_t)dl\_info.dli\_saddr  
            );

    if( !((unsigned long)bp > stack\_start && (unsigned long)bp < stack\_end) )  
    {  
        bugreporteak;  
    }

    ip = bp\[2\];  
    bp = (void \*\*)bp\[0\];  
}

write(fd, "End of stack trace\\n", strlen("End of stack trace\\n"));

save\_stacktrace(fd, ucontext->uc\_mcontext.arm\_sp);  
save\_proc(fd,"maps");  
save\_proc(fd,"status");  
bugreport\_save\_task(fd);  
bugreport\_save\_svninfo(fd);

close(fd);

return 0;  

}

/**
 * Register the bug-report handler for the fatal and termination signals and
 * ignore SIGHUP and SIGPIPE.
 *
 * NOTE(review): bugreport_signal_cb is not defined in this listing; it
 * presumably installs the handler the way bugreportsignal() does - confirm.
 *
 * @return always 0
 */
int signal_bugreport_setup(void)
{
    bugreport_signal_cb(SIGSEGV);
    bugreport_signal_cb(SIGABRT);
    bugreport_signal_cb(SIGFPE);

    bugreport_signal_cb(SIGINT);

    bugreport_signal_cb(SIGBUS);
    bugreport_signal_cb(SIGILL);
    bugreport_signal_cb(SIGQUIT);
    bugreport_signal_cb(SIGTERM);

    signal(SIGHUP, SIG_IGN);
    signal(SIGPIPE, SIG_IGN);
    /* SIGCHLD is deliberately not ignored: ignoring it would make the return
     * value of system() unusable. */
    return 0;
}

/**
 * Print an "Exit Normally" line to stdout and terminate with status 0.
 *
 * @param sig  signal that triggered the exit; for SIGTERM the default action
 *             is restored before exiting.
 */
static inline void bugreport_def_term(const int sig)
{
    char log[256];

    snprintf(log, sizeof(log), "Exit Normally, pid:%d, sig:%d\n", (int)getpid(), sig);
    printf("%s", log);

    if (SIGTERM == sig)
    {
        signal(SIGTERM, SIG_DFL);
    }
    exit(0);
}

/**
 * Print the current call stack to stdout using backtrace() and
 * backtrace_symbols(), then exit with status 0.
 *
 * @param Signal  signal number, printed together with the frame count
 */
void dump_trace(int Signal)
{
    enum { len = 200 };   /* maximum number of frames captured */
    void *buffer[len];

    printf("dump_trace\n");

    int nptrs = backtrace(buffer, len);

    printf("backtrace\n");

    /* backtrace_symbols() returns one allocated array; a single free() releases it. */
    char **buffer_array = backtrace_symbols(buffer, nptrs);

    printf("sig:%d nptrs:%d\n", Signal, nptrs);

    if (buffer_array) {
        for (int i = 0; i < nptrs; ++i) {
            printf("frame=%d||trace_back=%s||\n", i, buffer_array[i]);
        }
        free(buffer_array);
    }

    exit(0);
}

https://www.man7.org/linux/man-pages/man2/sigaction.2.html

The siginfo_t argument to a SA_SIGINFO handler
When the SA_SIGINFO flag is specified in act.sa_flags, the signal
handler address is passed via the act.sa_sigaction field. This han‐
dler takes three arguments, as follows:

       void
       handler(int sig, siginfo_t *info, void *ucontext)
       {
           ...
       }

   These three arguments are as follows

   sig    The number of the signal that caused invocation of the han‐  
          dler.

   info   A pointer to a siginfo_t, which is a structure containing fur‐
          ther information about the signal, as described below.

   ucontext
          This is a pointer to a ucontext_t structure, cast to void *.
          The structure pointed to by this field contains signal context
          information that was saved on the user-space stack by the ker‐
          nel; for details, see sigreturn(2).  Further information about
          the ucontext_t structure can be found in getcontext(3).  Com‐
          monly, the handler function doesn't make any use of the third
          argument.

   The siginfo_t data type is a structure with the following fields:

       siginfo_t {
           int      si_signo;     /* Signal number */
           int      si_errno;     /* An errno value */
           int      si_code;      /* Signal code */
           int      si_trapno;    /* Trap number that caused
                                     hardware-generated signal
                                     (unused on most architectures) */
           pid_t    si_pid;       /* Sending process ID */
           uid_t    si_uid;       /* Real user ID of sending process */
           int      si_status;    /* Exit value or signal */
           clock_t  si_utime;     /* User time consumed */
           clock_t  si_stime;     /* System time consumed */
           sigval_t si_value;     /* Signal value */
           int      si_int;       /* POSIX.1b signal */
           void    *si_ptr;       /* POSIX.1b signal */
           int      si_overrun;   /* Timer overrun count;
                                     POSIX.1b timers */
           int      si_timerid;   /* Timer ID; POSIX.1b timers */
           void    *si_addr;      /* Memory location which caused fault */
           long     si_band;      /* Band event (was int in
                                     glibc 2.3.2 and earlier) */
           int      si_fd;        /* File descriptor */
           short    si_addr_lsb;  /* Least significant bit of address
                                     (since Linux 2.6.32) */
           void    *si_lower;     /* Lower bound when address violation
                                     occurred (since Linux 3.19) */
           void    *si_upper;     /* Upper bound when address violation
                                     occurred (since Linux 3.19) */
           int      si_pkey;      /* Protection key on PTE that caused
                                     fault (since Linux 4.6) */
           void    *si_call_addr; /* Address of system call instruction
                                     (since Linux 3.5) */
           int      si_syscall;   /* Number of attempted system call
                                     (since Linux 3.5) */
           unsigned int si_arch;  /* Architecture of attempted system call
                                     (since Linux 3.5) */
       }

   si_signo, si_errno and si_code are defined for all signals.
   (si_errno is generally unused on Linux.)  The rest of the struct may
   be a union, so that one should read only the fields that are meaning‐
   ful for the given signal:

#include <execinfo.h>

int backtrace(void **buffer, int size);
char **backtrace_symbols(void *const *buffer, int size);
void backtrace_symbols_fd(void *const *buffer, int size, int fd);

backtrace函数通过指针数组buffer返回调用程序的回溯信息,也就是所谓的函数调用栈。buffer数组中的元素是void*类型,也就是栈中保存的返回地址。

size参数指定buffer中可以保存的地址的最大个数。如果实际的回溯信息大于size,则只返回最近的size个地址。

backtrace函数返回buffer中保存的地址个数,返回值不会大于size。如果返回值小于size,则说明所有的回溯信息都已经返回了,如果等于size,则有可能被截断了。

backtrace函数在buffer数组中返回的都是一些虚拟地址,不适于分析。backtrace_symbols函数可以将backtrace返回的buffer中的地址,根据符号表中的信息,转换为字符串(函数名+偏移地址)。size参数指明了buffer中的地址个数。

backtrace_symbols返回字符串数组的首地址,该字符串是在backtrace_symbols中通过malloc分配的,因此,调用者必须使用free释放内存。如果发生了错误,则backtrace_symbols返回NULL

backtrace_symbols_fd类似于backtrace_symbols,只不过它是把字符串信息写到文件描述符fd所表示的文件中。backtrace_symbols_fd不会调用malloc函数

来自网上转载的
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <execinfo.h>
#include <ucontext.h>

#define BTSIZE 100

/**
 * Return the instruction pointer stored in a signal ucontext.
 *
 * @param uc  context passed as the third argument of an SA_SIGINFO handler
 * @return the saved program counter, or NULL on platforms/architectures
 *         this function has no case for
 */
static void *getMcontextEip(ucontext_t *uc) {
#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
    /* OSX < 10.6 */
    #if defined(__x86_64__)
    return (void*) uc->uc_mcontext->__ss.__rip;
    #elif defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__eip;
    #else
    return (void*) uc->uc_mcontext->__ss.__srr0;
    #endif
#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
    /* OSX >= 10.6 */
    #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
    return (void*) uc->uc_mcontext->__ss.__rip;
    #else
    return (void*) uc->uc_mcontext->__ss.__eip;
    #endif
#elif defined(__linux__)
    /* Linux */
    #if defined(__i386__)
    return (void*) uc->uc_mcontext.gregs[14]; /* Linux 32 */
    #elif defined(__X86_64__) || defined(__x86_64__)
    return (void*) uc->uc_mcontext.gregs[16]; /* Linux 64 */
    #elif defined(__ia64__) /* Linux IA64 */
    return (void*) uc->uc_mcontext.sc_ip;
    #else
    /* No case for this architecture: report "unknown" instead of falling
     * off the end of a non-void function. */
    (void) uc;
    return NULL;
    #endif
#else
    (void) uc;
    return NULL;
#endif
}

/**
 * SA_SIGINFO handler that prints signal details and, for SIGSEGV, a symbolic
 * backtrace to stdout before exiting with status 0.
 *
 * @param sig     signal number
 * @param info    siginfo_t supplied by the kernel
 * @param secret  ucontext_t pointer passed as void *
 *
 * For any signal other than SIGSEGV the handler only prints the three
 * informational lines and returns.
 */
static void sig_handler(int sig, siginfo_t *info, void *secret)
{
    ucontext_t *uc = (ucontext_t*) secret;

    void *buffer[BTSIZE];
    char **strings;
    int nptrs = 0;

    printf("in sig_handler\n");
    printf("sig is %d, SIGSEGV is %d\n", sig, SIGSEGV);
    printf("info.si_signo is %d, info.si_addr is %p\n",
        info->si_signo, info->si_addr);

    if (sig == SIGSEGV)
    {
        nptrs = backtrace(buffer, BTSIZE);
        printf("backtrace() returned %d addresses\n", nptrs);

        /* Put the saved instruction pointer into slot 1, but only when
         * backtrace() actually filled that slot. */
        void *fault_pc = getMcontextEip(uc);
        if (fault_pc != NULL && nptrs > 1)
            buffer[1] = fault_pc;

        strings = backtrace_symbols(buffer, nptrs);
        if (strings == NULL) {
            perror("backtrace_symbols");
            exit(EXIT_FAILURE);
        }

        printf("backtrace: \n");
        int j;
        for (j = 0; j < nptrs; j++)
        {
            printf("[%d]%s\n", j, strings[j]);
        }
        free(strings);

        exit(0);
    }
}

/*
 * Print a kernel backtrace for a task.
 *
 * With CONFIG_ARM_UNWIND the work is delegated to unwind_backtrace().
 * Otherwise the frame pointer is taken from regs, from the saved context of
 * tsk, or from the current fp register, validated, and passed to
 * c_backtrace().
 */
#ifdef CONFIG_ARM_UNWIND
static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
{
    unwind_backtrace(regs, tsk);
}
#else
static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
{
    unsigned int fp, mode;
    int ok = 1;

    printk("Backtrace: ");

    if (!tsk)
        tsk = current;

    if (regs) {
        fp = frame_pointer(regs);
        mode = processor_mode(regs);
    } else if (tsk != current) {
        fp = thread_saved_fp(tsk);
        mode = 0x10;
    } else {
        /* Read the live frame pointer of the running code. */
        asm("mov %0, fp" : "=r" (fp) : : "cc");
        mode = 0x10;
    }

    if (!fp) {
        pr_cont("no frame pointer");
        ok = 0;
    } else if (verify_stack(fp)) {
        pr_cont("invalid frame pointer 0x%08x", fp);
        ok = 0;
    } else if (fp < (unsigned long)end_of_stack(tsk))
        /* Only reported; ok stays 1, so the backtrace is still attempted. */
        pr_cont("frame pointer underflow");
    pr_cont("\n");

    if (ok)
        c_backtrace(fp, mode);
}
#endif

coredump文件本身主要的格式也是ELF格式,因此,我们可以通过readelf命令进行判断。

get_signal 这里会判断信号是不是要触发core dump,如果是,则调用do_coredump

最后会调用elf_core_dump。下面以内核函数elf_core_dump为入口,分析core文件是怎么生成的: