在 Red Hat Enterprise Linux 7 中,内核在 netlink_compare 中崩溃或出现软锁定或挂起
Issue
- 内核在 netlink_compare 中崩溃,并带有类似如下的回溯追踪信息:
crash> bt
PID: 12924 TASK: ffff8801c4348b80 CPU: 12 COMMAND: "crond"
#0 [ffff88070d253928] machine_kexec at ffffffff81051e9b
#1 [ffff88070d253988] crash_kexec at ffffffff810f27a2
#2 [ffff88070d253a58] oops_end at ffffffff8163f448
#3 [ffff88070d253a80] die at ffffffff8101859b
#4 [ffff88070d253ab0] do_general_protection at ffffffff8163ed3e
#5 [ffff88070d253ae0] general_protection at ffffffff8163e5e8
[exception RIP: netlink_compare+11]
RIP: ffffffff8155654b RSP: ffff88070d253b90 RFLAGS: 00010246
RAX: 0000000000000000 RBX: ff04000000000006 RCX: 000000002ce2b4ee
RDX: 0000000000000000 RSI: ffff88070d253be0 RDI: ff03fffffffffb7e
RBP: ffff88070d253bc8 R8: ffff88070d253bdc R9: 752f223d65786520
R10: 2f6e6962732f7273 R11: 682022646e6f7263 R12: ffff8808140d2678
R13: ffff88070d253be0 R14: ffffffff81556540 R15: ffff8808091a6c00
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#6 [ffff88070d253b98] rhashtable_lookup_compare at ffffffff813086f0
#7 [ffff88070d253bd0] netlink_lookup at ffffffff81556e7e
#8 [ffff88070d253c00] netlink_getsockbyportid at ffffffff8155821f
#9 [ffff88070d253c18] netlink_unicast at ffffffff8155a479
#10 [ffff88070d253c60] netlink_sendmsg at ffffffff8155a8b0
#11 [ffff88070d253cf8] sock_sendmsg at ffffffff815117a0
#12 [ffff88070d253e58] SYSC_sendto at ffffffff81511d11
#13 [ffff88070d253f70] sys_sendto at ffffffff8151279e
#14 [ffff88070d253f80] system_call_fastpath at ffffffff81646b49
kernel: INFO: task crond:96675 blocked for more than 120 seconds.
kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kernel: crond D ffffffff81a2ad08 0 96675 3155 0x00000000
kernel: ffff8820283f3b30 0000000000000082 ffff88354e061700 ffff8820283f3fd8
kernel: ffff8820283f3fd8 ffff8820283f3fd8 ffff88354e061700 ffffffff81a2ad00
kernel: ffffffff81a2ad04 ffff88354e061700 00000000ffffffff ffffffff81a2ad08
kernel: Call Trace:
kernel: [<ffffffff8163d309>] schedule_preempt_disabled+0x29/0x70
kernel: [<ffffffff8163b005>] __mutex_lock_slowpath+0xc5/0x1c0
kernel: [<ffffffff8163a46f>] mutex_lock+0x1f/0x2f
kernel: [<ffffffff815574ae>] netlink_insert+0x4e/0x100
kernel: [<ffffffff81308470>] ? rhashtable_lookup_compare+0x30/0x90
kernel: [<ffffffff81557b30>] netlink_autobind.isra.37+0xc0/0x100
kernel: [<ffffffff8155a59a>] netlink_sendmsg+0x22a/0x770
kernel: [<ffffffff81189bba>] ? __dec_zone_page_state+0x2a/0x30
kernel: [<ffffffff815116a0>] sock_sendmsg+0xb0/0xf0
kernel: [<ffffffff81511c11>] SYSC_sendto+0x121/0x1c0
kernel: [<ffffffff8164271d>] ? __do_page_fault+0x16d/0x450
kernel: [<ffffffff81642a23>] ? do_page_fault+0x23/0x80
kernel: [<ffffffff811f1b10>] ? SyS_fcntl+0x4d0/0x5d0
kernel: [<ffffffff8151269e>] SyS_sendto+0xe/0x10
kernel: [<ffffffff81647209>] system_call_fastpath+0x16/0x1b
crash> bt
PID: 29345 TASK: ffff884081ac0000 CPU: 14 COMMAND: "crond"
#0 [ffff88097cb0f958] machine_kexec at ffffffff81051e9b
#1 [ffff88097cb0f9b8] crash_kexec at ffffffff810f27e2
#2 [ffff88097cb0fa88] oops_end at ffffffff8163f208
#3 [ffff88097cb0fab0] die at ffffffff8101859b
#4 [ffff88097cb0fae0] do_general_protection at ffffffff8163eafe
#5 [ffff88097cb0fb10] general_protection at ffffffff8163e3a8
[exception RIP: netlink_compare+11]
RIP: ffffffff81555f0b RSP: ffff88097cb0fbc8 RFLAGS: 00010246
RAX: 0000000000000000 RBX: 0033003600300038 RCX: 00000000c9851744
RDX: 00000000000072a1 RSI: ffff88097cb0fc18 RDI: 00330036002ffbb0
RBP: ffff88097cb0fc00 R8: ffff88097cb0fc14 R9: 000000000000000c
R10: 0000000000000000 R11: 0000000000000246 R12: ffff882278212678
R13: ffff88097cb0fc18 R14: ffffffff81555f00 R15: ffff882598f68fc0
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#6 [ffff88097cb0fbd0] rhashtable_lookup_compare at ffffffff813080a0
#7 [ffff88097cb0fc08] netlink_autobind at ffffffff815576ee
#8 [ffff88097cb0fc60] netlink_sendmsg at ffffffff8155a16a
#9 [ffff88097cb0fcf8] sock_sendmsg at ffffffff815112a0
#10 [ffff88097cb0fe58] SYSC_sendto at ffffffff81511811
#11 [ffff88097cb0ff70] sys_sendto at ffffffff8151229e
#12 [ffff88097cb0ff80] system_call_fastpath at ffffffff81646909
- 另一个示例:ppc64 架构上的内核在 .netlink_compare 中崩溃,并带有类似如下的回溯追踪信息:
crash> bt
PID: 58490 TASK: c000000fa85dc1f0 CPU: 22 COMMAND: "httpd"
#0 [c000000c327cae90] .crash_kexec at c000000000183db0
#1 [c000000c327cb090] .die at c000000000020888
#2 [c000000c327cb140] .bad_page_fault at c0000000000567b8
#3 [c000000c327cb1c0] handle_page_fault at c000000000009588
Data Access [300] exception frame:
R0: c0000000007efe4c R1: c000000c327cb4b0 R2: c0000000013823e8
R3: 00000198da5f7380 R4: c000000c327cb5c0 R5: 00000000000000ac
R6: 000000007f612171 R7: 00000000e6b1d61c R8: 000000004637f299
R9: c0000000007eac90 R10: 0000000000000000 R11: 0000000000000000
R12: 0000000028042828 R13: c000000007b3c600 R14: 0000000000000000
R15: 00003fff8cbfe1e0 R16: 00003fff8cbfe1d8 R17: 00003fff4c01e35e
R18: 00003fff4c01e468 R19: 0000000000000000 R20: 000001003575c480
R21: 0000000000000000 R22: 0000000000000001 R23: 7fffffffffffffff
R24: fffffffffffff000 R25: c00000000136ea40 R26: c00000000130fe00
R27: c000000fe3fca8e0 R28: c000000c327cb5c0 R29: c000000001370a40
R30: c000000fed184000 R31: 00000198da5f7800
NIP: c0000000007eac90 MSR: 8000000000009032 OR3: c0000000004affd8
CTR: c0000000007eac90 LR: c0000000004affdc XER: 0000000000000000
CCR: 0000000028042824 MQ: 0000000000000001 DAR: 00000198da5f7678
DSISR: 0000000040000000 Syscall Result: 0000000000000000
#4 [c000000c327cb4b0] .netlink_compare at c0000000007eac90
[Link Register] [c000000c327cb4b0] .rhashtable_lookup_compare at c0000000004affdc
#5 [c000000c327cb550] .__netlink_dump_start at c0000000007efe4c
#6 [c000000c327cb610] .rtnetlink_rcv_msg at c0000000007c46c8
#7 [c000000c327cb6f0] .netlink_rcv_skb at c0000000007f094c
#8 [c000000c327cb780] .rtnetlink_rcv at c0000000007c4514
#9 [c000000c327cb800] .netlink_unicast at c0000000007f050c
#10 [c000000c327cb900] .netlink_sendmsg at c0000000007f19e0
#11 [c000000c327cba40] .sock_sendmsg at c00000000077b2d0
#12 [c000000c327cbc00] .sys_sendto at c00000000078143c
#13 [c000000c327cbd80] .sys_socketcall at c000000000782a18
#14 [c000000c327cbe30] system_call at c00000000000a17c
System Call [c00] exception frame:
R0: 0000000000000066 R1: 00003fff8cbed750 R2: 00003fffa12e74f8
R3: 000000000000000b R4: 00003fff8cbed800 R5: 0000000000000014
R6: 0000000000000000 R7: 0000000000000002 R8: 0000000000000000
R9: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000
R12: 0000000000000000 R13: 00003fff8cc06900 R14: 0000000000000000
R15: 00003fff8cbfe1e0 R16: 00003fff8cbfe1d8 R17: 00003fff4c01e35e
R18: 00003fff4c01e468 R19: 0000000000000000 R20: 000001003575c480
R21: 00003fff8cbfdf91 R22: 0000000000000020 R23: 00003fffa0fc6cd8
R24: 00003fff8cbed840 R25: 00003fff8cbfd8c0 R26: 00003fff8cbfdbc0
R27: 000000000000e42f R28: 00003fff8cbff910 R29: 00003fff8cbfd890
R30: 000000000000000f R31: 00003fff8cbfd7e0
NIP: 00003fffa122a850 MSR: 800000010000d032 OR3: 000000000000000b
CTR: 0000000000000000 LR: 00003fffa122a83c XER: 0000000000000000
CCR: 0000000044042848 MQ: 0000000000000001 DAR: 00003fff0c0e4c18
DSISR: 0000000042000000 Syscall Result: 0000000000000000
- 升级到 3.10.0-327.22.2.el7(RHEL 7.2.z)内核后,/var/log/messages 中出现以下信息:
kernel: audit: netlink_unicast sending to audit_pid=1234 returned error: -111
kernel: audit: audit_lost=1 audit_rate_limit=0 audit_backlog_limit=320
kernel: audit: audit_pid=1234 reset
之后,审计日志开始写入 /var/log/messages 文件,而不是写入 audit.log 文件。对 auditd 进程执行 strace 显示,该进程仍然存活,并停留在 epoll_wait() 调用中。因此,问题的根本原因似乎与 netlink 连接相关。
Environment
- Red Hat Enterprise Linux 7.2
- 早于 kernel-3.10.0-327.54.1.el7 的内核版本
- 早于
Subscriber exclusive content
A Red Hat subscription provides unlimited access to our knowledgebase, tools, and much more.