Why the system crashed with message "WARNING: at drivers/net/qlcnic/qlcnic_init.c:1477 qlcnic_process_rxbuf+xxx [qlcnic]" ?
Environment
- Red Hat Enterprise Linux 6.4
- seen on kernels 2.6.32-358.0.1.el6 and 2.6.32-358.18.1.el6
- QLogic NIC
- Ethernet controller: QLogic Corp. cLOM8214 1/10GbE Controller
- NIC firmware version: 4.12.11
Issue
- System crashed after following traces:
WARNING: at drivers/net/qlcnic/qlcnic_init.c:1477 qlcnic_process_rxbuf+0xdb/0xf0 [qlcnic]() (Tainted: G W --------------- )
Hardware name: ProLiant DL580 G7
Modules linked in: ipmi_devintf nfs lockd fscache auth_rpcgss nfs_acl sunrpc cpufreq_ondemand freq_table pcc_cpufreq ipv6 power_meter qlcnic netxen_nic sg microcode serio_raw iTCO_wdt iTCO_vendor_support hpilo hpwdt i7core_edac edac_core shpchp ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix hpsa radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
Pid: 4, comm: ksoftirqd/0 Tainted: G W --------------- 2.6.32-358.0.1.el6.x86_64 #1
Call Trace:
<IRQ> [<ffffffff8106e2e7>] ? warn_slowpath_common+0x87/0xc0
[<ffffffff8106e33a>] ? warn_slowpath_null+0x1a/0x20
[<ffffffffa02d2dbb>] ? qlcnic_process_rxbuf+0xdb/0xf0 [qlcnic]
[<ffffffffa02d5622>] ? qlcnic_process_rcv_ring+0x112/0x8b0 [qlcnic]
[<ffffffffa02ccba3>] ? qlcnic_rx_poll+0x33/0x90 [qlcnic]
[<ffffffff810816ec>] ? run_timer_softirq+0x4c/0x340
[<ffffffff8144ce23>] ? net_rx_action+0x103/0x2f0
[<ffffffff81076fb1>] ? __do_softirq+0xc1/0x1e0
[<ffffffff810e1720>] ? handle_IRQ_event+0x60/0x170
[<ffffffff8100c1cc>] ? call_softirq+0x1c/0x30
[<ffffffff8100de05>] ? do_softirq+0x65/0xa0
[<ffffffff81076d95>] ? irq_exit+0x85/0x90
[<ffffffff81516d75>] ? do_IRQ+0x75/0xf0
[<ffffffff8100b9d3>] ? ret_from_intr+0x0/0x11
<EOI> [<ffffffff8105ab8f>] ? finish_task_switch+0x4f/0xe0
[<ffffffff8150d7c0>] ? thread_return+0x4e/0x76e
[<ffffffff8100b9ce>] ? common_interrupt+0xe/0x13
[<ffffffff81076b65>] ? ksoftirqd+0xd5/0x110
[<ffffffff81076a90>] ? ksoftirqd+0x0/0x110
[<ffffffff81096936>] ? kthread+0x96/0xa0
[<ffffffff8100c0ca>] ? child_rip+0xa/0x20
[<ffffffff810968a0>] ? kthread+0x0/0xa0
[<ffffffff8100c0c0>] ? child_rip+0x0/0x20
Resolution
- It has found to be an hardware problem and changing the NICs has resolved the issue.
Diagnostic Steps
- vmcore log
UPTIME: 6 days, 23:50:32
LOAD AVERAGE: 34.03, 16.26, 6.41 <<< Seems like spike
#4 [ffff880045603a80] __bad_area_nosemaphore at ffffffff81046e85
#5 [ffff880045603ad0] bad_area_nosemaphore at ffffffff81046f53
#6 [ffff880045603ae0] __do_page_fault at ffffffff810476b1
#7 [ffff880045603c00] do_page_fault at ffffffff815131fe
#8 [ffff880045603c30] page_fault at ffffffff815105b5
[exception RIP: _spin_lock+14]
RIP: ffffffff8151009e RSP: ffff880045603ce8 RFLAGS: 00010002
RAX: 0000000000010000 RBX: 0000000100000011 RCX: 0000000000000001
RDX: 0000000000000080 RSI: ffff88013601c8c0 RDI: 0000000100000011
RBP: ffff880045603ce8 R8: 0000000000000018 R9: 0000000000000001
R10: 0000000000000002 R11: 0000000000000053 R12: 0000000000000282
R13: ffff88013601c8c0 R14: ffff881034070440 R15: 0000000100000001
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#9 [ffff880045603cf0] kmem_cache_free at ffffffff81168c2a
#10 [ffff880045603d60] mempool_free_slab at ffffffff8111c557
#11 [ffff880045603d70] mempool_free at ffffffff8111c6a5
#12 [ffff880045603d90] scsi_sg_free at ffffffff81379cbc
#13 [ffff880045603da0] __sg_free_table at ffffffff812860d0
#14 [ffff880045603df0] __scsi_release_buffers at ffffffff81379e34
0xffffffff81168c19 <kmem_cache_free+489>: je 0xffffffff81168c95 <kmem_cache_free+613>
/usr/src/debug/kernel-2.6.32-358.0.1.el6/linux-2.6.32-358.0.1.el6.x86_64/mm/slab.c: 1081
0xffffffff81168c1b <kmem_cache_free+491>: lea 0x10(%r15),%rbx
0xffffffff81168c1f <kmem_cache_free+495>: mov %edx,-0x50(%rbp)
0xffffffff81168c22 <kmem_cache_free+498>: mov %rbx,%rdi
0xffffffff81168c25 <kmem_cache_free+501>: callq 0xffffffff81510090 <_spin_lock>
/usr/src/debug/kernel-2.6.32-358.0.1.el6/linux-2.6.32-358.0.1.el6.x86_64/mm/slab.c: 1082
0xffffffff81168c2a <kmem_cache_free+506>: mov (%r15),%eax
1077 l3 = cachep->nodelists[node];
1078 STATS_INC_NODEFREES(cachep);
1079 if (l3->alien && l3->alien[nodeid]) {
1080 alien = l3->alien[nodeid];
1081 spin_lock(&alien->lock); <<< Seems waiting here
crash> p -x ((struct kmem_cache*)0xffff881034070440)->nodelists[0]
$1 = (struct kmem_list3 *) 0xffff88083394ac40 <<< Which seems to be pointing to the Corrupted slab part can see above from the bad slab list
crash> struct kmem_list3 0xffff88083394ac40
struct kmem_list3 {
slabs_partial = {
next = 0xffff8808339ac000,
prev = 0xffff8808339ac000
},
slabs_full = {
next = 0xffff880058072040,
prev = 0xffff88013601c040 <<--------------
},
slabs_free = {
next = 0xffff88083394ac60,
prev = 0xffff88083394ac60
},
free_objects = 0x1,
free_limit = 0x4fb,
colour_next = 0x0,
list_lock = {
raw_lock = {
slock = 0xb775b774
}
},
shared = 0xffff8808339ab000,
alien = 0xffff8808339a0740,
next_reap = 0x123ff4f5d,
free_touched = 0x1
}
crash> struct kmem_list3 0xffff88083394ac40
struct kmem_list3 {
alien = 0xffff8808339a0740,
crash> struct array_cache 0xffff8808339a0740
struct array_cache {
avail = 0x0,
limit = 0x0,
batchcount = 0x3394adc0,
touched = 0xffff8808,
lock = {
raw_lock = {
slock = 0x3394ad40 <<<---
}
},
entry = 0xffff8808339a0758
- Appears to be a case of slab corruption:
kmem: sgpool-8: full list: slab: ffff88013601c040 bad prev pointer: d0ebd1dd6a1f9463
kmem: sgpool-8: full list: slab: ffff88013601c040 bad inuse counter: 2151939875
kmem: sgpool-8: full list: slab: ffff88013601c040 bad inuse counter: 2151939875
kmem: sgpool-8: full list: slab: ffff88013601c040 bad s_mem pointer: 2b8ebc150a080101
- Or can see some of know issue log related to NFS
log | grep "Lock reclaim failed"
nfs4_reclaim_open_state: Lock reclaim failed!
nfs4_reclaim_open_state: Lock reclaim failed!
nfs4_reclaim_open_state: Lock reclaim failed!
nfs4_reclaim_open_state: Lock reclaim failed!
# kernel >= 2.6.32-358.el6 and kernel < 2.6.32-358.6.1.el6 Could see the issue here
RELEASE: 2.6.32-358.0.1.el6.x86_64 <<< kernel is in use
Which is a known issue related to NFS
- kernel was upgraded and system crashed again:
general protection fault: 0000 [#1] SMP
last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:01:03.0/irq
CPU 6
Modules linked in: bridge stp llc ipmi_devintf nfs lockd fscache auth_rpcgss nfs_acl sunrpc cpufreq_ondemand freq_table pcc_cpufreq ipv6 power_meter qlcnic netxen_nic microcode sg serio_raw iTCO_wdt iTCO_vendor_support hpwdt hpilo i7core_edac edac_core shpchp ext4 jbd2 mbcache sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix hpsa radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
Pid: 33831, comm: bptm Tainted: G W --------------- 2.6.32-358.18.1.el6.x86_64 #1 HP ProLiant DL580 G7
RIP: 0010:[<ffffffffa02d4157>] [<ffffffffa02d4157>] qlcnic_alloc_rx_skb+0x127/0x1b0 [qlcnic]
RSP: 0018:ffff880045783da8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff880832d47680 RCX: 0000000000002da0
RDX: dead0000001000f0 RSI: 0000000047bd8022 RDI: ffff880024060000
RBP: ffff880045783df8 R08: 0000000000000002 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8808339a3000
R13: ffff88083272b6e0 R14: ffff8808339a3090 R15: ffff880047bd8022
FS: 00007f817bba1720(0000) GS:ffff880045780000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000001c292b8 CR3: 0000000139446000 CR4: 00000000000007e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process bptm (pid: 33831, threadinfo ffff880109c8a000, task ffff880109b2aaa0)
Stack:
ffff8808339a3090 00002da00000112c dead0000001000f0 0000000047bd8022
<d> ffff880045783df8 ffff881033693b00 dead000000100100 ffff8810324a3d90
<d> 0000000000000009 ffff88083272b6e0 ffff880045783e98 ffffffffa02d6870
Call Trace:
<IRQ>
[<ffffffffa02d6870>] qlcnic_process_rcv_ring+0x350/0x8f0 [qlcnic]
[<ffffffffa02cdba3>] qlcnic_rx_poll+0x33/0x90 [qlcnic]
[<ffffffff810817ec>] ? run_timer_softirq+0x4c/0x340
[<ffffffff8144d453>] net_rx_action+0x103/0x2f0
[<ffffffff810770b1>] __do_softirq+0xc1/0x1e0
[<ffffffff8100c1cc>] call_softirq+0x1c/0x30
<EOI>
[<ffffffff8100de05>] ? do_softirq+0x65/0xa0
[<ffffffff81076f3a>] local_bh_enable+0x9a/0xb0
[<ffffffff8148c678>] tcp_prequeue_process+0x88/0xa0
[<ffffffff8148fd7c>] tcp_recvmsg+0xacc/0xe80
[<ffffffff8100b9ce>] ? common_interrupt+0xe/0x13
[<ffffffff8100b9ce>] ? common_interrupt+0xe/0x13
[<ffffffff814b001a>] inet_recvmsg+0x5a/0x90
[<ffffffff81437b6d>] ? sock_recvmsg+0x12d/0x160
[<ffffffff81437b73>] sock_recvmsg+0x133/0x160
[<ffffffff81096da0>] ? autoremove_wake_function+0x0/0x40
[<ffffffff8119887c>] ? core_sys_select+0x1ec/0x2c0
[<ffffffff8100b9ce>] ? common_interrupt+0xe/0x13
[<ffffffff81182651>] ? fget_light+0x21/0x90
[<ffffffff81437cee>] sys_recvfrom+0xee/0x180
[<ffffffff810097cc>] ? __switch_to+0x1ac/0x320
[<ffffffff810dc7a6>] ? audit_syscall_entry+0x46/0x200
[<ffffffff810dc937>] ? audit_syscall_entry+0x1d7/0x200
[<ffffffff8100b072>] system_call_fastpath+0x16/0x1b
Code: 85 f6 74 68 49 8b 84 24 50 02 00 00 48 85 c0 74 5b 48 8b 40 60 48 85 c0 74 3a 48 8b 75 c8 4c 89 f7 ff d0 85 c0 75 65 48 8b 55 c0 <48> 89 5a 08 48 8b 45 c8 48 89 42 20 31 c0 48 8b 5d d8 4c 8b 65
RIP [<ffffffffa02d4157>] qlcnic_alloc_rx_skb+0x127/0x1b0 [qlcnic]
RSP <ffff880045783da8>
crash> bt
PID: 33831 TASK: ffff880109b2aaa0 CPU: 6 COMMAND: "bptm"
#0 [ffff880045783b30] machine_kexec at ffffffff81035d6b
#1 [ffff880045783b90] crash_kexec at ffffffff810c0e22
#2 [ffff880045783c60] oops_end at ffffffff81511c20
#3 [ffff880045783c90] die at ffffffff8100f19b
#4 [ffff880045783cc0] do_general_protection at ffffffff81511722
#5 [ffff880045783cf0] general_protection at ffffffff81510ef5
[exception RIP: qlcnic_alloc_rx_skb+295]
RIP: ffffffffa02d4157 RSP: ffff880045783da8 RFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff880832d47680 RCX: 0000000000002da0
RDX: dead0000001000f0 RSI: 0000000047bd8022 RDI: ffff880024060000
RBP: ffff880045783df8 R8: 0000000000000002 R9: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8808339a3000
R13: ffff88083272b6e0 R14: ffff8808339a3090 R15: ffff880047bd8022
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#6 [ffff880045783e00] qlcnic_process_rcv_ring at ffffffffa02d6870 [qlcnic]
#7 [ffff880045783ea0] qlcnic_rx_poll at ffffffffa02cdba3 [qlcnic]
#8 [ffff880045783ee0] net_rx_action at ffffffff8144d453
#9 [ffff880045783f40] __do_softirq at ffffffff810770b1
#10 [ffff880045783fb0] call_softirq at ffffffff8100c1cc
--- <IRQ stack> ---
#11 [ffff880109c8bae0] do_softirq at ffffffff8100de05
#12 [ffff880109c8bb00] local_bh_enable at ffffffff81076f3a
#13 [ffff880109c8bb30] tcp_recvmsg at ffffffff8148fd7c
#14 [ffff880109c8bc40] inet_recvmsg at ffffffff814b001a
#15 [ffff880109c8bc80] sock_recvmsg at ffffffff81437b73
#16 [ffff880109c8be40] sys_recvfrom at ffffffff81437cee
#17 [ffff880109c8bf80] system_call_fastpath at ffffffff8100b072
RIP: 000000329d6e96b2 RSP: 00007fff361e6600 RFLAGS: 00000213
RAX: 000000000000002d RBX: ffffffff8100b072 RCX: 000000329d6e96b2
RDX: 000000000000df00 RSI: 00007f817510f100 RDI: 0000000000000000
RBP: 0000000000000b50 R8: 0000000000000000 R9: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f817bba16a8
R13: 000000000000df00 R14: 00007f817510f100 R15: 0000000000000000
ORIG_RAX: 000000000000002d CS: 0033 SS: 002b
/usr/src/debug/kernel-2.6.32-358.18.1.el6/linux-2.6.32-358.18.1.el6.x86_64/arch/x86/include/asm/dma-mapping.h: 49
0xffffffffa02d4146 <qlcnic_alloc_rx_skb+278>: mov -0x38(%rbp),%rsi
0xffffffffa02d414a <qlcnic_alloc_rx_skb+282>: mov %r14,%rdi
0xffffffffa02d414d <qlcnic_alloc_rx_skb+285>: callq *%rax
/usr/src/debug/kernel-2.6.32-358.18.1.el6/linux-2.6.32-358.18.1.el6.x86_64/drivers/net/qlcnic/qlcnic_init.c: 1456
0xffffffffa02d414f <qlcnic_alloc_rx_skb+287>: test %eax,%eax
0xffffffffa02d4151 <qlcnic_alloc_rx_skb+289>: jne 0xffffffffa02d41b8 <qlcnic_alloc_rx_skb+392>
/usr/src/debug/kernel-2.6.32-358.18.1.el6/linux-2.6.32-358.18.1.el6.x86_64/drivers/net/qlcnic/qlcnic_init.c: 1462
0xffffffffa02d4153 <qlcnic_alloc_rx_skb+291>: mov -0x40(%rbp),%rdx
0xffffffffa02d4157 <qlcnic_alloc_rx_skb+295>: mov %rbx,0x8(%rdx)
1447 adapter->stats.skb_alloc_failure++;
1448 return -ENOMEM;
1449 }
1450
1451 skb_reserve(skb, NET_IP_ALIGN);
1452
1453 dma = pci_map_single(pdev, skb->data,
1454 rds_ring->dma_size, PCI_DMA_FROMDEVICE);
1455
1456 if (pci_dma_mapping_error(pdev, dma)) {
1457 adapter->stats.rx_dma_map_error++;
1458 dev_kfree_skb_any(skb);
1459 return -ENOMEM;
1460 }
1461
1462 buffer->skb = skb; <<------ here
1463 buffer->dma = dma;
1464
1465 return 0;
1466 }
1467
This solution is part of Red Hat’s fast-track publication program, providing a huge library of solutions that Red Hat engineers have created while supporting our customers. To give you the knowledge you need the instant it becomes available, these articles may be presented in a raw and unedited form.
Comments