Why the system crashed with message "WARNING: at drivers/net/qlcnic/qlcnic_init.c:1477 qlcnic_process_rxbuf+xxx [qlcnic]" ?

Solution Verified - Updated -

Environment

  • Red Hat Enterprise Linux 6.4
    • seen on kernels 2.6.32-358.0.1.el6 and 2.6.32-358.18.1.el6
  • QLogic NIC
    • Ethernet controller: QLogic Corp. cLOM8214 1/10GbE Controller
    • NIC firmware version: 4.12.11

Issue

  • System crashed after following traces:
WARNING: at drivers/net/qlcnic/qlcnic_init.c:1477 qlcnic_process_rxbuf+0xdb/0xf0 [qlcnic]() (Tainted: G        W  ---------------   )
Hardware name: ProLiant DL580 G7
Modules linked in: ipmi_devintf nfs lockd fscache auth_rpcgss nfs_acl sunrpc cpufreq_ondemand freq_table pcc_cpufreq ipv6 power_meter qlcnic netxen_nic sg microcode serio_raw iTCO_wdt iTCO_vendor_support hpilo hpwdt i7core_edac edac_core shpchp ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix hpsa radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
Pid: 4, comm: ksoftirqd/0 Tainted: G        W  ---------------    2.6.32-358.0.1.el6.x86_64 #1
Call Trace:
 <IRQ>  [<ffffffff8106e2e7>] ? warn_slowpath_common+0x87/0xc0
 [<ffffffff8106e33a>] ? warn_slowpath_null+0x1a/0x20
 [<ffffffffa02d2dbb>] ? qlcnic_process_rxbuf+0xdb/0xf0 [qlcnic]
 [<ffffffffa02d5622>] ? qlcnic_process_rcv_ring+0x112/0x8b0 [qlcnic]
 [<ffffffffa02ccba3>] ? qlcnic_rx_poll+0x33/0x90 [qlcnic]
 [<ffffffff810816ec>] ? run_timer_softirq+0x4c/0x340
 [<ffffffff8144ce23>] ? net_rx_action+0x103/0x2f0
 [<ffffffff81076fb1>] ? __do_softirq+0xc1/0x1e0
 [<ffffffff810e1720>] ? handle_IRQ_event+0x60/0x170
 [<ffffffff8100c1cc>] ? call_softirq+0x1c/0x30
 [<ffffffff8100de05>] ? do_softirq+0x65/0xa0
 [<ffffffff81076d95>] ? irq_exit+0x85/0x90
 [<ffffffff81516d75>] ? do_IRQ+0x75/0xf0
 [<ffffffff8100b9d3>] ? ret_from_intr+0x0/0x11
 <EOI>  [<ffffffff8105ab8f>] ? finish_task_switch+0x4f/0xe0
 [<ffffffff8150d7c0>] ? thread_return+0x4e/0x76e
 [<ffffffff8100b9ce>] ? common_interrupt+0xe/0x13
 [<ffffffff81076b65>] ? ksoftirqd+0xd5/0x110
 [<ffffffff81076a90>] ? ksoftirqd+0x0/0x110
 [<ffffffff81096936>] ? kthread+0x96/0xa0
 [<ffffffff8100c0ca>] ? child_rip+0xa/0x20
 [<ffffffff810968a0>] ? kthread+0x0/0xa0
 [<ffffffff8100c0c0>] ? child_rip+0x0/0x20

Resolution

  • It has found to be an hardware problem and changing the NICs has resolved the issue.

Diagnostic Steps

  • vmcore log
      UPTIME: 6 days, 23:50:32
LOAD AVERAGE: 34.03, 16.26, 6.41  <<< Seems like spike 

 #4 [ffff880045603a80] __bad_area_nosemaphore at ffffffff81046e85
 #5 [ffff880045603ad0] bad_area_nosemaphore at ffffffff81046f53
 #6 [ffff880045603ae0] __do_page_fault at ffffffff810476b1
 #7 [ffff880045603c00] do_page_fault at ffffffff815131fe
 #8 [ffff880045603c30] page_fault at ffffffff815105b5
    [exception RIP: _spin_lock+14]
    RIP: ffffffff8151009e  RSP: ffff880045603ce8  RFLAGS: 00010002
    RAX: 0000000000010000  RBX: 0000000100000011  RCX: 0000000000000001
    RDX: 0000000000000080  RSI: ffff88013601c8c0  RDI: 0000000100000011
    RBP: ffff880045603ce8   R8: 0000000000000018   R9: 0000000000000001
    R10: 0000000000000002  R11: 0000000000000053  R12: 0000000000000282
    R13: ffff88013601c8c0  R14: ffff881034070440  R15: 0000000100000001
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #9 [ffff880045603cf0] kmem_cache_free at ffffffff81168c2a
#10 [ffff880045603d60] mempool_free_slab at ffffffff8111c557
#11 [ffff880045603d70] mempool_free at ffffffff8111c6a5
#12 [ffff880045603d90] scsi_sg_free at ffffffff81379cbc
#13 [ffff880045603da0] __sg_free_table at ffffffff812860d0
#14 [ffff880045603df0] __scsi_release_buffers at ffffffff81379e34

0xffffffff81168c19 <kmem_cache_free+489>:       je     0xffffffff81168c95 <kmem_cache_free+613>
/usr/src/debug/kernel-2.6.32-358.0.1.el6/linux-2.6.32-358.0.1.el6.x86_64/mm/slab.c: 1081
0xffffffff81168c1b <kmem_cache_free+491>:       lea    0x10(%r15),%rbx
0xffffffff81168c1f <kmem_cache_free+495>:       mov    %edx,-0x50(%rbp)
0xffffffff81168c22 <kmem_cache_free+498>:       mov    %rbx,%rdi
0xffffffff81168c25 <kmem_cache_free+501>:       callq  0xffffffff81510090 <_spin_lock>
/usr/src/debug/kernel-2.6.32-358.0.1.el6/linux-2.6.32-358.0.1.el6.x86_64/mm/slab.c: 1082
0xffffffff81168c2a <kmem_cache_free+506>:       mov    (%r15),%eax


         1077   l3 = cachep->nodelists[node];
         1078   STATS_INC_NODEFREES(cachep);
         1079   if (l3->alien && l3->alien[nodeid]) {
         1080       alien = l3->alien[nodeid];
         1081       spin_lock(&alien->lock); <<< Seems waiting here 



crash> p -x ((struct kmem_cache*)0xffff881034070440)->nodelists[0]
$1 = (struct kmem_list3 *) 0xffff88083394ac40 <<< Which seems to be pointing to the Corrupted slab part can see above from the bad slab list

crash> struct kmem_list3 0xffff88083394ac40
struct kmem_list3 {
  slabs_partial = {
    next = 0xffff8808339ac000, 
    prev = 0xffff8808339ac000
  }, 
  slabs_full = {
    next = 0xffff880058072040, 
    prev = 0xffff88013601c040 <<-------------- 
  }, 
  slabs_free = {
    next = 0xffff88083394ac60, 
    prev = 0xffff88083394ac60
  }, 
  free_objects = 0x1, 
  free_limit = 0x4fb, 
  colour_next = 0x0, 
  list_lock = {
    raw_lock = {
      slock = 0xb775b774
    }
  }, 
  shared = 0xffff8808339ab000, 
  alien = 0xffff8808339a0740, 
  next_reap = 0x123ff4f5d, 
  free_touched = 0x1
}



crash> struct kmem_list3 0xffff88083394ac40
struct kmem_list3 {
  alien = 0xffff8808339a0740, 

crash> struct array_cache 0xffff8808339a0740
struct array_cache {
  avail = 0x0, 
  limit = 0x0, 
  batchcount = 0x3394adc0, 
  touched = 0xffff8808, 
  lock = {
    raw_lock = {
      slock = 0x3394ad40 <<<---
    }
  }, 
  entry = 0xffff8808339a0758

  • Appears to be a case of slab corruption:
kmem: sgpool-8: full list: slab: ffff88013601c040  bad prev pointer: d0ebd1dd6a1f9463
kmem: sgpool-8: full list: slab: ffff88013601c040  bad inuse counter: 2151939875
kmem: sgpool-8: full list: slab: ffff88013601c040  bad inuse counter: 2151939875
kmem: sgpool-8: full list: slab: ffff88013601c040  bad s_mem pointer: 2b8ebc150a080101
  • Or can see some of know issue log related to NFS
log | grep "Lock reclaim failed"

nfs4_reclaim_open_state: Lock reclaim failed!
nfs4_reclaim_open_state: Lock reclaim failed!
nfs4_reclaim_open_state: Lock reclaim failed!
nfs4_reclaim_open_state: Lock reclaim failed!

# kernel >= 2.6.32-358.el6 and kernel < 2.6.32-358.6.1.el6 Could see the issue here  
 RELEASE: 2.6.32-358.0.1.el6.x86_64 <<< kernel is in use 

Which is a known issue related to NFS
  • kernel was upgraded and system crashed again:
general protection fault: 0000 [#1] SMP 
last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:01:03.0/irq
CPU 6 
Modules linked in: bridge stp llc ipmi_devintf nfs lockd fscache auth_rpcgss nfs_acl sunrpc cpufreq_ondemand freq_table pcc_cpufreq ipv6 power_meter qlcnic netxen_nic microcode sg serio_raw iTCO_wdt iTCO_vendor_support hpwdt hpilo i7core_edac edac_core shpchp ext4 jbd2 mbcache sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix hpsa radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]

Pid: 33831, comm: bptm Tainted: G        W  ---------------    2.6.32-358.18.1.el6.x86_64 #1 HP ProLiant DL580 G7
RIP: 0010:[<ffffffffa02d4157>]  [<ffffffffa02d4157>] qlcnic_alloc_rx_skb+0x127/0x1b0 [qlcnic]
RSP: 0018:ffff880045783da8  EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff880832d47680 RCX: 0000000000002da0
RDX: dead0000001000f0 RSI: 0000000047bd8022 RDI: ffff880024060000
RBP: ffff880045783df8 R08: 0000000000000002 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8808339a3000
R13: ffff88083272b6e0 R14: ffff8808339a3090 R15: ffff880047bd8022
FS:  00007f817bba1720(0000) GS:ffff880045780000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000001c292b8 CR3: 0000000139446000 CR4: 00000000000007e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process bptm (pid: 33831, threadinfo ffff880109c8a000, task ffff880109b2aaa0)
Stack:
 ffff8808339a3090 00002da00000112c dead0000001000f0 0000000047bd8022
<d> ffff880045783df8 ffff881033693b00 dead000000100100 ffff8810324a3d90
<d> 0000000000000009 ffff88083272b6e0 ffff880045783e98 ffffffffa02d6870
Call Trace:
 <IRQ> 
 [<ffffffffa02d6870>] qlcnic_process_rcv_ring+0x350/0x8f0 [qlcnic]
 [<ffffffffa02cdba3>] qlcnic_rx_poll+0x33/0x90 [qlcnic]
 [<ffffffff810817ec>] ? run_timer_softirq+0x4c/0x340
 [<ffffffff8144d453>] net_rx_action+0x103/0x2f0
 [<ffffffff810770b1>] __do_softirq+0xc1/0x1e0
 [<ffffffff8100c1cc>] call_softirq+0x1c/0x30
 <EOI> 
 [<ffffffff8100de05>] ? do_softirq+0x65/0xa0
 [<ffffffff81076f3a>] local_bh_enable+0x9a/0xb0
 [<ffffffff8148c678>] tcp_prequeue_process+0x88/0xa0
 [<ffffffff8148fd7c>] tcp_recvmsg+0xacc/0xe80
 [<ffffffff8100b9ce>] ? common_interrupt+0xe/0x13
 [<ffffffff8100b9ce>] ? common_interrupt+0xe/0x13
 [<ffffffff814b001a>] inet_recvmsg+0x5a/0x90
 [<ffffffff81437b6d>] ? sock_recvmsg+0x12d/0x160
 [<ffffffff81437b73>] sock_recvmsg+0x133/0x160
 [<ffffffff81096da0>] ? autoremove_wake_function+0x0/0x40
 [<ffffffff8119887c>] ? core_sys_select+0x1ec/0x2c0
 [<ffffffff8100b9ce>] ? common_interrupt+0xe/0x13
 [<ffffffff81182651>] ? fget_light+0x21/0x90
 [<ffffffff81437cee>] sys_recvfrom+0xee/0x180
 [<ffffffff810097cc>] ? __switch_to+0x1ac/0x320
 [<ffffffff810dc7a6>] ? audit_syscall_entry+0x46/0x200
 [<ffffffff810dc937>] ? audit_syscall_entry+0x1d7/0x200
 [<ffffffff8100b072>] system_call_fastpath+0x16/0x1b
Code: 85 f6 74 68 49 8b 84 24 50 02 00 00 48 85 c0 74 5b 48 8b 40 60 48 85 c0 74 3a 48 8b 75 c8 4c 89 f7 ff d0 85 c0 75 65 48 8b 55 c0 <48> 89 5a 08 48 8b 45 c8 48 89 42 20 31 c0 48 8b 5d d8 4c 8b 65 
RIP  [<ffffffffa02d4157>] qlcnic_alloc_rx_skb+0x127/0x1b0 [qlcnic]
 RSP <ffff880045783da8>
crash> bt
PID: 33831  TASK: ffff880109b2aaa0  CPU: 6   COMMAND: "bptm"
 #0 [ffff880045783b30] machine_kexec at ffffffff81035d6b
 #1 [ffff880045783b90] crash_kexec at ffffffff810c0e22
 #2 [ffff880045783c60] oops_end at ffffffff81511c20
 #3 [ffff880045783c90] die at ffffffff8100f19b
 #4 [ffff880045783cc0] do_general_protection at ffffffff81511722
 #5 [ffff880045783cf0] general_protection at ffffffff81510ef5
    [exception RIP: qlcnic_alloc_rx_skb+295]
    RIP: ffffffffa02d4157  RSP: ffff880045783da8  RFLAGS: 00010246
    RAX: 0000000000000000  RBX: ffff880832d47680  RCX: 0000000000002da0
    RDX: dead0000001000f0  RSI: 0000000047bd8022  RDI: ffff880024060000
    RBP: ffff880045783df8   R8: 0000000000000002   R9: 0000000000000000
    R10: 0000000000000000  R11: 0000000000000000  R12: ffff8808339a3000
    R13: ffff88083272b6e0  R14: ffff8808339a3090  R15: ffff880047bd8022
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #6 [ffff880045783e00] qlcnic_process_rcv_ring at ffffffffa02d6870 [qlcnic]
 #7 [ffff880045783ea0] qlcnic_rx_poll at ffffffffa02cdba3 [qlcnic]
 #8 [ffff880045783ee0] net_rx_action at ffffffff8144d453
 #9 [ffff880045783f40] __do_softirq at ffffffff810770b1
#10 [ffff880045783fb0] call_softirq at ffffffff8100c1cc
--- <IRQ stack> ---
#11 [ffff880109c8bae0] do_softirq at ffffffff8100de05
#12 [ffff880109c8bb00] local_bh_enable at ffffffff81076f3a
#13 [ffff880109c8bb30] tcp_recvmsg at ffffffff8148fd7c
#14 [ffff880109c8bc40] inet_recvmsg at ffffffff814b001a
#15 [ffff880109c8bc80] sock_recvmsg at ffffffff81437b73
#16 [ffff880109c8be40] sys_recvfrom at ffffffff81437cee
#17 [ffff880109c8bf80] system_call_fastpath at ffffffff8100b072
    RIP: 000000329d6e96b2  RSP: 00007fff361e6600  RFLAGS: 00000213
    RAX: 000000000000002d  RBX: ffffffff8100b072  RCX: 000000329d6e96b2
    RDX: 000000000000df00  RSI: 00007f817510f100  RDI: 0000000000000000
    RBP: 0000000000000b50   R8: 0000000000000000   R9: 0000000000000000
    R10: 0000000000000000  R11: 0000000000000246  R12: 00007f817bba16a8
    R13: 000000000000df00  R14: 00007f817510f100  R15: 0000000000000000
    ORIG_RAX: 000000000000002d  CS: 0033  SS: 002b
/usr/src/debug/kernel-2.6.32-358.18.1.el6/linux-2.6.32-358.18.1.el6.x86_64/arch/x86/include/asm/dma-mapping.h: 49
0xffffffffa02d4146 <qlcnic_alloc_rx_skb+278>:   mov    -0x38(%rbp),%rsi
0xffffffffa02d414a <qlcnic_alloc_rx_skb+282>:   mov    %r14,%rdi
0xffffffffa02d414d <qlcnic_alloc_rx_skb+285>:   callq  *%rax
/usr/src/debug/kernel-2.6.32-358.18.1.el6/linux-2.6.32-358.18.1.el6.x86_64/drivers/net/qlcnic/qlcnic_init.c: 1456
0xffffffffa02d414f <qlcnic_alloc_rx_skb+287>:   test   %eax,%eax
0xffffffffa02d4151 <qlcnic_alloc_rx_skb+289>:   jne    0xffffffffa02d41b8 <qlcnic_alloc_rx_skb+392>
/usr/src/debug/kernel-2.6.32-358.18.1.el6/linux-2.6.32-358.18.1.el6.x86_64/drivers/net/qlcnic/qlcnic_init.c: 1462
0xffffffffa02d4153 <qlcnic_alloc_rx_skb+291>:   mov    -0x40(%rbp),%rdx
0xffffffffa02d4157 <qlcnic_alloc_rx_skb+295>:   mov    %rbx,0x8(%rdx)
         1447       adapter->stats.skb_alloc_failure++;
         1448       return -ENOMEM;
         1449   }
         1450 
         1451   skb_reserve(skb, NET_IP_ALIGN);
         1452 
         1453   dma = pci_map_single(pdev, skb->data,
         1454           rds_ring->dma_size, PCI_DMA_FROMDEVICE);
         1455 
         1456   if (pci_dma_mapping_error(pdev, dma)) {
         1457       adapter->stats.rx_dma_map_error++;
         1458       dev_kfree_skb_any(skb);
         1459       return -ENOMEM;
         1460   }
         1461 
         1462   buffer->skb = skb;   <<------ here
         1463   buffer->dma = dma; 
         1464 
         1465   return 0;
         1466 }
         1467

This solution is part of Red Hat’s fast-track publication program, providing a huge library of solutions that Red Hat engineers have created while supporting our customers. To give you the knowledge you need the instant it becomes available, these articles may be presented in a raw and unedited form.

Comments