Fibre channel port failure eventually leads to panic in lpfc_els_free_iocb() during port recovery after devloss timeouts

Solution Verified - Updated -

Issue

A series of events starting with a devloss timeout can lead to a kernel panic after reaching lpfc_els_free_iocb() when using an Emulex Fibre Channel dual-port HBA with the Emulex (lpfc) kernel driver.

Start of issue:
[251531.349815] lpfc 0000:3b:00.0: 0:3094 Start rport recovery on shost id 0xf fc_id 0x2105a0 vpi 0x0 rpi 0x5 state 0x7 flags 0x80000000
[251531.353675] lpfc 0000:3b:00.0: 0:3094 Start rport recovery on shost id 0xf fc_id 0x1f07e0 vpi 0x0 rpi 0x3 state 0x7 flags 0x80000000
[251531.357382] lpfc 0000:3b:00.0: 0:3094 Start rport recovery on shost id 0xf fc_id 0x2105a0 vpi 0x0 rpi 0x5 state 0x7 flags 0x408000
:
>>
Last few messages before the panic:
:
[260084.756912] lpfc 0000:3b:00.0: 0:(0):0127 ELS timeout Data: x5 x1f07e0 x8a x7c3
[260104.740741] lpfc 0000:3b:00.0: 0:(0):0127 ELS timeout Data: x5 x1f07e0 x8a x7c4
[260104.741497] lpfc 0000:3b:00.0: 0:(0):0127 ELS timeout Data: x5 x1f07e0 x8a x7c3
[260110.614717] lpfc 0000:3b:00.0: 0:2885 Port Status Event: port status reg 0x81000000, port smphr reg 0xc000, error 1=0x9f000013, error 2=0x1001
[260112.445398] lpfc 0000:3b:00.0: 0:2887 Reset Needed: Attempting Port Recovery...
[260112.777811] lpfc 0000:3b:00.0: 0:(0):2756 LOGO failure, No Retry DID:2105A0 Status:x3/x103
[260112.778830] lpfc 0000:3b:00.0: 0:(0):2756 LOGO failure, No Retry DID:1F07E0 Status:x3/x103
[260112.780797] Modules linked in: oracleacfs(POE) oracleadvm(POE) oracleoks(POE) ip6table_filter ip6_tables tcp_diag inet_diag nfsv3 nfs_acl nfs lockd grace fscache 
8021q garp mrp stp llc bonding falcon_lsm_serviceable(PE) falcon_nf_netcontain(PE) falcon_kal(E) falcon_lsm_pinned_11312(E) sunrpc vfat fat dell_smbios dell_wmi_descr
iptor skx_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm dcdbas irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk
_helper cryptd wdat_wdt pcspkr dm_service_time cdc_ether sg usbnet mei_me mii lpc_ich i2c_i801 mei wmi ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter acpi_pad 
dm_multipath dm_mod binfmt_misc ip_tables xfs libcrc32c sd_mod lpfc mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops
[260112.789758] CPU: 19 PID: 479 Comm: lpfc_worker_0 Kdump: loaded Tainted: P           OE  ------------   3.10.0-1160.21.1.el7.x86_64 #1

[260112.792999] RIP: 0010:[<ffffffffc079c0ad>]  [<ffffffffc079c0ad>] lpfc_scsi_cmd_iocb_cmpl+0x4d/0x1160 [lpfc]
PID: 479    TASK: ffff9672fe78b180  CPU: 19  COMMAND: "lpfc_worker_0"

Backtrace from the crash utility:
crash> bt
PID: 479    TASK: ffff9672fe78b180  CPU: 19  COMMAND: "lpfc_worker_0"
 #0 [ffff9672d03ef888] machine_kexec at ffffffff9f0662c4
 #1 [ffff9672d03ef8e8] __crash_kexec at ffffffff9f122732
 #2 [ffff9672d03ef9b8] crash_kexec at ffffffff9f122820
 #3 [ffff9672d03ef9d0] oops_end at ffffffff9f78d798
 #4 [ffff9672d03ef9f8] die at ffffffff9f030a7b
 #5 [ffff9672d03efa28] do_general_protection at ffffffff9f78d092
 #6 [ffff9672d03efa60] general_protection at ffffffff9f78c718
    [exception RIP: lpfc_scsi_cmd_iocb_cmpl+77]
    RIP: ffffffffc079c0ad  RSP: ffff9672d03efb18  RFLAGS: 00010282
    RAX: 2030203020302030  RBX: ffff9672a97bec00  RCX: ffffffffc079c060
    RDX: ffff9672a97bec78  RSI: ffff9672a97bec78  RDI: ffff9672e9b84000
    RBP: ffff9672d03efbb8   R8: ffff9672a97bec78   R9: 000000018040003d
    R10: 0000000067134c01  R11: ffff96a867134d40  R12: ffff96a86ea09340
    R13: ffff9672e9b84000  R14: ffff967337fe6740  R15: ffff9672a97bec78
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #7 [ffff9672d03efbc0] lpfc_sli_cancel_iocbs at ffffffffc0740d9e [lpfc]
 #8 [ffff9672d03efbf8] lpfc_hba_clean_txcmplq at ffffffffc0779f48 [lpfc]
 #9 [ffff9672d03efc50] lpfc_hba_down_post_s4 at ffffffffc077a5d1 [lpfc]
#10 [ffff9672d03efcc8] lpfc_hba_down_post at ffffffffc077b532 [lpfc]
#11 [ffff9672d03efcd8] lpfc_sli_brdrestart_s4 at ffffffffc0742926 [lpfc]
#12 [ffff9672d03efd08] lpfc_sli_brdrestart at ffffffffc07429b2 [lpfc]
#13 [ffff9672d03efd18] lpfc_sli4_port_sta_fn_reset.constprop.41 at ffffffffc07801fa [lpfc]
#14 [ffff9672d03efd40] lpfc_handle_eratt_s4 at ffffffffc07804d1 [lpfc]
#15 [ffff9672d03efd90] lpfc_handle_eratt at ffffffffc077bbc5 [lpfc]      
#16 [ffff9672d03efda0] lpfc_work_done at ffffffffc0775370 [lpfc]
#17 [ffff9672d03efe58] lpfc_do_work at ffffffffc07756b0 [lpfc]
#18 [ffff9672d03efec8] kthread at ffffffff9f0c5da1

static void
lpfc_scsi_cmd_iocb_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pIocbIn,
                        struct lpfc_iocbq *pIocbOut)
{
:
.
        /* Sanity check on return of outstanding command */
        cmd = lpfc_cmd->pCmd;
        if (!cmd)
                return;
        shost = cmd->device->host;                   ***********************>

..
}

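RAX is invalid: RAX: 2030203020302030. This is not a canonical x86_64 address (its bytes are the ASCII characters '0' and space repeated), so dereferencing it raises the general protection fault seen in the backtrace. RAX holds the value loaded for cmd->device, which suggests the SCSI command structure was stale or had been overwritten.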

crash> dis lpfc_scsi_cmd_iocb_cmpl
0xffffffffc079c060 <lpfc_scsi_cmd_iocb_cmpl>:   nopl   0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffc079c065 <lpfc_scsi_cmd_iocb_cmpl+5>: push   %rbp
0xffffffffc079c066 <lpfc_scsi_cmd_iocb_cmpl+6>: mov    %rsp,%rbp
0xffffffffc079c069 <lpfc_scsi_cmd_iocb_cmpl+9>: push   %r15
0xffffffffc079c06b <lpfc_scsi_cmd_iocb_cmpl+11>:    push   %r14
0xffffffffc079c06d <lpfc_scsi_cmd_iocb_cmpl+13>:    push   %r13
0xffffffffc079c06f <lpfc_scsi_cmd_iocb_cmpl+15>:    mov    %rdi,%r13
0xffffffffc079c072 <lpfc_scsi_cmd_iocb_cmpl+18>:    push   %r12
0xffffffffc079c074 <lpfc_scsi_cmd_iocb_cmpl+20>:    push   %rbx
0xffffffffc079c075 <lpfc_scsi_cmd_iocb_cmpl+21>:    sub    $0x78,%rsp
0xffffffffc079c079 <lpfc_scsi_cmd_iocb_cmpl+25>:    mov    0x188(%rsi),%rbx
0xffffffffc079c080 <lpfc_scsi_cmd_iocb_cmpl+32>:    mov    0x180(%rsi),%r14
0xffffffffc079c087 <lpfc_scsi_cmd_iocb_cmpl+39>:    mov    0x18(%rbx),%rax
0xffffffffc079c08b <lpfc_scsi_cmd_iocb_cmpl+43>:    mov    (%rax),%rax
0xffffffffc079c08e <lpfc_scsi_cmd_iocb_cmpl+46>:    mov    %rax,-0x38(%rbp)
0xffffffffc079c092 <lpfc_scsi_cmd_iocb_cmpl+50>:    lock incl 0x1138(%rdi)
0xffffffffc079c099 <lpfc_scsi_cmd_iocb_cmpl+57>:    mov    0x10(%rbx),%r12
0xffffffffc079c09d <lpfc_scsi_cmd_iocb_cmpl+61>:    test   %r12,%r12
0xffffffffc079c0a0 <lpfc_scsi_cmd_iocb_cmpl+64>:    je     0xffffffffc079c218 <lpfc_scsi_cmd_iocb_cmpl+440>
0xffffffffc079c0a6 <lpfc_scsi_cmd_iocb_cmpl+70>:    mov    (%r12),%rax
0xffffffffc079c0aa <lpfc_scsi_cmd_iocb_cmpl+74>:    mov    %rdx,%r15
>> dies here.
0xffffffffc079c0ad <lpfc_scsi_cmd_iocb_cmpl+77>:    mov    (%rax),%rax

RAX is invalid: RAX: 2030203020302030, so the load from (%rax) at lpfc_scsi_cmd_iocb_cmpl+77 faults.

Environment

  • Red Hat Enterprise Linux 7.9

Subscriber exclusive content

A Red Hat subscription provides unlimited access to our knowledgebase, tools, and much more.

Current Customers and Partners

Log in for full access

Log In

New to Red Hat?

Learn more about Red Hat subscriptions

Using a Red Hat product through a public cloud?

How to access this content