Fibre channel port failure eventually leads to panic in lpfc_els_free_iocb() during port recovery after devloss timeouts
Issue
A series of events starting with a devloss timeout can lead to a kernel panic after reaching lpfc_els_free_iocb() when using an Emulex Fibre Channel dual-port HBA with the Emulex lpfc kernel driver.
Start of issue:
[251531.349815] lpfc 0000:3b:00.0: 0:3094 Start rport recovery on shost id 0xf fc_id 0x2105a0 vpi 0x0 rpi 0x5 state 0x7 flags 0x80000000
[251531.353675] lpfc 0000:3b:00.0: 0:3094 Start rport recovery on shost id 0xf fc_id 0x1f07e0 vpi 0x0 rpi 0x3 state 0x7 flags 0x80000000
[251531.357382] lpfc 0000:3b:00.0: 0:3094 Start rport recovery on shost id 0xf fc_id 0x2105a0 vpi 0x0 rpi 0x5 state 0x7 flags 0x408000
:
>>
Last few messages before the panic:
:
[260084.756912] lpfc 0000:3b:00.0: 0:(0):0127 ELS timeout Data: x5 x1f07e0 x8a x7c3
[260104.740741] lpfc 0000:3b:00.0: 0:(0):0127 ELS timeout Data: x5 x1f07e0 x8a x7c4
[260104.741497] lpfc 0000:3b:00.0: 0:(0):0127 ELS timeout Data: x5 x1f07e0 x8a x7c3
[260110.614717] lpfc 0000:3b:00.0: 0:2885 Port Status Event: port status reg 0x81000000, port smphr reg 0xc000, error 1=0x9f000013, error 2=0x1001
[260112.445398] lpfc 0000:3b:00.0: 0:2887 Reset Needed: Attempting Port Recovery...
[260112.777811] lpfc 0000:3b:00.0: 0:(0):2756 LOGO failure, No Retry DID:2105A0 Status:x3/x103
[260112.778830] lpfc 0000:3b:00.0: 0:(0):2756 LOGO failure, No Retry DID:1F07E0 Status:x3/x103
[260112.780797] Modules linked in: oracleacfs(POE) oracleadvm(POE) oracleoks(POE) ip6table_filter ip6_tables tcp_diag inet_diag nfsv3 nfs_acl nfs lockd grace fscache 8021q garp mrp stp llc bonding falcon_lsm_serviceable(PE) falcon_nf_netcontain(PE) falcon_kal(E) falcon_lsm_pinned_11312(E) sunrpc vfat fat dell_smbios dell_wmi_descriptor skx_edac intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm dcdbas irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd wdat_wdt pcspkr dm_service_time cdc_ether sg usbnet mei_me mii lpc_ich i2c_i801 mei wmi ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter acpi_pad dm_multipath dm_mod binfmt_misc ip_tables xfs libcrc32c sd_mod lpfc mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops
[260112.789758] CPU: 19 PID: 479 Comm: lpfc_worker_0 Kdump: loaded Tainted: P OE ------------ 3.10.0-1160.21.1.el7.x86_64 #1
[260112.792999] RIP: 0010:[<ffffffffc079c0ad>] [<ffffffffc079c0ad>] lpfc_scsi_cmd_iocb_cmpl+0x4d/0x1160 [lpfc]
PID: 479 TASK: ffff9672fe78b180 CPU: 19 COMMAND: "lpfc_worker_0"
Backtrace in crash:
crash> bt
PID: 479 TASK: ffff9672fe78b180 CPU: 19 COMMAND: "lpfc_worker_0"
#0 [ffff9672d03ef888] machine_kexec at ffffffff9f0662c4
#1 [ffff9672d03ef8e8] __crash_kexec at ffffffff9f122732
#2 [ffff9672d03ef9b8] crash_kexec at ffffffff9f122820
#3 [ffff9672d03ef9d0] oops_end at ffffffff9f78d798
#4 [ffff9672d03ef9f8] die at ffffffff9f030a7b
#5 [ffff9672d03efa28] do_general_protection at ffffffff9f78d092
#6 [ffff9672d03efa60] general_protection at ffffffff9f78c718
[exception RIP: lpfc_scsi_cmd_iocb_cmpl+77]
RIP: ffffffffc079c0ad RSP: ffff9672d03efb18 RFLAGS: 00010282
RAX: 2030203020302030 RBX: ffff9672a97bec00 RCX: ffffffffc079c060
RDX: ffff9672a97bec78 RSI: ffff9672a97bec78 RDI: ffff9672e9b84000
RBP: ffff9672d03efbb8 R8: ffff9672a97bec78 R9: 000000018040003d
R10: 0000000067134c01 R11: ffff96a867134d40 R12: ffff96a86ea09340
R13: ffff9672e9b84000 R14: ffff967337fe6740 R15: ffff9672a97bec78
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff9672d03efbc0] lpfc_sli_cancel_iocbs at ffffffffc0740d9e [lpfc]
#8 [ffff9672d03efbf8] lpfc_hba_clean_txcmplq at ffffffffc0779f48 [lpfc]
#9 [ffff9672d03efc50] lpfc_hba_down_post_s4 at ffffffffc077a5d1 [lpfc]
#10 [ffff9672d03efcc8] lpfc_hba_down_post at ffffffffc077b532 [lpfc]
#11 [ffff9672d03efcd8] lpfc_sli_brdrestart_s4 at ffffffffc0742926 [lpfc]
#12 [ffff9672d03efd08] lpfc_sli_brdrestart at ffffffffc07429b2 [lpfc]
#13 [ffff9672d03efd18] lpfc_sli4_port_sta_fn_reset.constprop.41 at ffffffffc07801fa [lpfc]
#14 [ffff9672d03efd40] lpfc_handle_eratt_s4 at ffffffffc07804d1 [lpfc]
#15 [ffff9672d03efd90] lpfc_handle_eratt at ffffffffc077bbc5 [lpfc]
#16 [ffff9672d03efda0] lpfc_work_done at ffffffffc0775370 [lpfc]
#17 [ffff9672d03efe58] lpfc_do_work at ffffffffc07756b0 [lpfc]
#18 [ffff9672d03efec8] kthread at ffffffff9f0c5da1
static void
lpfc_scsi_cmd_iocb_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *pIocbIn,
struct lpfc_iocbq *pIocbOut)
{
:
.
/* Sanity check on return of outstanding command */
cmd = lpfc_cmd->pCmd;
if (!cmd)
return;
shost = cmd->device->host; ***********************>
..
}
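RAX holds an invalid pointer: RAX: 2030203020302030. This is a non-canonical x86_64 address, so dereferencing it raises the general protection fault seen in the backtrace. Its bytes are the ASCII characters '0' (0x30) and space (0x20), which suggests the memory it was read from had been overwritten with text data.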
crash> dis lpfc_scsi_cmd_iocb_cmpl
0xffffffffc079c060 <lpfc_scsi_cmd_iocb_cmpl>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffc079c065 <lpfc_scsi_cmd_iocb_cmpl+5>: push %rbp
0xffffffffc079c066 <lpfc_scsi_cmd_iocb_cmpl+6>: mov %rsp,%rbp
0xffffffffc079c069 <lpfc_scsi_cmd_iocb_cmpl+9>: push %r15
0xffffffffc079c06b <lpfc_scsi_cmd_iocb_cmpl+11>: push %r14
0xffffffffc079c06d <lpfc_scsi_cmd_iocb_cmpl+13>: push %r13
0xffffffffc079c06f <lpfc_scsi_cmd_iocb_cmpl+15>: mov %rdi,%r13
0xffffffffc079c072 <lpfc_scsi_cmd_iocb_cmpl+18>: push %r12
0xffffffffc079c074 <lpfc_scsi_cmd_iocb_cmpl+20>: push %rbx
0xffffffffc079c075 <lpfc_scsi_cmd_iocb_cmpl+21>: sub $0x78,%rsp
0xffffffffc079c079 <lpfc_scsi_cmd_iocb_cmpl+25>: mov 0x188(%rsi),%rbx
0xffffffffc079c080 <lpfc_scsi_cmd_iocb_cmpl+32>: mov 0x180(%rsi),%r14
0xffffffffc079c087 <lpfc_scsi_cmd_iocb_cmpl+39>: mov 0x18(%rbx),%rax
0xffffffffc079c08b <lpfc_scsi_cmd_iocb_cmpl+43>: mov (%rax),%rax
0xffffffffc079c08e <lpfc_scsi_cmd_iocb_cmpl+46>: mov %rax,-0x38(%rbp)
0xffffffffc079c092 <lpfc_scsi_cmd_iocb_cmpl+50>: lock incl 0x1138(%rdi)
0xffffffffc079c099 <lpfc_scsi_cmd_iocb_cmpl+57>: mov 0x10(%rbx),%r12
0xffffffffc079c09d <lpfc_scsi_cmd_iocb_cmpl+61>: test %r12,%r12
0xffffffffc079c0a0 <lpfc_scsi_cmd_iocb_cmpl+64>: je 0xffffffffc079c218 <lpfc_scsi_cmd_iocb_cmpl+440>
0xffffffffc079c0a6 <lpfc_scsi_cmd_iocb_cmpl+70>: mov (%r12),%rax
0xffffffffc079c0aa <lpfc_scsi_cmd_iocb_cmpl+74>: mov %rdx,%r15
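>> Dies at the next instruction (exception RIP: lpfc_scsi_cmd_iocb_cmpl+77): RAX was loaded with cmd->device at +70 and is dereferenced here to read ->host.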
0xffffffffc079c0ad <lpfc_scsi_cmd_iocb_cmpl+77>: mov (%rax),%rax
RAX is invalid: RAX: 2030203020302030
Environment
- Red Hat Enterprise Linux 7.9
Subscriber exclusive content
A Red Hat subscription provides unlimited access to our knowledgebase, tools, and much more.