A hard lockup occurs on a CPU that is stuck in an I/O operation in the nouveau driver.

Solution Unverified - Updated -

Issue

  • The system crashed.
  • Kernel ring buffer:
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ACQUIRE
[7560994.819432] nouveau E[   PFIFO][0000:02:00.0] PBDMA0: ch 2 [Xorg[5542]] subc 0 mthd 0x001c data 0x00001004
[7560994.819432] Hardware name: Dell Inc. Precision Tower 5810/0K240Y, BIOS A08 07/31/2015
[7560994.819432]  ffffffff818674b8 00000000b52d055a ffff88045fd05af0 ffffffff816351f1
[7560994.819432]  ffff88045fd05b70 ffffffff8162ea6c 0000000000000010 ffff88045fd05b80
[7560994.819432]  ffff88045fd05b20 00000000b52d055a ffffffff8101cd69 0000000000000002
[7560994.819432] Call Trace:
[7560994.819432]  <NMI>  [<ffffffff816351f1>] dump_stack+0x19/0x1b
[7560994.819432]  [<ffffffff8162ea6c>] panic+0xd8/0x1e7
[7560994.819432]  [<ffffffff8101cd69>] ? sched_clock+0x9/0x10
[7560994.819432]  [<ffffffff8111b450>] ? restart_watchdog_hrtimer+0x50/0x50
[7560994.819432]  [<ffffffff8111b512>] watchdog_overflow_callback+0xc2/0xd0
[7560994.819432]  [<ffffffff8115ed71>] __perf_event_overflow+0xa1/0x250
[7560994.819432]  [<ffffffff8115f844>] perf_event_overflow+0x14/0x20
[7560994.819432]  [<ffffffff810325a8>] intel_pmu_handle_irq+0x1e8/0x470
[7560994.819432]  [<ffffffff8101cd15>] ? native_sched_clock+0x35/0x80
[7560994.819432]  [<ffffffff810bb45d>] ? sched_clock_local+0x1d/0x80
[7560994.819432]  [<ffffffff8163ebeb>] perf_event_nmi_handler+0x2b/0x50
[7560994.819432]  [<ffffffff8163e339>] nmi_handle.isra.0+0x69/0xb0
[7560994.819432]  [<ffffffff8163e4e9>] do_nmi+0x169/0x340
[7560994.819432]  [<ffffffff8163d771>] end_repeat_nmi+0x1e/0x2e
[7560994.819432]  [<ffffffff810d8a20>] ? get_monotonic_boottime+0xb0/0x100
[7560994.819432]  [<ffffffff810d8a20>] ? get_monotonic_boottime+0xb0/0x100
[7560994.819432]  [<ffffffff810d8a20>] ? get_monotonic_boottime+0xb0/0x100
[7560994.819432]  <<EOE>>  [<ffffffff810a3d91>] posix_get_boottime+0x11/0x20
[7560994.819432]  [<ffffffff810a5464>] SyS_clock_gettime+0x54/0xc0
[7560994.819432]  [<ffffffff81645909>] system_call_fastpath+0x16/0x1b
  • Although no 'hard LOCKUP' message appears in the kernel ring buffer at the time of the crash, the panic was triggered from watchdog_overflow_callback in NMI context, which indicates that a hard lockup was encountered.
  • Backtrace:
crash> bt -a
PID: 0      TASK: ffffffff81951440  CPU: 0   COMMAND: "swapper/0"
 #0 [ffff88045fc05af8] panic at ffffffff8162e9f0
 #1 [ffff88045fc05b78] watchdog_overflow_callback at ffffffff8111b512
 #2 [ffff88045fc05b88] __perf_event_overflow at ffffffff8115ed71
 #3 [ffff88045fc05c00] perf_event_overflow at ffffffff8115f844
 #4 [ffff88045fc05c10] intel_pmu_handle_irq at ffffffff810325a8
 #5 [ffff88045fc05e60] perf_event_nmi_handler at ffffffff8163ebeb
 #6 [ffff88045fc05e80] nmi_handle at ffffffff8163e339
 #7 [ffff88045fc05ec8] do_nmi at ffffffff8163e450
 #8 [ffff88045fc05ef0] end_repeat_nmi at ffffffff8163d771
    [exception RIP: ioread32+66]
    RIP: ffffffff813095d2  RSP: ffff88045fc03d80  RFLAGS: 00000092
    RAX: 000000008000001d  RBX: 0000000000000000  RCX: ffff880035eea508
    RDX: ffffffffa0257980  RSI: 0000000000000016  RDI: ffffc900121400c0
    RBP: ffff88045fc03e68   R8: 0000000000000000   R9: ffff88044ec00000
    R10: 0000000000000000  R11: 0000000000000000  R12: 0000000004000000
    R13: 0000000000040120  R14: ffff880035eea400  R15: 00000000000400c0
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
 #9 [ffff88045fc03d80] ioread32 at ffffffff813095d2
#10 [ffff88045fc03d80] gk104_fifo_intr at ffffffffa0257caf [nouveau]
#11 [ffff88045fc03e70] nvkm_mc_intr at ffffffffa0230815 [nouveau]
#12 [ffff88045fc03eb0] handle_irq_event_percpu at ffffffff8111c2be
#13 [ffff88045fc03ef8] handle_irq_event at ffffffff8111c49d
#14 [ffff88045fc03f20] handle_edge_irq at ffffffff8111f137
#15 [ffff88045fc03f40] handle_irq at ffffffff81016ecf
#16 [ffff88045fc03f78] do_IRQ at ffffffff81647daf
--- <IRQ stack> ---
#17 [ffffffff8193fe28] ret_from_intr at ffffffff8163d0ed
    [exception RIP: tick_nohz_idle_enter+68]
    RIP: ffffffff810e1034  RSP: ffffffff8193fed0  RFLAGS: 00000202
    RAX: 00000002c2a30af7  RBX: 001adc722f68bf10  RCX: 0000000000000000
    RDX: 00000000004b0a31  RSI: 0000000000000086  RDI: 0000000000000086
    RBP: ffffffff8193fed0   R8: 0000000000000000   R9: 0000000000000000
    R10: 0000000000000000  R11: 0000000000000000  R12: ffffffff810e0a38
    R13: ffffffff8193fe90  R14: ffffffff810a9c62  R15: ffffffff8193fe38
    ORIG_RAX: ffffffffffffff2d  CS: 0010  SS: 0018
#18 [ffffffff8193fed8] cpu_startup_entry at ffffffff810d615e
#19 [ffffffff8193ff30] rest_init at ffffffff81624e07
#20 [ffffffff8193ff40] start_kernel at ffffffff81a8d057
#21 [ffffffff8193ff88] x86_64_start_reservations at ffffffff81a8c5ee
#22 [ffffffff8193ff98] x86_64_start_kernel at ffffffff81a8c742

PID: 0      TASK: ffff880449655080  CPU: 1   COMMAND: "swapper/1"
 #0 [ffff88045fc85e70] crash_nmi_callback at ffffffff810458f2
 #1 [ffff88045fc85e80] nmi_handle at ffffffff8163e339
 #2 [ffff88045fc85ec8] do_nmi at ffffffff8163e450
 #3 [ffff88045fc85ef0] end_repeat_nmi at ffffffff8163d771
    [exception RIP: intel_idle+215]
    RIP: ffffffff8135de17  RSP: ffff88044967fe10  RFLAGS: 00000046
    RAX: 0000000000000020  RBX: 0000000000000008  RCX: 0000000000000001
    RDX: 0000000000000000  RSI: ffff88044967ffd8  RDI: 000000000194a000
    RBP: ffff88044967fe40   R8: 000000000fa7d767   R9: 0000000000000018
    R10: 0000000000046cbf  R11: 000000000000001e  R12: ffff88044967ffd8
    R13: 0000000000000004  R14: 0000000000000020  R15: ffffffff819fdeb8
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
 #4 [ffff88044967fe10] intel_idle at ffffffff8135de17
 #5 [ffff88044967fe48] cpuidle_enter_state at ffffffff814d4540
 #6 [ffff88044967fe80] cpuidle_idle_call at ffffffff814d4699
 #7 [ffff88044967fec0] arch_cpu_idle at ffffffff8101e4be
 #8 [ffff88044967fed0] cpu_startup_entry at ffffffff810d6305
 #9 [ffff88044967ff28] start_secondary at ffffffff810475fa

PID: 572    TASK: ffff880443c25080  CPU: 2   COMMAND: "systemd-journal"
 #0 [ffff88045fd059c8] machine_kexec at ffffffff81051beb
 #1 [ffff88045fd05a28] crash_kexec at ffffffff810f2542
 #2 [ffff88045fd05af8] panic at ffffffff8162ea73
 #3 [ffff88045fd05b78] watchdog_overflow_callback at ffffffff8111b512
 #4 [ffff88045fd05b88] __perf_event_overflow at ffffffff8115ed71
 #5 [ffff88045fd05c00] perf_event_overflow at ffffffff8115f844
 #6 [ffff88045fd05c10] intel_pmu_handle_irq at ffffffff810325a8
 #7 [ffff88045fd05e60] perf_event_nmi_handler at ffffffff8163ebeb
 #8 [ffff88045fd05e80] nmi_handle at ffffffff8163e339
 #9 [ffff88045fd05ec8] do_nmi at ffffffff8163e4e9
#10 [ffff88045fd05ef0] end_repeat_nmi at ffffffff8163d771
    [exception RIP: get_monotonic_boottime+176]
    RIP: ffffffff810d8a20  RSP: ffff880447ec3f10  RFLAGS: 00000216
    RAX: 69cf8c6df0c1c509  RBX: ffffffff81965380  RCX: 0000000000000000
    RDX: 00000000850ed66b  RSI: ffffffc313319b93  RDI: ffffffffa8e13a9e
    RBP: ffff880447ec3f38   R8: 0000000000735f30   R9: 0000000000000000
    R10: 0000000000000002  R11: 0000000000000202  R12: 0000000000735e0f
    R13: 0000000000000000  R14: ffff880447ec3f58  R15: 000000008ffabd80
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
#11 [ffff880447ec3f10] get_monotonic_boottime at ffffffff810d8a20
#12 [ffff880447ec3f40] posix_get_boottime at ffffffff810a3d91
#13 [ffff880447ec3f50] sys_clock_gettime at ffffffff810a5464
#14 [ffff880447ec3f80] system_call_fastpath at ffffffff81645909
    RIP: 00007ffd0f3a87c2  RSP: 00007ffd0f2a0420  RFLAGS: 00000246
    RAX: 00000000000000e4  RBX: ffffffff81645909  RCX: 0000000000000000
    RDX: 0000000000000000  RSI: 00007ffd0f2a0400  RDI: 0000000000000007
    RBP: 00007ffd0f2a03e0   R8: 0000000000735f30   R9: 0000000000000000
    R10: 0000000000000002  R11: 0000000000000202  R12: 00007ffd0f2a0430
    R13: 00000000b52d055a  R14: ffffffffffffffff  R15: 0000000000000001
    ORIG_RAX: 00000000000000e4  CS: 0033  SS: 002b

PID: 0      TASK: ffff880449656780  CPU: 3   COMMAND: "swapper/3"
 #0 [ffff88045fd85e70] crash_nmi_callback at ffffffff810458f2
 #1 [ffff88045fd85e80] nmi_handle at ffffffff8163e339
 #2 [ffff88045fd85ec8] do_nmi at ffffffff8163e450
 #3 [ffff88045fd85ef0] end_repeat_nmi at ffffffff8163d771
    [exception RIP: intel_idle+215]
    RIP: ffffffff8135de17  RSP: ffff88044968be10  RFLAGS: 00000046
    RAX: 0000000000000020  RBX: 0000000000000008  RCX: 0000000000000001
    RDX: 0000000000000000  RSI: ffff88044968bfd8  RDI: 0000000000000003
    RBP: ffff88044968be40   R8: 0000000020c49b8f   R9: 000000000000001c
    R10: 0000000010708b19  R11: 000000000000001e  R12: ffff88044968bfd8
    R13: 0000000000000004  R14: 0000000000000020  R15: ffffffff819fdeb8
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
 #4 [ffff88044968be10] intel_idle at ffffffff8135de17
 #5 [ffff88044968be48] cpuidle_enter_state at ffffffff814d4540
 #6 [ffff88044968be80] cpuidle_idle_call at ffffffff814d4699
 #7 [ffff88044968bec0] arch_cpu_idle at ffffffff8101e4be
 #8 [ffff88044968bed0] cpu_startup_entry at ffffffff810d6305
 #9 [ffff88044968bf28] start_secondary at ffffffff810475fa

Environment

  • Red Hat Enterprise Linux 7.2 (kernel-3.10.0-327.el7)
  • Using the in-kernel (in-box) nouveau driver

Subscriber exclusive content

A Red Hat subscription provides unlimited access to our knowledgebase, tools, and much more.

Current Customers and Partners

Log in for full access

Log In

New to Red Hat?

Learn more about Red Hat subscriptions

Using a Red Hat product through a public cloud?

How to access this content