Local Filesystem Deadlock while a memory allocation wait on page writeback
Environment
Seen on:
- Red Hat Enterprise Linux 7.2
- Local Filesystems (XFS, Ext4)
- Kernel 3.10.0-327.36.3.el7
Issue
- Local Filesystem facing a deadlock while trying to allocate pages to extent mapping on write operation when the caller application is running under a CGroup;
Resolution
- Upgrade to kernel version kernel-3.10.0-693.el7 or later which has the fix for the issue.
- The issue was addressed in Bug 1433103 via following errata RHSA-2017:1842
Root Cause
Local Filesystems like XFS and EXT4 doesn't submit to I/O pages marked as PG_writeback right away it happens, instead of it tries to add the max number of pages as possible to the same I/O request before really submit it. When trying to allocate memory the filesystem may trigger memcg charge which might decide to reclaim memory pages and ending up waiting on a page which is marked as PG_writeback but it's part of this delayed allocation scheme leading the system to a deadlock.
The following backtrack may be observed for the hunging task:
#0 [ffff88467dbd3200] __schedule+0x28d at ffffffff8163b49d
#1 [ffff88467dbd3268] schedule+0x29 at ffffffff8163bb39
#2 [ffff88467dbd3278] schedule_timeout+0x209 at ffffffff81639829
#3 [ffff88467dbd3328] io_schedule_timeout+0xae at ffffffff8163b16e
#4 [ffff88467dbd3358] io_schedule+0x18 at ffffffff8163b208
#5 [ffff88467dbd3368] sleep_on_page+0xe at ffffffff81168d1e
#6 [ffff88467dbd3378] __wait_on_bit+0x60 at ffffffff816399b0
#7 [ffff88467dbd33b8] wait_on_page_bit+0x86 at ffffffff81168aa6
#8 [ffff88467dbd3410] shrink_page_list+0x6c2 at ffffffff8117d8b2
#9 [ffff88467dbd3548] shrink_inactive_list+0x1ea at ffffffff8117e37a
#10 [ffff88467dbd3610] shrink_lruvec+0x375 at ffffffff8117ee45
#11 [ffff88467dbd3710] shrink_zone+0x76 at ffffffff8117f2a6
#12 [ffff88467dbd3768] do_try_to_free_pages+0xf0 at ffffffff8117f7b0
#13 [ffff88467dbd37e0] try_to_free_mem_cgroup_pages+0xca at ffffffff8117fdea
#14 [ffff88467dbd3878] mem_cgroup_reclaim+0x4e at ffffffff811d27ce
#15 [ffff88467dbd38b8] __mem_cgroup_try_charge+0x42c at ffffffff811d2c8c
#16 [ffff88467dbd3978] mem_cgroup_charge_common+0x59 at ffffffff811d35e9
#17 [ffff88467dbd39b0] mem_cgroup_cache_charge+0x8a at ffffffff811d57da
#18 [ffff88467dbd39e0] __add_to_page_cache_locked+0x52 at ffffffff81169932
#19 [ffff88467dbd3a38] add_to_page_cache_lru+0x37 at ffffffff81169b97
#20 [ffff88467dbd3a68] find_or_create_page+0x5e at ffffffff81169f1e
#21 [ffff88467dbd3aa8] __getblk+0xf8 at ffffffff812135b8
#22 [ffff88467dbd3b08] __read_extent_tree_block+0x41 at ffffffffa05144e1 [ext4]
#23 [ffff88467dbd3b60] ext4_ext_find_extent+0x12c at ffffffffa051557c [ext4]
#24 [ffff88467dbd3bc0] ext4_ext_map_blocks+0x81 at ffffffffa0519581 [ext4]
#25 [ffff88467dbd3ca8] ext4_map_blocks+0x179 at ffffffffa04e7cf9 [ext4]
#26 [ffff88467dbd3d20] ext4_writepages+0x692 at ffffffffa04eb222 [ext4]
#27 [ffff88467dbd3e58] do_writepages+0x1e at ffffffff81175d3e
#28 [ffff88467dbd3e68] __filemap_fdatawrite_range+0x65 at ffffffff8116aad5
#29 [ffff88467dbd3eb8] filemap_write_and_wait_range+0x2a at ffffffff8116abea
#30 [ffff88467dbd3ee8] ext4_sync_file+0xba at ffffffffa04e27aa [ext4]
#31 [ffff88467dbd3f30] do_fsync+0x65 at ffffffff81210185
#32 [ffff88467dbd3f70] sys_fdatasync+0x13 at ffffffff81210473
#33 [ffff88467dbd3f80] tracesys+0xdd at ffffffff81646d52 (via system_call)
RIP: 00007fd0debd7ddd RSP: 00007fd0dddfde20 RFLAGS: 00000246
RAX: ffffffffffffffda RBX: ffffffff81646d52 RCX: ffffffffffffffff
RDX: 00000000007220a8 RSI: 0000000000000000 RDI: 0000000000000021
RBP: 00007fd0d44664a8 R8: 0000000000000000 R9: 0000000000000010
R10: 0000000000000000 R11: 0000000000000246 R12: ffffffff81210473
R13: ffff88467dbd3f78 R14: 00000000007220a8 R15: 00007fd0d1bff160
ORIG_RAX: 000000000000004b CS: 0033 SS: 002b
Diagnostic Steps
Let's check on which page kernel is waiting at:
684 void wait_on_page_bit(struct page *page, int bit_nr)
685 {
0xffffffff81168a20 <wait_on_page_bit>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffff81168a25 <wait_on_page_bit+5>: push %rbp
0xffffffff81168a26 <wait_on_page_bit+6>: mov %rdi,%r8
0xffffffff81168a2e <wait_on_page_bit+14>: mov %rsp,%rbp
0xffffffff81168a31 <wait_on_page_bit+17>: and $0xfffffffffffffff0,%rsp
0xffffffff81168a35 <wait_on_page_bit+21>: sub $0x40,%rsp
0xffffffff81168a39 <wait_on_page_bit+25>: mov %gs:0x28,%rax
0xffffffff81168a42 <wait_on_page_bit+34>: mov %rax,0x38(%rsp)
0xffffffff81168a47 <wait_on_page_bit+39>: xor %eax,%eax
0xffffffff81168a29 <wait_on_page_bit+9>: mov $0x7,%ecx
0xffffffff81168a49 <wait_on_page_bit+41>: mov %rsp,%rdi
0xffffffff81168a4c <wait_on_page_bit+44>: rep stos %rax,%es:(%rdi)
0xffffffff81168a58 <wait_on_page_bit+56>: mov %rax,0x18(%rsp)
0xffffffff81168a5d <wait_on_page_bit+61>: lea 0x28(%rsp),%rax
0xffffffff81168a62 <wait_on_page_bit+66>: mov %r8,(%rsp)
0xffffffff81168a66 <wait_on_page_bit+70>: mov %esi,0x8(%rsp)
0xffffffff81168a6a <wait_on_page_bit+74>: movq $0xffffffff810a6c00,0x20(%rsp)
0xffffffff81168a73 <wait_on_page_bit+83>: mov %rax,0x28(%rsp)
0xffffffff81168a78 <wait_on_page_bit+88>: mov %rax,0x30(%rsp)
686 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
687
0xffffffff81168a7d <wait_on_page_bit+93>: bt %esi,(%r8) --------------------------------> %r8 = page
0xffffffff81168a81 <wait_on_page_bit+97>: sbb %esi,%es
688 if (test_bit(bit_nr, &page->flags))
689 __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
690 TASK_UNINTERRUPTIBLE);
691 }
692 EXPORT_SYMBOL(wait_on_page_bit);
When calling wait_on_page_bit:
/usr/src/debug/kernel-3.10.0-327.36.3.el7/linux-3.10.0-327.36.3.el7.x86_64/include/linux/pagemap.h: 432
0xffffffff8117d89e <shrink_page_list+1710>: mov $0xd,%esi
0xffffffff8117d8a3 <shrink_page_list+1715>: mov %r14,%rdi --------------------------------------------> %r14 has page
0xffffffff8117d8a6 <shrink_page_list+1718>: mov %r8,-0xa0(%rbp)
0xffffffff8117d8ad <shrink_page_list+1725>: callq 0xffffffff81168a20 <wait_on_page_bit>
0xffffffff8117d8b2 <shrink_page_list+1730>: mov -0xa0(%rbp),%r8
as it doesn't touch %r14 we check the stack for __wait_on_bit:
crash> dis -rl ffffffff816399b0
/usr/src/debug/kernel-3.10.0-327.36.3.el7/linux-3.10.0-327.36.3.el7.x86_64/kernel/wait.c: 196
0xffffffff81639950 <__wait_on_bit>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffff81639955 <__wait_on_bit+5>: push %rbp
0xffffffff81639956 <__wait_on_bit+6>: mov %rsp,%rbp
0xffffffff81639959 <__wait_on_bit+9>: push %r15
0xffffffff8163995b <__wait_on_bit+11>: mov %rsi,%r15
0xffffffff8163995e <__wait_on_bit+14>: push %r14 --------------------------------------> %r14 at 3rd push
0xffffffff81639960 <__wait_on_bit+16>: mov %rdx,%r14
0xffffffff81639963 <__wait_on_bit+19>: push %r13
0xffffffff81639965 <__wait_on_bit+21>: mov %ecx,%r13d
0xffffffff81639968 <__wait_on_bit+24>: push %r12
0xffffffff8163996a <__wait_on_bit+26>: mov %rdi,%r12
0xffffffff8163996d <__wait_on_bit+29>: push %rbx
0xffffffff8163996e <__wait_on_bit+30>: lea 0x10(%rsi),%rbx
0xffffffff81639972 <__wait_on_bit+34>: sub $0x8,%rsp
0xffffffff81639976 <__wait_on_bit+38>: jmp 0xffffffff8163998d <__wait_on_bit+61>
0xffffffff81639978 <__wait_on_bit+40>: nopl 0x0(%rax,%rax,1)
crash> kmem ffffea00ad83b040
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
ffffea00ad83b040 2b60ec1000 ffff8871bda0a2c0 1e3a4 3 2fffff00042809 locked,uptodate,private,writeback,reclaim
crash> page.mapping ffffea00ad83b040
mapping = 0xffff8871bda0a2c0
crash> address_space.host 0xffff8871bda0a2c0
host = 0xffff8871bda0a170
The inode associated with the address_space mapping where this page is mapped is the same ext4 is trying to write:
88 int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
89 {
0xffffffffa04e26f0 <ext4_sync_file>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffffa04e26f5 <ext4_sync_file+5>: push %rbp
0xffffffffa04e26f6 <ext4_sync_file+6>: mov %rsp,%rbp
0xffffffffa04e26f9 <ext4_sync_file+9>: push %r15
0xffffffffa04e26fb <ext4_sync_file+11>: push %r14
0xffffffffa04e26fd <ext4_sync_file+13>: push %r13
0xffffffffa04e26ff <ext4_sync_file+15>: push %r12
0xffffffffa04e2701 <ext4_sync_file+17>: mov %rdi,%r12 ---------------------------------------> file was mov from %rdi to %r12
0xffffffffa04e2704 <ext4_sync_file+20>: push %rbx
0xffffffffa04e2705 <ext4_sync_file+21>: sub $0x10,%rsp
0xffffffffa04e2710 <ext4_sync_file+32>: mov %rdx,-0x38(%rbp)
0xffffffffa04e2714 <ext4_sync_file+36>: mov %rsi,-0x30(%rbp)
0xffffffffa04e2709 <ext4_sync_file+25>: mov 0xd0(%rdi),%rax ----------------------------------> file is at %rdi
0xffffffffa04e2718 <ext4_sync_file+40>: mov (%rax),%rbx --------------------------------------> inode is at %rbx
90 struct inode *inode = file->f_mapping->host;
91 struct ext4_inode_info *ei = EXT4_I(inode);
0xffffffffa04e271b <ext4_sync_file+43>: mov 0x28(%rbx),%rax
0xffffffffa04e271f <ext4_sync_file+47>: mov 0x350(%rax),%rdx
0xffffffffa04e2726 <ext4_sync_file+54>: mov 0x1f8(%rdx),%r14
92 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
93 int ret = 0, err;
94 tid_t commit_tid;
95 bool needs_barrier = false;
96
0xffffffffa04e2736 <ext4_sync_file+70>: cmpq $0x0,0x820(%rdx)
0xffffffffa04e273e <ext4_sync_file+78>: jne 0xffffffffa04e2a06 <ext4_sync_file+790>
0xffffffffa04e2744 <ext4_sync_file+84>: mov %ecx,%r13d
0xffffffffa04e2747 <ext4_sync_file+87>: nopl 0x0(%rax,%rax,1)
97 J_ASSERT(ext4_journal_current_handle() == NULL);
98
99 trace_ext4_sync_file_enter(file, datasync);
100
0xffffffffa04e274c <ext4_sync_file+92>: testb $0x1,0x50(%rax)
0xffffffffa04e2750 <ext4_sync_file+96>: je 0xffffffffa04e2790 <ext4_sync_file+160>
101 if (inode->i_sb->s_flags & MS_RDONLY) {
102 /* Make sure that we read updated s_mount_flags value */
0xffffffffa04e2752 <ext4_sync_file+98>: mov 0x28(%rbx),%rax
103 smp_rmb();
0xffffffffa04e275c <ext4_sync_file+108>: mov 0x350(%rax),%rax
0xffffffffa04e2763 <ext4_sync_file+115>: testb $0x2,0x80(%rax)
0xffffffffa04e276a <ext4_sync_file+122>: je 0xffffffffa04e2850 <ext4_sync_file+352>
0xffffffffa04e2770 <ext4_sync_file+128>: nopl 0x0(%rax,%rax,1
104 if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
0xffffffffa04e2756 <ext4_sync_file+102>: mov $0xffffffe2,%r13d
105 ret = -EROFS;
106 goto out;
107 }
108
0xffffffffa04e2790 <ext4_sync_file+160>: test %r14,%r14
0xffffffffa04e2793 <ext4_sync_file+163>: je 0xffffffffa04e2860 <ext4_sync_file+368>
109 if (!journal) {
110 ret = generic_file_fsync(file, start, end, datasync);
111 if (!ret && !hlist_empty(&inode->i_dentry))
112 ret = ext4_sync_parent(inode);
113 goto out;
114 }
115
0xffffffffa04e2799 <ext4_sync_file+169>: mov 0x30(%rbx),%rdi
0xffffffffa04e279d <ext4_sync_file+173>: mov -0x38(%rbp),%rdx
0xffffffffa04e27a1 <ext4_sync_file+177>: mov -0x30(%rbp),%rsi
0xffffffffa04e27a5 <ext4_sync_file+181>: callq 0xffffffff8116abc0 <filemap_write_and_wait_range>
116 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
117 if (ret)
118 return ret;
Let's recover the inode and file from stack (%rbx and %r12):
crash> dis -rl ffffffff8116abea | head
/usr/src/debug/kernel-3.10.0-327.36.3.el7/linux-3.10.0-327.36.3.el7.x86_64/mm/filemap.c: 437
0xffffffff8116abc0 <filemap_write_and_wait_range>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffff8116abc5 <filemap_write_and_wait_range+5>: push %rbp
0xffffffff8116abc6 <filemap_write_and_wait_range+6>: mov %rsp,%rbp
0xffffffff8116abc9 <filemap_write_and_wait_range+9>: push %r14
0xffffffff8116abcb <filemap_write_and_wait_range+11>: push %r13
0xffffffff8116abcd <filemap_write_and_wait_range+13>: push %r12 -------------------------------> %r12 is at 4th push
0xffffffff8116abcf <filemap_write_and_wait_range+15>: mov %rdi,%r12
0xffffffff8116abd2 <filemap_write_and_wait_range+18>: push %rbx -------------------------------> %rbx is at 5th push
/usr/src/debug/kernel-3.10.0-327.36.3.el7/linux-3.10.0-327.36.3.el7.x86_64/mm/filemap.c: 440
We have:
crash> p/x 0xffff88467dbd3ee8 - 8 * 5
$3 = 0xffff88467dbd3ec0
crash> rd 0xffff88467dbd3ec0
ffff88467dbd3ec0: ffff8871bda0a170 p...q...
crash> kmem ffff8871bda0a170
CACHE NAME OBJSIZE ALLOCATED TOTAL SLABS SSIZE
ffff885d3b883200 ext4_inode_cache 1040 18008 19871 641 32k
SLAB MEMORY NODE TOTAL ALLOCATED FREE
ffffea01c6f68200 ffff8871bda08000 2 31 29 2
FREE / [ALLOCATED]
[ffff8871bda0a0c0]
As ext4_writepages is waiting the memory allocation ends to complete the bio before send the I/O request the memory reclaim code will keep
waiting indefinitely creating a deadlock situation
This solution is part of Red Hat’s fast-track publication program, providing a huge library of solutions that Red Hat engineers have created while supporting our customers. To give you the knowledge you need the instant it becomes available, these articles may be presented in a raw and unedited form.
Welcome! Check out the Getting Started with Red Hat page for quick tours and guides for common tasks.
