Ceph において、QEMU または KVM で稼動している RHEL6.5 が mkfs を断続的にハングする
Issue
RHOS にホストされる仮想マシンで RHEL6.5 を実行しています。仮想マシン (/dev/vdb や /dev/vdc) にボリュームを添付すると、'mkfs' を実行した時にマシンがハングします。
システムから、プロセスのコールトレースと、以下に示すカーネルメッセージを含む情報を取得できます。さらに、ハングが発生するまではディスク IO を確認できますが、その後は (root ディスク上でも) ディスク IO がありません。
以下は、デバッグ情報となります。
SysRq :Show Blocked State
task PC stack pid father
flush-253:32 D 0000000000000000 0 1132 2 0x00000080
ffff88005ed6b6e0 0000000000000046 00000000ffffffff 0000031b5d59a193
0000000000000001 ffff88007cc9eb40 000000000003e2fc ffffffffac045d71
ffff88007db1c5f8 ffff88005ed6bfd8 000000000000fbc8 ffff88007db1c5f8
Call Trace:
[<ffffffff810a6d01>] ? ktime_get_ts+0xb1/0xf0
[<ffffffff81528383>] io_schedule+0x73/0xc0
[<ffffffff81267858>] get_request_wait+0x108/0x1d0
[<ffffffff8109af00>] ? autoremove_wake_function+0x0/0x40
[<ffffffff812679b9>] blk_queue_bio+0x99/0x620
[<ffffffff81266a40>] generic_make_request+0x240/0x5a0
[<ffffffff81121d45>] ? mempool_alloc_slab+0x15/0x20
[<ffffffff81266e10>] submit_bio+0x70/0x120
[<ffffffff811bd99d>] submit_bh+0x11d/0x1f0
[<ffffffff811c0158>] __block_write_full_page+0x1c8/0x330
[<ffffffff8127bae0>] ? cfq_kick_queue+0x0/0x60
[<ffffffff811bf120>] ? end_buffer_async_write+0x0/0x190
[<ffffffff811c3fc0>] ? blkdev_get_block+0x0/0x20
[<ffffffff811c3fc0>] ? blkdev_get_block+0x0/0x20
[<ffffffff811c03a0>] block_write_full_page_endio+0xe0/0x120
[<ffffffff8111f040>] ? find_get_pages_tag+0x40/0x130
[<ffffffff811c03f5>] block_write_full_page+0x15/0x20
[<ffffffff811c5158>] blkdev_writepage+0x18/0x20
[<ffffffff81133337>] __writepage+0x17/0x40
[<ffffffff811345fd>] write_cache_pages+0x1fd/0x4c0
[<ffffffff8105af80>] ?__dequeue_entity+0x30/0x50
[<ffffffff81133320>] ?__writepage+0x0/0x40
[<ffffffff811348e4>] generic_writepages+0x24/0x30
[<ffffffff81134911>] do_writepages+0x21/0x40
[<ffffffff811b4c8d>] writeback_single_inode+0xdd/0x290
[<ffffffff811b508d>] writeback_sb_inodes+0xbd/0x170
[<ffffffff811b51eb>] writeback_inodes_wb+0xab/0x1b0
[<ffffffff811b55e3>] wb_writeback+0x2f3/0x410
[<ffffffff81527bfe>] ? thread_return+0x4e/0x760
[<ffffffff811b58a5>] wb_do_writeback+0x1a5/0x240
[<ffffffff811b59a3>] bdi_writeback_task+0x63/0x1b0
[<ffffffff8109ad87>] ? bit_waitqueue+0x17/0xd0
[<ffffffff81143590>] ? bdi_start_fn+0x0/0x100
[<ffffffff81143616>] bdi_start_fn+0x86/0x100
[<ffffffff81143590>] ? bdi_start_fn+0x0/0x100
[<ffffffff8109ab56>] kthread+0x96/0xa0
[<ffffffff8100c20a>] child_rip+0xa/0x20
[<ffffffff8109aac0>] ? kthread+0x0/0xa0
[<ffffffff8100c200>] ? child_rip+0x0/0x20
mkfs.ext4 D 0000000000000000 0 1144 1100 0x00000084
ffff88004c0ff578 0000000000000046 00000000ffffffff 0000031b5d59a193
0000000000040001 ffff88007cc9e520 000000000003e3fe ffffffffac045d71
ffff880037a69098 ffff88004c0fffd8 000000000000fbc8 ffff880037a69098
Call Trace:
[<ffffffff810a6d01>] ? ktime_get_ts+0xb1/0xf0
[<ffffffff81528383>] io_schedule+0x73/0xc0
[<ffffffff81267858>] get_request_wait+0x108/0x1d0
[<ffffffff8109af00>] ? autoremove_wake_function+0x0/0x40
[<ffffffff812679b9>] blk_queue_bio+0x99/0x620
[<ffffffff81266a40>] generic_make_request+0x240/0x5a0
[<ffffffff81121d45>] ? mempool_alloc_slab+0x15/0x20
[<ffffffff811c31e3>] ? bio_init+0x13/0x30
[<ffffffff81266e10>] submit_bio+0x70/0x120
[<ffffffff811bd99d>] submit_bh+0x11d/0x1f0
[<ffffffff811c0158>] __block_write_full_page+0x1c8/0x330
[<ffffffff8100bb8e>] ? apic_timer_interrupt+0xe/0x20
[<ffffffff811bf120>] ? end_buffer_async_write+0x0/0x190
[<ffffffff811c3fc0>] ? blkdev_get_block+0x0/0x20
[<ffffffff811c3fc0>] ? blkdev_get_block+0x0/0x20
[<ffffffff811c03a0>] block_write_full_page_endio+0xe0/0x120
[<ffffffff8111f040>] ? find_get_pages_tag+0x40/0x130
[<ffffffff811c03f5>] block_write_full_page+0x15/0x20
[<ffffffff811c5158>] blkdev_writepage+0x18/0x20
[<ffffffff81133337>] __writepage+0x17/0x40
[<ffffffff811345fd>] write_cache_pages+0x1fd/0x4c0
[<ffffffff81527bfe>] ? thread_return+0x4e/0x760
[<ffffffff81133320>] ?__writepage+0x0/0x40
[<ffffffff811348e4>] generic_writepages+0x24/0x30
[<ffffffff81134911>] do_writepages+0x21/0x40
[<ffffffff8111fd7b>] __filemap_fdatawrite_range+0x5b/0x60
[<ffffffff811202bf>] filemap_fdatawrite+0x1f/0x30
[<ffffffff81120305>] filemap_write_and_wait+0x35/0x60
[<ffffffff811c54e4>] __sync_blockdev+0x24/0x50
[<ffffffff811c5523>] sync_blockdev+0x13/0x20
[<ffffffff811c56a8>] __blkdev_put+0x178/0x1b0
[<ffffffff811c56f0>] blkdev_put+0x10/0x20
[<ffffffff811c5733>] blkdev_close+0x33/0x60
[<ffffffff8118a3e5>] __fput+0xf5/0x210
[<ffffffff8118a525>] fput+0x25/0x30
[<ffffffff8118584d>] filp_close+0x5d/0x90
[<ffffffff81074a7f>] put_files_struct+0x7f/0xf0
[<ffffffff81074b43>] exit_files+0x53/0x70
[<ffffffff81076bbd>] do_exit+0x18d/0x870
[<ffffffff810772f8>] do_group_exit+0x58/0xd0
[<ffffffff8108cca6>] get_signal_to_deliver+0x1f6/0x460
[<ffffffff8100a265>] do_signal+0x75/0x800
[<ffffffff81333413>] ? tty_write+0x233/0x2a0
[<ffffffff81188cb2>] ? vfs_write+0x132/0x1a0
[<ffffffff8100aa80>] do_notify_resume+0x90/0xc0
[<ffffffff8100b341>] int_signal+0x12/0x17
Sched Debug Version: v0.09, 2.6.32-431.17.1.el6.x86_64 #1
now at 1030726.634816 msecs
.jiffies :4295697947
.sysctl_sched_latency :5.000000
.sysctl_sched_min_granularity :1.000000
.sysctl_sched_wakeup_granularity :1.000000
.sysctl_sched_child_runs_first :0.000000
.sysctl_sched_features :3183
.sysctl_sched_tunable_scaling :1 (logaritmic)
cpu#0, 3324.996 MHz
.nr_running :1
.load :1024
.nr_switches :126743
.nr_load_updates :103087
.nr_uninterruptible :2
.next_balance :4295.698946
.curr->pid :1074
.clock :1030651.103597
.cpu_load[0] :0
.cpu_load[1] :256
.cpu_load[2] :192
.cpu_load[3] :112
.cpu_load[4] :60
.yld_count :0
.sched_switch :0
.sched_count :127917
.sched_goidle :42293
.avg_idle :1000000
.ttwu_count :66009
.ttwu_local :66009
.bkl_count :2988
cfs_rq[0]:/
.exec_clock :92211.891372
.MIN_vruntime :0.000001
.min_vruntime :62788.810918
.max_vruntime :0.000001
.spread :0.000000
.spread0 :0.000000
.nr_spread_over :10
.nr_running :1
.load :1024
.tg->cfs_bandwidth.timer_active:0
.throttled :0
.throttle_count :0
.load_avg :0.000000
.load_period :0.000000
.load_contrib :0
.load_tg :0
rt_rq[0]:/
.rt_nr_running :0
.rt_throttled :0
.rt_time :0.000000
.rt_runtime :950.000000
runnable tasks:
task PID tree-key switches prio exec-runtime sum-exec sum-sleep
----------------------------------------------------------------------------------------------------------
R bash 1074 62786.852903 52 120 62786.852903 148.361328 980142.582470 /
-------------------------------------
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
flush-253:32 D 0000000000000000 0 1132 2 0x00000080
ffff88005ed6b6e0 0000000000000046 00000000ffffffff 0000031b5d59a193
0000000000000001 ffff88007cc9eb40 000000000003e2fc ffffffffac045d71
ffff88007db1c5f8 ffff88005ed6bfd8 000000000000fbc8 ffff88007db1c5f8
Call Trace:
[<ffffffff810a6d01>] ? ktime_get_ts+0xb1/0xf0
[<ffffffff81528383>] io_schedule+0x73/0xc0
[<ffffffff81267858>] get_request_wait+0x108/0x1d0
[<ffffffff8109af00>] ? autoremove_wake_function+0x0/0x40
[<ffffffff812679b9>] blk_queue_bio+0x99/0x620
[<ffffffff81266a40>] generic_make_request+0x240/0x5a0
[<ffffffff81121d45>] ? mempool_alloc_slab+0x15/0x20
[<ffffffff81266e10>] submit_bio+0x70/0x120
[<ffffffff811bd99d>] submit_bh+0x11d/0x1f0
[<ffffffff811c0158>] __block_write_full_page+0x1c8/0x330
[<ffffffff8127bae0>] ? cfq_kick_queue+0x0/0x60
[<ffffffff811bf120>] ? end_buffer_async_write+0x0/0x190
[<ffffffff811c3fc0>] ? blkdev_get_block+0x0/0x20
[<ffffffff811c3fc0>] ? blkdev_get_block+0x0/0x20
[<ffffffff811c03a0>] block_write_full_page_endio+0xe0/0x120
[<ffffffff8111f040>] ? find_get_pages_tag+0x40/0x130
[<ffffffff811c03f5>] block_write_full_page+0x15/0x20
[<ffffffff811c5158>] blkdev_writepage+0x18/0x20
[<ffffffff81133337>] __writepage+0x17/0x40
[<ffffffff811345fd>] write_cache_pages+0x1fd/0x4c0
[<ffffffff8105af80>] ?__dequeue_entity+0x30/0x50
[<ffffffff81133320>] ?__writepage+0x0/0x40
[<ffffffff811348e4>] generic_writepages+0x24/0x30
[<ffffffff81134911>] do_writepages+0x21/0x40
[<ffffffff811b4c8d>] writeback_single_inode+0xdd/0x290
[<ffffffff811b508d>] writeback_sb_inodes+0xbd/0x170
[<ffffffff811b51eb>] writeback_inodes_wb+0xab/0x1b0
[<ffffffff811b55e3>] wb_writeback+0x2f3/0x410
[<ffffffff81527bfe>] ? thread_return+0x4e/0x760
[<ffffffff811b58a5>] wb_do_writeback+0x1a5/0x240
[<ffffffff811b59a3>] bdi_writeback_task+0x63/0x1b0
[<ffffffff8109ad87>] ? bit_waitqueue+0x17/0xd0
[<ffffffff81143590>] ? bdi_start_fn+0x0/0x100
[<ffffffff81143616>] bdi_start_fn+0x86/0x100
[<ffffffff81143590>] ? bdi_start_fn+0x0/0x100
[<ffffffff8109ab56>] kthread+0x96/0xa0
[<ffffffff8100c20a>] child_rip+0xa/0x20
[<ffffffff8109aac0>] ? kthread+0x0/0xa0
[<ffffffff8100c200>] ? child_rip+0x0/0x20
INFO: task mkfs.ext4:1144 blocked for more than 120 seconds.
Not tainted 2.6.32-431.17.1.el6.x86_64 #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
mkfs.ext4 D 0000000000000000 0 1144 1100 0x00000084
ffff88004c0ff578 0000000000000046 00000000ffffffff 0000031b5d59a193
0000000000040001 ffff88007cc9e520 000000000003e3fe ffffffffac045d71
ffff880037a69098 ffff88004c0fffd8 000000000000fbc8 ffff880037a69098
Call Trace:
[<ffffffff810a6d01>] ? ktime_get_ts+0xb1/0xf0
[<ffffffff81528383>] io_schedule+0x73/0xc0
[<ffffffff81267858>] get_request_wait+0x108/0x1d0
[<ffffffff8109af00>] ? autoremove_wake_function+0x0/0x40
[<ffffffff812679b9>] blk_queue_bio+0x99/0x620
[<ffffffff81266a40>] generic_make_request+0x240/0x5a0
[<ffffffff81121d45>] ? mempool_alloc_slab+0x15/0x20
[<ffffffff811c31e3>] ? bio_init+0x13/0x30
[<ffffffff81266e10>] submit_bio+0x70/0x120
[<ffffffff811bd99d>] submit_bh+0x11d/0x1f0
[<ffffffff811c0158>] __block_write_full_page+0x1c8/0x330
[<ffffffff8100bb8e>] ? apic_timer_interrupt+0xe/0x20
[<ffffffff811bf120>] ? end_buffer_async_write+0x0/0x190
[<ffffffff811c3fc0>] ? blkdev_get_block+0x0/0x20
[<ffffffff811c3fc0>] ? blkdev_get_block+0x0/0x20
[<ffffffff811c03a0>] block_write_full_page_endio+0xe0/0x120
[<ffffffff8111f040>] ? find_get_pages_tag+0x40/0x130
[<ffffffff811c03f5>] block_write_full_page+0x15/0x20
[<ffffffff811c5158>] blkdev_writepage+0x18/0x20
[<ffffffff81133337>] __writepage+0x17/0x40
[<ffffffff811345fd>] write_cache_pages+0x1fd/0x4c0
[<ffffffff81527bfe>] ? thread_return+0x4e/0x760
[<ffffffff81133320>] ?__writepage+0x0/0x40
[<ffffffff811348e4>] generic_writepages+0x24/0x30
[<ffffffff81134911>] do_writepages+0x21/0x40
[<ffffffff8111fd7b>] __filemap_fdatawrite_range+0x5b/0x60
[<ffffffff811202bf>] filemap_fdatawrite+0x1f/0x30
[<ffffffff81120305>] filemap_write_and_wait+0x35/0x60
[<ffffffff811c54e4>] __sync_blockdev+0x24/0x50
[<ffffffff811c5523>] sync_blockdev+0x13/0x20
[<ffffffff811c56a8>] __blkdev_put+0x178/0x1b0
[<ffffffff811c56f0>] blkdev_put+0x10/0x20
[<ffffffff811c5733>] blkdev_close+0x33/0x60
[<ffffffff8118a3e5>] __fput+0xf5/0x210
[<ffffffff8118a525>] fput+0x25/0x30
[<ffffffff8118584d>] filp_close+0x5d/0x90
[<ffffffff81074a7f>] put_files_struct+0x7f/0xf0
[<ffffffff81074b43>] exit_files+0x53/0x70
[<ffffffff81076bbd>] do_exit+0x18d/0x870
[<ffffffff810772f8>] do_group_exit+0x58/0xd0
[<ffffffff8108cca6>] get_signal_to_deliver+0x1f6/0x460
[<ffffffff8100a265>] do_signal+0x75/0x800
[<ffffffff81333413>] ? tty_write+0x233/0x2a0
[<ffffffff81188cb2>] ? vfs_write+0x132/0x1a0
[<ffffffff8100aa80>] do_notify_resume+0x90/0xc0
[<ffffffff8100b341>] int_signal+0x12/0x17
Environment
- Red Hat Enterprise Linux 6.5
- ceph rdb ボリュームを使用する qemu-kvm
Subscriber exclusive content
A Red Hat subscription provides unlimited access to our knowledgebase of over 48,000 articles and solutions.
Welcome! Check out the Getting Started with Red Hat page for quick tours and guides for common tasks.
