Ceph: MDS pods stuck in CrashLoopBackOff (CLBO) / assert in MDCache::add_inode
Issue
Ceph MDS pods stuck in CrashLoopBackOff (CLBO) / assert in MDCache::add_inode
Below are two examples of this issue: a gdb backtrace from a core dump, and the corresponding assertion failure as it appears in the MDS pod log.
Example 1:
(gdb) bt
#0 0x00007fde6f1b9abf in raise () from /lib64/libpthread.so.0
#1 0x000055cfed3673e3 in reraise_fatal (signum=6) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/global/signal_handler.cc:326
#2 handle_fatal_signal (signum=6) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/global/signal_handler.cc:326
#3 <signal handler called>
#4 0x00007fde6dc0137f in raise () from /lib64/libc.so.6
#5 0x00007fde6dbebdb5 in abort () from /lib64/libc.so.6
#6 0x00007fde713cf359 in ceph::__ceph_assert_fail(char const*, char const*, int, char const*) () from /usr/lib64/ceph/libceph-common.so.0
#7 0x00007fde713cf522 in ceph::__ceph_assert_fail(ceph::assert_data const&) () from /usr/lib64/ceph/libceph-common.so.0
#8 0x000055cfed0fb45e in MDCache::add_inode (this=0x55cff0756000, in=<optimized out>, in@entry=0x55cffb4b0000) at /usr/include/c++/8/array:185
#9 0x000055cfed086960 in Server::prepare_new_inode (this=0x55cfefa0d040, mdr=..., dir=<optimized out>, useino=..., mode=<optimized out>, layout=<optimized out>) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/Server.cc:3250
#10 0x000055cfed097a99 in Server::handle_client_openc (this=0x55cfefa0d040, mdr=...) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/CDentry.h:151
#11 0x000055cfed0c230b in Server::dispatch_client_request (this=0x55cfefa0d040, mdr=...) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/Server.cc:2439
#12 0x000055cfed0c29a2 in Server::handle_client_request (this=0x55cfefa0d040, req=...) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/Server.cc:2311
#13 0x000055cfed0cf44a in Server::dispatch (this=0x55cfefa0d040, m=...) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/common/RefCountedObj.h:171
#14 0x000055cfed038344 in MDSRank::handle_deferrable_message (this=0x55cff0754808, m=...) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/MDSRank.cc:1192
#15 0x000055cfed03a75f in MDSRank::_dispatch (this=0x55cff0754808, m=..., new_msg=<optimized out>) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/MDSRank.cc:1041
#16 0x000055cfed03ad66 in MDSRank::retry_dispatch (this=0x55cff0754808, m=...) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/MDSRank.cc:1478
#17 0x000055cfed2dc5df in Context::complete (r=0, this=0x55cff5de2fc0) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/include/Context.h:77
#18 MDSContext::complete (this=0x55cff5de2fc0, r=0) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/MDSContext.cc:29
#19 0x000055cfed030805 in finish_contexts<std::vector<MDSContext*, std::allocator<MDSContext*> > > (cct=0x55cfefa42000, finished=..., result=0) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/log/SubsystemMap.h:72
#20 0x000055cfed040c9a in MDSRank::active_start (this=0x55cff0754808) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/MDSRank.cc:2006
#21 0x000055cfed04e882 in MDSRankDispatcher::handle_mds_map (this=0x55cff0754800, m=..., oldmap=...) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/MDSRank.cc:2266
#22 0x000055cfed025055 in MDSDaemon::handle_mds_map (this=0x55cff0728a00, m=...) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/mds/MDSDaemon.cc:893
#23 0x000055cfed026a38 in MDSDaemon::handle_core_message (this=0x55cff0728a00, m=...) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/common/RefCountedObj.h:171
#24 0x000055cfed027363 in MDSDaemon::ms_dispatch2 (this=0x55cff0728a00, m=...) at /usr/src/debug/ceph-14.2.11-208.el8cp.x86_64/src/common/RefCountedObj.h:171
#25 0x00007fde7162931a in DispatchQueue::entry() () from /usr/lib64/ceph/libceph-common.so.0
#26 0x00007fde716df971 in DispatchQueue::DispatchThread::entry() () from /usr/lib64/ceph/libceph-common.so.0
#27 0x00007fde6f1af17a in start_thread () from /lib64/libpthread.so.0
#28 0x00007fde6dcc6dc3 in clone () from /lib64/libc.so.6
Example 2:
2023-04-21T08:56:11.915680583Z debug -4> 2023-04-21 08:56:11.820 7f200b6e3700 -1 /builddir/build/BUILD/ceph-14.2.11/src/mds/MDCache.cc: In function 'void MDCache::add_inode(CInode*)' thread 7f200b6e3700 time 2023-04-21 08:56:11.820593
2023-04-21T08:56:11.915680583Z /builddir/build/BUILD/ceph-14.2.11/src/mds/MDCache.cc: 283: FAILED ceph_assert(!p)
2023-04-21T08:56:11.915680583Z
2023-04-21T08:56:11.915680583Z ceph version 14.2.11-208.el8cp (6738ba96f296a41c24357c12e8d594fbde457abc) nautilus (stable)
2023-04-21T08:56:11.915680583Z 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x156) [0x7f20154b6308]
2023-04-21T08:56:11.915680583Z 2: (()+0x275522) [0x7f20154b6522]
2023-04-21T08:56:11.915680583Z 3: (()+0x22645e) [0x557ca224945e]
2023-04-21T08:56:11.915680583Z 4: (Server::prepare_new_inode(boost::intrusive_ptr<MDRequestImpl>&, CDir*, inodeno_t, unsigned int, file_layout_t*)+0x4a0) [0x557ca21d4960]
2023-04-21T08:56:11.915680583Z 5: (Server::handle_client_openc(boost::intrusive_ptr<MDRequestImpl>&)+0xdc9) [0x557ca21e5a99]
2023-04-21T08:56:11.915680583Z 6: (Server::dispatch_client_request(boost::intrusive_ptr<MDRequestImpl>&)+0xb8b) [0x557ca221030b]
2023-04-21T08:56:11.915680583Z 7: (Server::handle_client_request(boost::intrusive_ptr<MClientRequest const> const&)+0x402) [0x557ca22109a2]
2023-04-21T08:56:11.915680583Z 8: (Server::dispatch(boost::intrusive_ptr<Message const> const&)+0x12a) [0x557ca221d44a]
2023-04-21T08:56:11.915680583Z 9: (MDSRank::handle_deferrable_message(boost::intrusive_ptr<Message const> const&)+0xa94) [0x557ca2186344]
2023-04-21T08:56:11.915680583Z 10: (MDSRank::_dispatch(boost::intrusive_ptr<Message const> const&, bool)+0x80f) [0x557ca218875f]
2023-04-21T08:56:11.915680583Z 11: (MDSRank::retry_dispatch(boost::intrusive_ptr<Message const> const&)+0x16) [0x557ca2188d66]
2023-04-21T08:56:11.915680583Z 12: (MDSContext::complete(int)+0x7f) [0x557ca242a5df]
2023-04-21T08:56:11.915680583Z 13: (void finish_contexts<std::vector<MDSContext*, std::allocator<MDSContext*> > >(CephContext*, std::vector<MDSContext*, std::allocator<MDSContext*> >&, int)+0x85) [0x557ca217e805]
2023-04-21T08:56:11.915680583Z 14: (MDSRank::active_start()+0x8a) [0x557ca218ec9a]
2023-04-21T08:56:11.915680583Z 15: (MDSRankDispatcher::handle_mds_map(boost::intrusive_ptr<MMDSMap const> const&, MDSMap const&)+0x1d72) [0x557ca219c882]
2023-04-21T08:56:11.915680583Z 16: (MDSDaemon::handle_mds_map(boost::intrusive_ptr<MMDSMap const> const&)+0xbc5) [0x557ca2173055]
2023-04-21T08:56:11.915680583Z 17: (MDSDaemon::handle_core_message(boost::intrusive_ptr<Message const> const&)+0xc8) [0x557ca2174a38]
2023-04-21T08:56:11.915680583Z 18: (MDSDaemon::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0xc3) [0x557ca2175363]
2023-04-21T08:56:11.915680583Z 19: (DispatchQueue::entry()+0x134a) [0x7f201571031a]
2023-04-21T08:56:11.915680583Z 20: (DispatchQueue::DispatchThread::entry()+0x11) [0x7f20157c6971]
2023-04-21T08:56:11.915680583Z 21: (()+0x817a) [0x7f201329617a]
2023-04-21T08:56:11.915680583Z 22: (clone()+0x43) [0x7f2011daddc3]
Environment
Red Hat Ceph Storage (RHCS) 4.X+
Red Hat OpenShift Data Foundation (ODF) 4.X+
Subscriber exclusive content
A Red Hat subscription provides unlimited access to our knowledgebase, tools, and much more.