Ceph: OSD crashes in BlueStore::Onode::put
Environment
Red Hat Ceph Storage (RHCS) 5.x
Red Hat Ceph Storage (RHCS) 6.x
Issue
OSD crashes in BlueStore::Onode::put
Full back trace:
[root@LinuxPrompt0 ~]# ceph crash info 2023-06-07T05:00:04.183655Z_3e2a0xxx-Redacted-Cluster-ID-yyy73541104f
{
"archived": "2023-06-07 07:11:14.916372",
"backtrace": [
"/lib64/libpthread.so.0(+0x12ce0) [0x7f6579539ce0]",
"(ceph::buffer::v15_2_0::ptr::release()+0x13) [0x55c478ef3773]",
"(BlueStore::Onode::put()+0x1a9) [0x55c478b7f8a9]", <----- Here
"(std::_Hashtable<ghobject_t, std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> >, mempool::pool_allocator<(mempool::pool_index_t)4, std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> > >, std::__detail::_Select1st, std::equal_to<ghobject_t>, std::hash<ghobject_t>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<true, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> >, true>*)+0x68) [0x55c478c36818]",
"(BlueStore::OnodeSpace::_remove(ghobject_t const&)+0x27a) [0x55c478b7f6aa]",
"(LruOnodeCacheShard::_trim_to(unsigned long)+0xe3) [0x55c478c36d73]",
"(BlueStore::OnodeSpace::add(ghobject_t const&, boost::intrusive_ptr<BlueStore::Onode>&)+0x49d) [0x55c478b8030d]",
"(BlueStore::Collection::get_onode(ghobject_t const&, bool, bool)+0x46a) [0x55c478bc62ba]",
"(BlueStore::_txc_add_transaction(BlueStore::TransContext*, ceph::os::Transaction*)+0x10b4) [0x55c478bea534]",
"(BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x316) [0x55c478c07bc6]",
"(non-virtual thunk to PrimaryLogPG::queue_transactions(std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<OpRequest>)+0x58) [0x55c478847f38]",
"(ReplicatedBackend::submit_transaction(hobject_t const&, object_stat_sum_t const&, eversion_t const&, std::unique_ptr<PGTransaction, std::default_delete<PGTransaction> >&&, eversion_t const&, eversion_t const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&&, std::optional<pg_hit_set_history_t>&, Context*, unsigned long, osd_reqid_t, boost::intrusive_ptr<OpRequest>)+0xcad) [0x55c478a37e1d]",
"(PrimaryLogPG::issue_repop(PrimaryLogPG::RepGather*, PrimaryLogPG::OpContext*)+0xcf0) [0x55c4787b2fe0]",
"(PrimaryLogPG::execute_ctx(PrimaryLogPG::OpContext*)+0x115d) [0x55c47880f5bd]",
"(PrimaryLogPG::do_op(boost::intrusive_ptr<OpRequest>&)+0x2de7) [0x55c478818147]",
"(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0xd1c) [0x55c47881f48c]",
"(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x309) [0x55c4786a2c89]",
"(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x68) [0x55c478905888]",
"(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0xc28) [0x55c4786c02f8]",
"(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x55c478d42d64]",
"(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55c478d45c44]",
"/lib64/libpthread.so.0(+0x81cf) [0x7f657952f1cf]",
"clone()"
],
"ceph_version": "16.2.8-84.el8cp",
"crash_id": "2023-06-07T05:00:04.183655Z_3e2a0xxx-Redacted-Cluster-ID-yyy73541104f",
"entity_name": "osd.36",
"os_id": "rhel",
"os_name": "Red Hat Enterprise Linux",
"os_version": "8.6 (Ootpa)",
"os_version_id": "8.6",
"process_name": "ceph-osd",
"stack_sig": "387bdad7e321a8b482ce7c25081e214416a524e4fcddc77dcd2295ccfe0a319b",
"timestamp": "2023-06-07T05:00:04.183655Z",
"utsname_hostname": "LinuxPrompt2",
"utsname_machine": "x86_64",
"utsname_release": "4.18.0-348.7.1.el8_5.x86_64",
"utsname_sysname": "Linux",
"utsname_version": "#1 SMP Wed Dec 8 21:51:17 EST 2021"
}
Resolution
- The OSD service that crashes due to this issue restarts automatically after the crash
- The impact to the Ceph cluster should be minimal and transient while the OSD restarts
- This issue is resolved in RHCS 5.3z5 and higher.
- The resolution already exists in RHCS 6.1 and higher.
- The issue is NOT resolved in RHCS 6.0.x.
- To remediate this issue, Red Hat recommends upgrading to a release in which the fix is included (RHCS 5.3z5+ or RHCS 6.1+).
Root Cause
A defect in the BlueStore onode cache code present in RHCS 5.x and RHCS 6.x. As the backtrace shows, the crash occurs in BlueStore::Onode::put while LruOnodeCacheShard::_trim_to is evicting onodes (via OnodeSpace::_remove) during OnodeSpace::add; the put call touches already-released buffer memory, crashing the OSD. (The precise defect — believed to be an onode reference-counting issue — is tracked in the upstream ticket below.)
Artifacts:
Ceph Upstream Tracker #56382
RHCS 5.x Bugzilla #2218445
RHCS 5.3.5 Errata RHBA-2023:4760
This solution is part of Red Hat's fast-track publication program, which provides a large library of solutions created by Red Hat engineers while supporting our customers. To deliver this knowledge as soon as it becomes available, these articles may be presented in a raw and unedited form.
Comments