Ceph: OSD crashes in BlueStore::Onode::put

Solution Verified - Updated -

Environment

Red Hat Ceph Storage (RHCS) 5.x
Red Hat Ceph Storage (RHCS) 6.x

Issue

OSD crashes in BlueStore::Onode::put

Full back trace:

[root@LinuxPrompt0 ~]# ceph crash info 2023-06-07T05:00:04.183655Z_3e2a0xxx-Redacted-Cluster-ID-yyy73541104f
{
    "archived": "2023-06-07 07:11:14.916372",
    "backtrace": [
        "/lib64/libpthread.so.0(+0x12ce0) [0x7f6579539ce0]",
        "(ceph::buffer::v15_2_0::ptr::release()+0x13) [0x55c478ef3773]",
        "(BlueStore::Onode::put()+0x1a9) [0x55c478b7f8a9]",          <----- Here
        "(std::_Hashtable<ghobject_t, std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> >, mempool::pool_allocator<(mempool::pool_index_t)4, std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> > >, std::__detail::_Select1st, std::equal_to<ghobject_t>, std::hash<ghobject_t>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<true, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> >, true>*)+0x68) [0x55c478c36818]",
        "(BlueStore::OnodeSpace::_remove(ghobject_t const&)+0x27a) [0x55c478b7f6aa]",
        "(LruOnodeCacheShard::_trim_to(unsigned long)+0xe3) [0x55c478c36d73]",
        "(BlueStore::OnodeSpace::add(ghobject_t const&, boost::intrusive_ptr<BlueStore::Onode>&)+0x49d) [0x55c478b8030d]",
        "(BlueStore::Collection::get_onode(ghobject_t const&, bool, bool)+0x46a) [0x55c478bc62ba]",
        "(BlueStore::_txc_add_transaction(BlueStore::TransContext*, ceph::os::Transaction*)+0x10b4) [0x55c478bea534]",
        "(BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x316) [0x55c478c07bc6]",
        "(non-virtual thunk to PrimaryLogPG::queue_transactions(std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<OpRequest>)+0x58) [0x55c478847f38]",
        "(ReplicatedBackend::submit_transaction(hobject_t const&, object_stat_sum_t const&, eversion_t const&, std::unique_ptr<PGTransaction, std::default_delete<PGTransaction> >&&, eversion_t const&, eversion_t const&, std::vector<pg_log_entry_t, std::allocator<pg_log_entry_t> >&&, std::optional<pg_hit_set_history_t>&, Context*, unsigned long, osd_reqid_t, boost::intrusive_ptr<OpRequest>)+0xcad) [0x55c478a37e1d]",
        "(PrimaryLogPG::issue_repop(PrimaryLogPG::RepGather*, PrimaryLogPG::OpContext*)+0xcf0) [0x55c4787b2fe0]",
        "(PrimaryLogPG::execute_ctx(PrimaryLogPG::OpContext*)+0x115d) [0x55c47880f5bd]",
        "(PrimaryLogPG::do_op(boost::intrusive_ptr<OpRequest>&)+0x2de7) [0x55c478818147]",
        "(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, ThreadPool::TPHandle&)+0xd1c) [0x55c47881f48c]",
        "(OSD::dequeue_op(boost::intrusive_ptr<PG>, boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x309) [0x55c4786a2c89]",
        "(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x68) [0x55c478905888]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0xc28) [0x55c4786c02f8]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x55c478d42d64]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55c478d45c44]",
        "/lib64/libpthread.so.0(+0x81cf) [0x7f657952f1cf]",
        "clone()"
    ],
    "ceph_version": "16.2.8-84.el8cp",
    "crash_id": "2023-06-07T05:00:04.183655Z_3e2a0xxx-Redacted-Cluster-ID-yyy73541104f",
    "entity_name": "osd.36",
    "os_id": "rhel",
    "os_name": "Red Hat Enterprise Linux",
    "os_version": "8.6 (Ootpa)",
    "os_version_id": "8.6",
    "process_name": "ceph-osd",
    "stack_sig": "387bdad7e321a8b482ce7c25081e214416a524e4fcddc77dcd2295ccfe0a319b",
    "timestamp": "2023-06-07T05:00:04.183655Z",
    "utsname_hostname": "LinuxPrompt2",
    "utsname_machine": "x86_64",
    "utsname_release": "4.18.0-348.7.1.el8_5.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Wed Dec 8 21:51:17 EST 2021"
}

Resolution

  • The OSD service that crashes due to this issue will restart
  • The impact to the Ceph Cluster should be minimal
  • This issue is resolved in RHCS 5.3z5 and higher.
  • The resolution already exists in RHCS 6.1 and higher.
  • The issue is NOT resolved in RHCS 6.0.x.
  • To remediate this issue, Red Hat recommends upgrading to a release that contains the fix (RHCS 5.3z5 or later, or RHCS 6.1 or later).

Root Cause

A minor code issue in the OSD's BlueStore component (crash during Onode reference release, as shown in the backtrace above), present in RHCS 5.x and RHCS 6.x. See the artifacts below for the upstream tracker and Bugzilla details.

Artifacts:
Ceph Upstream Tracker #56382
RHCS 5.x Bugzilla #2218445
RHCS 5.3.5 Errata RHBA-2023:4760

This solution is part of Red Hat’s fast-track publication program, providing a huge library of solutions that Red Hat engineers have created while supporting our customers. To give you the knowledge you need the instant it becomes available, these articles may be presented in a raw and unedited form.

Comments