udevd continues to trigger on not ready (NR) devices making dmesg usage unusable for troubleshooting

Solution Unverified - Updated -

Issue

  • udevd continues to trigger on not ready (NR) devices making dmesg usage unusable for troubleshooting.

Note: a RHEL5 box hooked to the same Symmetrix with visibility to the same devices does not exhibit this behavior.

Because of the this behavior admins can not make use of dmesg for troubleshooting as the error handling floods the kernel ring buffer

With removal of powerpath and dm-multipath the issue still occurs.

To prove that udevd seems to be the initiator we used the following systemtap script.

Based on the user knowing a device that aligns to a not ready lun we watched vfs.read/vfs.write for this device

#! /usr/bin/env stap

global disk_major, disk_minor, diskname, rq_timeout, rq_retries, rq_deadline, dev_id, device_of_interest

probe begin {
    printf("Enabling Probes.\n");
}

probe kernel.statement("blk_update_request@blk-core.c:2225")
{

  #printf("%-s %-s %-s\n", "COMMAND", "PID", "TASK_STRUCT")
  #printf("%-s %-d %-p\n", kernel_string($t->comm), $t->pid, $t)
  req_addr = @choose_defined($req, $rq);
  if(@cast(req_addr,"request")->rq_disk == 0)  {
            disk_major = -1
            disk_minor = -1
  } else {
            disk_major = @cast(req_addr,"request")->rq_disk->major
            disk_minor = @cast(req_addr,"request")->rq_disk->first_minor
            diskname  = @cast(req_addr,"request")->rq_disk->disk_name
            rq_timeout = @cast(req_addr,"request")->timeout
            rq_retries = @cast(req_addr,"request")->retries
            rq_deadline = @cast(req_addr,"request")->deadline
            dev_id = &@cast(req_addr, "request")->rq_disk->part0->__dev
            device_of_interest = MKDEV(disk_major,disk_minor)
  }

  strdiskname = kernel_string(diskname)
  if (strdiskname == "sdew") {

  printf(" \n")
  printf("blk_update_request diskname %-s\n", kernel_string(diskname))
  printf("blk_update_request major minor %-d %-d\n", disk_major, disk_minor)
  printf("blk_update_request error %-d\n", $error)
  printf("blk_update_request timeout %-d\n", rq_timeout)
  printf("blk_update_request retries %-d\n", rq_retries)
  printf("blk_update_request deadline %-d\n",rq_deadline)
  printf("blk_update_request device %-x\n",dev_id)
  printf("blk_update_request device_of_interest %04x\n",device_of_interest)
  #device_state = get_devstate_from_req($q)
  #device_state_str = describe_device_state(device_state)
  #printf("%-s\n", device_state_str)
  }

}

probe vfs.write, vfs.read
{
  #dev_str = "800010"
  dev_str = "8100080"
  device_of_interest_str = sprintf("%04x", device_of_interest);

  if (dev_str == device_of_interest_str)
    printf ("%s(%d) %s 0x%x\n",
            execname(), pid(), ppfunc(), dev)
}

probe end {
    printf("Disabling Probes.\n");
}

here's a snapshot, note there is a 900 second pause between triggers, seems like some sort of timeout, but the trigger continues over and over.

Note when this happens the end_request errors flow into the kernel ring buffer overrunning, making dmesg not very usable for troubleshooting.

increasing log_buf_len doesn't help

2014-08-21 11:17:54 blk_update_request device_of_interest 8100080
2014-08-21 11:32:54
2014-08-21 11:32:54 blk_update_request diskname sdew
2014-08-21 11:32:54 blk_update_request major minor 129 128
2014-08-21 11:32:54 blk_update_request error 0
2014-08-21 11:32:54 blk_update_request timeout 7000
2014-08-21 11:32:54 blk_update_request retries 0
2014-08-21 11:32:54 blk_update_request deadline 4296644413
2014-08-21 11:32:54 blk_update_request device ffff8817f7afc460
2014-08-21 11:32:54 blk_update_request device_of_interest 8100080
2014-08-21 11:32:54
2014-08-21 11:32:54 blk_update_request diskname sdew
2014-08-21 11:32:54 blk_update_request major minor 129 128
2014-08-21 11:32:54 blk_update_request error 0
2014-08-21 11:32:54 blk_update_request timeout 7000
2014-08-21 11:32:54 blk_update_request retries 0
2014-08-21 11:32:54 blk_update_request deadline 4296644414
2014-08-21 11:32:54 blk_update_request device ffff8817f7afc460
2014-08-21 11:32:54 blk_update_request device_of_interest 8100080
2014-08-21 11:32:54 udevd(8285) vfs_read 0x5
2014-08-21 11:32:54 udevd(8329) vfs_write 0x5
2014-08-21 11:32:54 udevd(8684) vfs_read 0xfd00000
2014-08-21 11:32:54 udevd(8439) vfs_read 0x5
2014-08-21 11:32:54 udevd(8684) vfs_read 0xfd00000
2014-08-21 11:32:54 udevd(8346) vfs_read 0x3
2014-08-21 11:32:54 udevd(8297) vfs_read 0x5
2014-08-21 11:32:54 udevd(8361) vfs_read 0x3
2014-08-21 11:32:54 udevd(8419) vfs_read 0x3

continue end_request filling printk log bug making dmesg usage for troubleshooting unavailable

Environment

  • Red Hat Enterprise Linux 6.5

Subscriber exclusive content

A Red Hat subscription provides unlimited access to our knowledgebase, tools, and much more.

Current Customers and Partners

Log in for full access

Log In

New to Red Hat?

Learn more about Red Hat subscriptions

Using a Red Hat product through a public cloud?

How to access this content