Host compute3 shows high memory usage

Solution In Progress - Updated -

Issue

  • One of our compute nodes consumes almost all of its available RAM even though only about half of it is actually allocated to VMs:
[root@overcloud-compute-0 ~]# free -m
              total        used        free      shared  buff/cache   available
Mem:         515698      401225      103072          21       11401      113049
Swap:             0           0           0
  • On this compute node, there are 54 VMs, and the following amount of RAM is allocated to them through nova:
# grep -r memory * | grep -v unit  | awk -F\> '{ print $2 }' | sed -e 's/<.*//' | awk 'BEGIN {a=0}{a+=$1}END{ print "total " a "MB"}'
total 256000 MB
  • Approximately 424GB of RAM is currently being actively consumed (sum of the RSS column of all processes):
# ps aufxg | awk 'BEGIN {a=0}{a+=$6}END{ print "total " a "KB"}'
total 424905412KB
  • The following processes are the top 10 RAM consumers:
root      713002  2.6  2.7 14717928 14673084 ?   S    Apr10 4831:07 /usr/lib/systemd/systemd-udevd
qemu      435663  0.4  3.1 17435412 16386548 ?   Sl   Apr10 843:39 /usr/libexec/qemu-kvm -name guest=instance-00000405,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-26-instance-00000405/master-key.aes -machine pc-i440fx-rhel7.6.0,accel=kvm,usb=off,dump-guest-core=off -cpu Broadwell-IBRS,vme=on,ss=on,f16c=on,rdrand=on,hypervisor=on,arat=on,tsc_adjust=on,md-clear=on,stibp=on,ssbd=on,xsaveopt=on,pdpe1gb=on,abm=on -m 16384 -realtime mlock=off -smp 4,sockets=4,cores=1,threads=1 -uuid 881b75ee-f33f-4fab-9c9c-83ba11e01317 -smbios type=1,manufacturer=Red Hat,product=OpenStack Compute,version=17.0.10-6.el7ost,serial=66e686ec-7c4d-1000-95bf-54ab3a591010,uuid=881b75ee-f33f-4fab-9c9c-83ba11e01317,family=Virtual Machine -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=103,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=delay -no-hpet -no-shutdown -boot strict=on -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive file=/dev/disk/by-id/dm-uuid-mpath-360060e8012314f005040314f00000374,format=raw,if=none,id=drive-virtio-disk0,serial=44d58b5c-5b44-4f47-ac2a-96aad09ef653,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=on -drive file=/dev/disk/by-id/dm-uuid-mpath-360060e8012314f005040314f0000038e,format=raw,if=none,id=drive-virtio-disk1,serial=c9e66ba0-9306-4467-8ba0-87a7d3c83ba1,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x6,drive=drive-virtio-disk1,id=virtio-disk1,write-cache=on -drive file=/dev/disk/by-id/dm-uuid-mpath-360060e8012314f005040314f000003a4,format=raw,if=none,id=drive-virtio-disk2,serial=13bac374-fb26-4c43-939b-d04367355b37,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x7,drive=drive-virtio-disk2,id=virtio-disk2,write-cache=on -drive 
file=/dev/disk/by-id/dm-uuid-mpath-360060e8012314f005040314f000003a9,format=raw,if=none,id=drive-virtio-disk3,serial=439f9efe-f238-4565-b4c0-06b3917a2403,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x8,drive=drive-virtio-disk3,id=virtio-disk3,write-cache=on -drive file=/dev/disk/by-id/dm-uuid-mpath-360060e8012314f005040314f000003e1,format=raw,if=none,id=drive-virtio-disk4,serial=cbce97ba-4c80-433c-9ef7-d1ac69798cf8,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x9,drive=drive-virtio-disk4,id=virtio-disk4,write-cache=on -drive file=/dev/disk/by-id/dm-uuid-mpath-360060e8012314f005040314f0000043c,format=raw,if=none,id=drive-virtio-disk5,serial=6e930d27-fbd9-4462-a8d2-16b202b0bc11,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0xa,drive=drive-virtio-disk5,id=virtio-disk5,write-cache=on -drive file=/dev/disk/by-id/dm-uuid-mpath-360060e8012314f005040314f0000043b,format=raw,if=none,id=drive-virtio-disk6,serial=3aaa7110-6f8f-46ec-94fa-89f646aefac1,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0xb,drive=drive-virtio-disk6,id=virtio-disk6,write-cache=on -netdev tap,fd=105,id=hostnet0,vhost=on,vhostfd=106 -device virtio-net-pci,rx_queue_size=512,host_mtu=1450,netdev=hostnet0,id=net0,mac=fa:16:3e:61:f7:59,bus=pci.0,addr=0x3 -netdev tap,fd=107,id=hostnet1,vhost=on,vhostfd=108 -device virtio-net-pci,rx_queue_size=512,host_mtu=1450,netdev=hostnet1,id=net1,mac=fa:16:3e:53:47:de,bus=pci.0,addr=0x4 -add-fd set=5,fd=110 -chardev pty,id=charserial0,logfile=/dev/fdset/5,logappend=on -device isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 10.154.155.19:24 -k en-us -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 -incoming defer -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0xc -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -msg timestamp=on
root        1079 84.7  3.4 18118556 18075100 ?   Rs   Apr10 155703:57 /usr/lib/systemd/systemd-udevd
root      700480 10.3  3.7 19796620 19751788 ?   S    Apr10 18958:15 /usr/lib/systemd/systemd-udevd
root      700482  9.3  4.8 25816260 25771432 ?   S    Apr10 17159:23 /usr/lib/systemd/systemd-udevd
root      700488  8.3  4.9 26139852 26095060 ?   S    Apr10 15277:56 /usr/lib/systemd/systemd-udevd
root      700470 22.1  5.0 26922556 26877736 ?   S    Apr10 40592:53 /usr/lib/systemd/systemd-udevd
qemu      878663  120  6.3 34330504 33631968 ?   Sl   Apr10 220701:12 /usr/libexec/qemu-kvm -name guest=instance-00000f3e,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-63-instance-00000f3e/master-key.aes -machine pc-i440fx-rhel7.6.0,accel=kvm,usb=off,dump-guest-core=off -cpu Broadwell-IBRS,vme=on,ss=on,f16c=on,rdrand=on,hypervisor=on,arat=on,tsc_adjust=on,md-clear=on,stibp=on,ssbd=on,xsaveopt=on,pdpe1gb=on,abm=on -m 32768 -realtime mlock=off -smp 8,sockets=8,cores=1,threads=1 -uuid 65fc64b8-6024-4b99-bec2-59745ebd3e00 -smbios type=1,manufacturer=Red Hat,product=OpenStack Compute,version=17.0.10-6.el7ost,serial=f606d278-7c7e-1000-be7a-54ab3aee2cd1,uuid=65fc64b8-6024-4b99-bec2-59745ebd3e00,family=Virtual Machine -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=139,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=delay -no-hpet -no-shutdown -boot strict=on -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive file=/dev/disk/by-id/dm-uuid-mpath-360060e8012314f005040314f00000349,format=raw,if=none,id=drive-virtio-disk0,serial=d41cd1d8-0e1b-47ea-8f64-ab54a6387e60,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=on -netdev tap,fd=141,id=hostnet0,vhost=on,vhostfd=142 -device virtio-net-pci,rx_queue_size=512,host_mtu=9000,netdev=hostnet0,id=net0,mac=fa:16:3e:5a:f1:68,bus=pci.0,addr=0x3 -add-fd set=3,fd=144 -chardev pty,id=charserial0,logfile=/dev/fdset/3,logappend=on -device isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 10.154.155.19:60 -k en-us -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 -incoming defer -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -msg timestamp=on
qemu      946592 15.5  6.3 34514176 33633264 ?   Sl   Apr10 28433:41 /usr/libexec/qemu-kvm -name guest=instance-00000d6a,debug-threads=on -S -object secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-68-instance-00000d6a/master-key.aes -machine pc-i440fx-rhel7.6.0,accel=kvm,usb=off,dump-guest-core=off -cpu Broadwell-IBRS,vme=on,ss=on,f16c=on,rdrand=on,hypervisor=on,arat=on,tsc_adjust=on,md-clear=on,stibp=on,ssbd=on,xsaveopt=on,pdpe1gb=on,abm=on -m 32768 -realtime mlock=off -smp 32,sockets=32,cores=1,threads=1 -uuid 3b0aae79-acbf-493f-8bba-b34dd5c5af01 -smbios type=1,manufacturer=Red Hat,product=OpenStack Compute,version=17.0.10-6.el7ost,serial=f606d278-7c7e-1000-be7a-54ab3aee2cd1,uuid=3b0aae79-acbf-493f-8bba-b34dd5c5af01,family=Virtual Machine -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=144,server,nowait -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=delay -no-hpet -no-shutdown -boot strict=on -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive file=/dev/disk/by-id/dm-uuid-mpath-360060e8012314f005040314f000003d5,format=raw,if=none,id=drive-virtio-disk0,serial=24d09b0b-406c-4cdd-9421-4a864a946956,cache=none,aio=native -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=on -netdev tap,fd=146,id=hostnet0,vhost=on,vhostfd=147 -device virtio-net-pci,rx_queue_size=512,host_mtu=1450,netdev=hostnet0,id=net0,mac=fa:16:3e:97:9d:50,bus=pci.0,addr=0x3 -add-fd set=3,fd=149 -chardev pty,id=charserial0,logfile=/dev/fdset/3,logappend=on -device isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0,bus=usb.0,port=1 -vnc 10.154.155.19:65 -k en-us -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 -incoming defer -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -msg timestamp=on
root      700458 25.1 12.2 64589408 64544588 ?   R    Apr10 46114:41 /usr/lib/systemd/systemd-udevd
  • We notice that systemd-udevd is consuming a lot of RAM across several different processes, which appears to be abnormal behavior.

  • Some paths are down:

Aug 14 03:22:38 overcloud-compute-0 multipathd: 360060e8012314f005040314f0000037e: sdagb - tur checker reports path is down
  • Many paths are reported as failed by multipath -ll:
# multipath -ll | grep faulty
  |- 11:0:1:108 sdagf 69:1008  failed faulty running
  |- 11:0:3:108 sdagg 70:768   failed faulty running
  |- 11:0:0:108 sdagh 70:784   failed faulty running
  |- 11:0:2:108 sdagi 70:800   failed faulty running
  |- 12:0:0:108 sdagj 70:816   failed faulty running
  |- 12:0:2:108 sdagk 70:832   failed faulty running
  |- 12:0:1:108 sdagl 70:848   failed faulty running
  `- 12:0:3:108 sdagm 70:864   failed faulty running
  |- 11:0:1:93  sdacn 135:752  failed ready running
  |- 11:0:3:93  sdaco 8:768    failed ready running
  |- 11:0:0:93  sdacp 8:784    failed ready running
  |- 11:0:2:93  sdacq 8:800    failed ready running
  |- 12:0:0:93  sdacr 8:816    failed ready running
  |- 12:0:2:93  sdacs 8:832    failed ready running
  |- 12:0:1:93  sdact 8:848    failed ready running
  `- 12:0:3:93  sdacu 8:864    failed ready running
  |- 11:0:1:0   sdqv  132:496  failed faulty running
  |- 11:0:3:0   sdqw  133:256  failed faulty running
  |- 11:0:0:0   sdqx  133:272  failed faulty running
  |- 11:0:2:0   sdqy  133:288  failed faulty running
  |- 12:0:0:0   sdqz  133:304  failed faulty running
  |- 12:0:2:0   sdra  133:320  failed faulty running
  |- 12:0:1:0   sdrb  133:336  failed faulty running
  `- 12:0:3:0   sdrc  133:352  failed faulty running
  |- 11:0:1:107 sdafx 69:880   failed faulty running
  |- 11:0:3:107 sdafy 69:896   failed faulty running
  |- 11:0:0:107 sdafz 69:912   failed faulty running
  |- 11:0:2:107 sdaga 69:928   failed faulty running
  |- 12:0:0:107 sdagb 69:944   failed faulty running
  |- 12:0:2:107 sdagc 69:960   failed faulty running
  |- 12:0:1:107 sdagd 69:976   failed faulty running
  `- 12:0:3:107 sdage 69:992   failed faulty running
  |- 11:0:1:106 sdafp 68:1008  failed faulty running
  |- 11:0:3:106 sdafq 69:768   failed faulty running
  |- 11:0:0:106 sdafr 69:784   failed faulty running
  |- 11:0:2:106 sdafs 69:800   failed faulty running
  |- 12:0:0:106 sdaft 69:816   failed faulty running
  |- 12:0:2:106 sdafu 69:832   failed faulty running
  |- 12:0:1:106 sdafv 69:848   failed faulty running
  `- 12:0:3:106 sdafw 69:864   failed faulty running
  • pvs does not seem to be able to read many devices:
  Error reading device /dev/sdwy15 at 4096 length 4.
  Error reading device /dev/sdadd at 0 length 512.
  Error reading device /dev/sdadd at 0 length 4.
  Error reading device /dev/sdadd at 4096 length 4.
  Error reading device /dev/sdaej at 0 length 512.
  Error reading device /dev/sdaej at 0 length 4.
  Error reading device /dev/sdaej at 4096 length 4.
  Error reading device /dev/sdaof at 0 length 512.
  Error reading device /dev/sdaof at 0 length 4.
  Error reading device /dev/sdaof at 4096 length 4.
  Error reading device /dev/sdft at 0 length 512.
  Error reading device /dev/sdft at 0 length 4.
  Error reading device /dev/sdft at 4096 length 4.
  Error reading device /dev/sdamj at 0 length 512.
  Error reading device /dev/sdamj at 0 length 4.
  Error reading device /dev/sdamj at 4096 length 4.
  Error reading device /dev/sdadd1 at 0 length 4.
  Error reading device /dev/sdadd1 at 4096 length 4.
  Error reading device /dev/sdaej1 at 0 length 4.
  Error reading device /dev/sdaej1 at 4096 length 4.
  Error reading device /dev/sdaof1 at 0 length 4.
  Error reading device /dev/sdaof1 at 4096 length 4.
  Error reading device /dev/sdft1 at 0 length 4.
  Error reading device /dev/sdft1 at 4096 length 4.

Environment

  • Red Hat OpenStack Platform 13.0 (RHOSP)

Subscriber exclusive content

A Red Hat subscription provides unlimited access to our knowledgebase, tools, and much more.

Current Customers and Partners

Log in for full access

Log In

New to Red Hat?

Learn more about Red Hat subscriptions

Using a Red Hat product through a public cloud?

How to access this content