#!/usr/bin/env python
"""
Connectors for the Lustre `lfs df` and `lctl dl -t` commands to determine the
health of Lustre file systems from the clients' perspective.
"""
import re
from tokio.connectors.common import SubprocessOutputDict
# Matches one device line of `lctl dl -t` output.  Only osc/mdc lines are
# meant to match; mgc/lov/lmv lines are skipped.  Example:
#
#   351 UP osc snx11025-OST0007-osc-ffff8875ac1e7c00 3f30f170-90e6-b332-b141-a6d4a94a1829 5 10.100.100.12@o2ib1
#
# Groups: (1) index, (2) status, (3) role, (4) role_id (device name; must
# start with "snx" followed by digits), (5) uuid, (6) ref_count, (7) nid.
_REX_OST_MAP = re.compile(r'^\s*(\d+)\s+(\S+)\s+(\S+)\s+(snx\d+-\S+)\s+(\S+)\s+(\d+)\s+(\S+@\S+)\s*$')

# Matches one target line of `lfs df` output.  Example:
#
#   snx11035-OST0000_UUID 90767651352 54512631228 35277748388 61% /scratch2[OST:0]
#
# Groups: (1) target UUID, (2) total KiB, (3) used KiB, (4) remaining KiB,
# (5) percent used (digits only), (6) mount point, (7) role, (8) target index.
# The percent sign is matched literally so that a column which is not a
# percentage cannot be mistaken for one.
_REX_LFS_DF = re.compile(r'^\s*(snx\d+-\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)%\s+(\S+)\[([^:]+):(\d+)\]\s*$')

# Executable names and the full argument vectors used to invoke them
LCTL = 'lctl'
LCTL_DL_T = [LCTL, 'dl', '-t']
LFS = 'lfs'
LFS_DF = [LFS, 'df']
class LfsOstMap(SubprocessOutputDict):
    """Representation for the ``lctl dl -t`` command.

    Generates a dict of the form::

        { file_system: { ost_name: { keys: values } } }

    This is a generally logical structure, although this map is almost always
    fed into a routine that tries to find multiple OSTs on the same OSS (i.e.,
    a failover situation).
    """

    def __init__(self, *args, **kwargs):
        """Initialize the parent, select ``lctl dl -t``, and load the data.

        All positional and keyword arguments are forwarded unchanged to the
        parent class constructor.
        """
        super(LfsOstMap, self).__init__(*args, **kwargs)
        self.subprocess_cmd = LCTL_DL_T
        self.load()

    def __repr__(self):
        """Serialize object into an ASCII string.

        Returns:
            str: One line per device in a layout that resembles the input
            used to initialize this object.  Lines are ordered by file system
            name and then by device index.  Status and role are emitted as
            stored, i.e., in the lowercase form produced by ``load_str``.
        """
        lines = []
        for file_system in sorted(self.keys()):
            obd_data = self[file_system]
            for obd_name in sorted(obd_data, key=lambda x: int(obd_data[x]['index'])):
                lines.append(
                    "%(index)3d %(status)2s %(role)3s %(role_id)s %(uuid)s %(ref_count)d %(nid)s\n"
                    % obd_data[obd_name])
        return "".join(lines)

    def load_str(self, input_str):
        """Parse the output of ``lctl dl -t`` to initialize self.

        Args:
            input_str (str): Text in the format emitted by ``lctl dl -t``.
                Lines that do not match the expected device-line layout are
                silently ignored.
        """
        for line in input_str.splitlines():
            match = _REX_OST_MAP.search(line)
            if match is None:
                continue

            # Device names look like <file system>-<target>-<role>-<suffix>
            file_system, target_name = match.group(4).split('-')[0:2]

            if file_system not in self:
                self[file_system] = {}

            # Duplicates can happen if a file system is doubly mounted; the
            # most recently parsed line replaces any earlier entry.
            self[file_system][target_name] = {
                'index': int(match.group(1)),
                'status': match.group(2).lower(),
                'role': match.group(3).lower(),
                'role_id': match.group(4),
                'uuid': match.group(5),
                'ref_count': int(match.group(6)),
                'target_ip': match.group(7).split('@')[0],
                'nid': match.group(7),
            }

    def get_failovers(self):
        """Identify OSSes with an abnormal number of OSTs.

        For each file system, counts how many OSTs (devices whose role is
        ``osc``) are reached through each target IP address, takes the most
        common per-address count as the expected number of OSTs per OSS, and
        reports every address whose count differs from it.  Such OSTs are
        probably failed over.

        Returns:
            dict: Keyed by file system name.  Each value is a dict with

            * ``mode``: the most common number of OSTs per OSS
            * ``abnormal_ips``: dict mapping each abnormal target IP address
              to the list of OST names found on it

        Raises:
            KeyError: If a file system contains no ``osc`` devices.
        """
        resulting_data = {}
        for file_system, ost_data in self.items():
            # Only OSCs matter here; other roles (e.g., mdc) are skipped
            osc_targets = [
                (ost_name, ost_values['target_ip'])
                for ost_name, ost_values in ost_data.items()
                if ost_values['role'] == 'osc'
            ]

            # key = ip address, val = ost count
            ost_counts = {}
            for _, ip_addr in osc_targets:
                ost_counts[ip_addr] = ost_counts.get(ip_addr, 0) + 1

            # Get mode of OSTs per OSS to infer what "normal" OST/OSS ratio is
            histogram = {}
            for ost_count in ost_counts.values():
                histogram[ost_count] = histogram.get(ost_count, 0) + 1
            if not histogram:
                raise KeyError('no OSTs to count')
            mode = max(histogram, key=histogram.get)

            # Build a dict of { ip_addr: [ ostname1, ostname2, ... ], ... }
            abnormal_ips = {}
            for ost_name, ip_addr in osc_targets:
                if ost_counts[ip_addr] != mode:
                    abnormal_ips.setdefault(ip_addr, []).append(ost_name)

            resulting_data[file_system] = {
                'mode': mode,
                'abnormal_ips': abnormal_ips,
            }
        return resulting_data
class LfsOstFullness(SubprocessOutputDict):
    """Representation for the ``lfs df`` command.

    Generates a dict of the form::

        { file_system: { ost_name: { keys: values } } }
    """

    def __init__(self, *args, **kwargs):
        """Initialize the parent, select ``lfs df``, and load the data.

        All positional and keyword arguments are forwarded unchanged to the
        parent class constructor.
        """
        super(LfsOstFullness, self).__init__(*args, **kwargs)
        self.subprocess_cmd = LFS_DF
        self.load()

    def __repr__(self):
        """Serialize object into an ASCII string.

        Returns:
            str: One line per target in a layout that resembles the input
            used to initialize this object, e.g.::

                snx11025-OST0001_UUID 90767651352 63381521692 26424604184  71% /scratch1[OST:1]

            Lines are ordered by file system name and then by target index.
            A target whose used and remaining sizes are both zero is
            reported as 0% full.
        """
        lines = []
        for file_system in sorted(self.keys()):
            obd_data = self[file_system]
            for obd_name in sorted(obd_data, key=lambda x: obd_data[x]['target_index']):
                keyvalues = obd_data[obd_name]

                # Note that lfs df's percents are not relative to total_kib,
                # but rather to the sum of used and remaining.
                occupied = keyvalues['used_kib'] + keyvalues['remaining_kib']
                # Guard against a target reporting zero used and zero
                # remaining, which would otherwise raise ZeroDivisionError.
                if occupied:
                    percent = round(100.0 * keyvalues['used_kib'] / occupied)
                else:
                    percent = 0

                lines.append("%s-%s_UUID %ld %ld %ld %3d%% %s[%s:%d]\n" % (
                    file_system,
                    obd_name,
                    keyvalues['total_kib'],
                    keyvalues['used_kib'],
                    keyvalues['remaining_kib'],
                    percent,
                    keyvalues['mount_pt'],
                    keyvalues['role'].upper(),
                    keyvalues['target_index'],
                ))
        return "".join(lines)

    def load_str(self, input_str):
        """Parse the output of ``lfs df`` to initialize self.

        Args:
            input_str (str): Text in the format emitted by ``lfs df``.  Lines
                that do not match the expected per-target layout (headers,
                summaries, blank lines) are silently ignored.
        """
        for line in input_str.splitlines():
            match = _REX_LFS_DF.search(line)
            if match is None:
                continue

            # Target UUIDs look like <file system>-<target>_UUID; keep the
            # first two pieces delimited by '-' or '_'.
            file_system, target_name = re.findall('[^-_]+', match.group(1))[0:2]

            if file_system not in self:
                self[file_system] = {}

            # Duplicates can happen if a file system is doubly mounted; the
            # most recently parsed line replaces any earlier entry.
            self[file_system][target_name] = {
                'total_kib': int(match.group(2)),
                'used_kib': int(match.group(3)),
                'remaining_kib': int(match.group(4)),
                'mount_pt': match.group(6),
                'role': match.group(7).lower(),
                'target_index': int(match.group(8)),
            }