# Source code for tokio.cli.compare_isdct

"""
Compare two NERSC ISDCT dumps and report
    1. the devices that appeared or were removed
    2. the numeric counters whose values changed
    3. the string counters whose contents changed
"""

import json
import argparse
import warnings
import datetime
import tokio.connectors.nersc_isdct
from tokio.common import isstr

# Counters that may indicate a hardware issue.  discover_errors() reports every
# device whose diff contains one of these keys, so if the following keys report
# changes, the drive should be flagged.
ERROR_KEYS = [
    'crc_error_count',
    'critical_warnings',
    'end_to_end_error_detection_count',
    'erase_fail_count',
    'media_errors',
    'number_of_error_info_log_entries',
    'pci_link_gen_speed',
    'pci_link_width',
    'program_fail_count',
    'smart_crc_error_count_raw',
    'smart_endto_end_error_detection_count_raw',
    'smart_erase_fail_count_raw',
    'smart_pli_lock_loss_count_raw',
    'smart_program_fail_count_raw',
    'thermal_throttle_count',
]

def reduce_diff(diff_dict):
    """
    Take the raw output of .diff() and aggregate the results of each device

    Args:
        diff_dict (dict): Must contain a ``devices`` key whose values are
            dicts mapping counter names to their differences.

    Returns:
        dict: Flat dictionary keyed by ``<reduction>_<counter>``.
        ``count_*`` is the number of devices reporting the counter (string
        or numeric).  ``sum_*``, ``min_*`` and ``max_*`` are computed over
        the non-string values only, and ``ave_*`` is ``sum_*`` divided by
        ``count_*``.
    """
    reduced = {
        'sum': {},
        'min': {},
        'max': {},
        'count': {},
    }

    for counters in diff_dict['devices'].values():
        for counter, value in counters.items():
            # every device reporting the counter is counted, strings included
            reduced['count'][counter] = reduced['count'].get(counter, 0) + 1

            # strings cannot be summed or compared numerically
            if isstr(value):
                continue

            # Decide "first numeric value" from the numeric tables themselves
            # rather than from the count table: a counter whose first
            # occurrence was a string has a count but no sum/min/max yet, and
            # would otherwise raise KeyError on the in-place update.
            if counter not in reduced['sum']:
                reduced['sum'][counter] = value
                reduced['min'][counter] = value
                reduced['max'][counter] = value
            else:
                reduced['sum'][counter] += value
                reduced['min'][counter] = min(value, reduced['min'][counter])
                reduced['max'][counter] = max(value, reduced['max'][counter])

    # flatten {reduction: {counter: value}} into {"reduction_counter": value}
    result = {}
    for reduction, counters in reduced.items():
        for counter, value in counters.items():
            result["%s_%s" % (reduction, counter)] = value
            # the average is derived from the sum, so emit it alongside
            if reduction == 'sum':
                result["%s_%s" % ('ave', counter)] = \
                    float(value) / reduced['count'][counter]

    return result
[docs]def _convert_counters(counters, conversion_factor, label): """ Convert a single flat dictionary of counters of bytes into another unit """ results = {} # convert each relevant key for counter, value in counters.items(): if counter.endswith('_bytes'): new_key = counter.replace('_bytes', "_" + label) new_value = value * conversion_factor else: new_key = counter new_value = value results[new_key] = new_value return results
def convert_byte_keys(input_dict, conversion_factor=2.0**(-30.0), label="gibs"):
    """
    Convert all keys ending in _bytes to some other unit.  Accepts either the
    raw diff dict or the reduced dict from reduce_diff()

    Args:
        input_dict (dict): Either a dict with a ``devices`` key (whose values
            are per-device counter dicts) or a flat dict of counters.
        conversion_factor (float): Multiplier applied to ``_bytes`` values.
        label (str): Unit name that replaces ``bytes`` in converted keys.

    Returns:
        dict: Converted copy of ``input_dict``.  The input is not modified.
    """
    # reduced (flat) dict
    if 'devices' not in input_dict:
        return _convert_counters(input_dict, conversion_factor, label)

    # Raw diff dict.  dict.copy() is shallow, so the copy still shares the
    # 'devices' dict with the caller; rebind it to a freshly built dict rather
    # than assigning into it, which would overwrite the caller's data.
    results = input_dict.copy()
    results['devices'] = {
        serial_no: _convert_counters(counters, conversion_factor, label)
        for serial_no, counters in input_dict['devices'].items()
    }
    return results
def summarize_reduced_diffs(reduced_diffs):
    """
    Build a human-readable summary of the relevant reduced diff data

    Args:
        reduced_diffs (dict): Flat dictionary of reduced counters.  Data
            volumes are read from ``sum_data_units_{read,written}_gibs`` when
            present, otherwise from ``sum_data_units_{read,written}_bytes``.
            Missing counters are treated as zero.

    Returns:
        str: Three newline-terminated lines reporting the volume read, the
        volume written (both in TiB, with operation counts in millions), and
        the write amplification factor.
    """
    def _sum_tibs(direction):
        """Return the summed data volume for ``direction`` in TiB"""
        gibs_key = 'sum_data_units_%s_gibs' % direction
        if gibs_key in reduced_diffs:
            # already converted to GiB; scale by 2**-10 so the value matches
            # the TiB label it is printed with
            return reduced_diffs[gibs_key] * 2.0**(-10)
        bytes_key = 'sum_data_units_%s_bytes' % direction
        return reduced_diffs.get(bytes_key, 0) * 2.0**(-40)

    read_tibs = _sum_tibs('read')
    write_tibs = _sum_tibs('written')

    ### General summary
    buf = ""
    buf += "Read: %10.2f TiB, %10.2f MOps\n" % (
        read_tibs,
        reduced_diffs.get('sum_host_read_commands', 0) / 1000000.0)
    buf += "Written: %10.2f TiB, %10.2f MOps\n" % (
        write_tibs,
        reduced_diffs.get('sum_host_write_commands', 0) / 1000000.0)
    buf += "WAF: %+10.4f\n" % reduced_diffs.get('max_write_amplification_factor', 0)

    return buf
def summarize_errors(diff_dict, isdct_data):
    """
    Build a human-readable summary of any bad SSDs

    Args:
        diff_dict (dict): Diff dictionary; passed to discover_errors() and
            indexed as ``diff_dict['devices'][serial_no][error_key]``.
        isdct_data: Indexed as ``isdct_data[serial_no]['node_name']`` to
            look up the node name reported for each device.

    Returns:
        str: One line per (device, error counter) pair in the form
        ``node_name serial_no error_key value``, joined by newlines with no
        trailing newline.  Empty string if nothing was flagged.
    """
    report_lines = []
    for serial_no, error_key in discover_errors(diff_dict):
        node_name = isdct_data[serial_no]['node_name']
        changed_by = diff_dict['devices'][serial_no][error_key]
        report_lines.append(
            "%s %s %s %s" % (node_name, serial_no, error_key, changed_by))
    return "\n".join(report_lines)
def discover_errors(diff_dict):
    """
    Look through all diffs and report serial numbers of devices that show
    changes in counters that may indicate a hardware issue.

    Args:
        diff_dict (dict): Diff dictionary, expected to carry a ``devices``
            key mapping serial numbers to per-device counter dicts.

    Returns:
        list of tuple: One ``(serial_no, error_key)`` pair for every key in
        ERROR_KEYS that appears in a device's counters.  Empty (and a warning
        is issued) when ``diff_dict`` has no ``devices`` key.
    """
    if 'devices' in diff_dict:
        return [
            (serial_no, error_key)
            for serial_no, counters in diff_dict['devices'].items()
            for error_key in ERROR_KEYS
            if error_key in counters
        ]

    # nothing to inspect; tell the caller rather than failing outright
    warnings.warn("No 'devices' found in diff dict")
    return []
def main(argv=None):
    """Entry point for the CLI interface

    Loads the two ISDCT dumps named on the command line, diffs the newer
    against the older, and prints either a summary (``--summary``), the
    per-device diff as JSON (``--all``), or the reduced diff as JSON
    (default).

    Args:
        argv (list of str): Command-line arguments; handed to
            ``ArgumentParser.parse_args``.
    """
    parser = argparse.ArgumentParser()
    boolean_flags = (
        ("-a", "--all", 'report changes for each device'),
        ("-g", "--gibs", "report in units of GiB"),
        ("-s", "--summary", "print summary of differences"),
        ("-z", "--report-zeros", 'include counters that do not change'),
    )
    for short_opt, long_opt, description in boolean_flags:
        parser.add_argument(short_opt, long_opt, action='store_true',
                            help=description)
    parser.add_argument("old_isdctfile", help="older ISDCT dump file")
    parser.add_argument("new_isdctfile", help="newer ISDCT dump file")
    args = parser.parse_args(argv)

    load_isdct = tokio.connectors.nersc_isdct.NerscIsdct
    old_isdctfile = load_isdct(args.old_isdctfile)
    new_isdctfile = load_isdct(args.new_isdctfile)

    diff_dict = new_isdctfile.diff(old_isdctfile,
                                   report_zeros=args.report_zeros)
    if args.gibs:
        diff_dict = convert_byte_keys(diff_dict)

    # --summary takes precedence over --all
    if args.summary:
        # NOTE(review): print_summary is not defined in the visible part of
        # this module -- confirm it exists elsewhere in the file
        print_summary(old_isdctfile, new_isdctfile, diff_dict)
        return

    # without --all, collapse the per-device diffs before dumping them
    to_dump = diff_dict if args.all else reduce_diff(diff_dict)
    print(json.dumps(to_dump, indent=4, sort_keys=True))