# Source code for tokio.analysis.umami

#!/usr/bin/env python
"""
Class and tools to generate TOKIO UMAMI plots
"""

import json
import datetime
import collections
import textwrap
import numpy
import pandas

# Defaults for the keyword arguments of Umami.plot().

# Line width used for both the timeseries and the boxplot lines
DEFAULT_LINEWIDTH = 1
# Color of the line drawn in the timeseries panels
DEFAULT_LINECOLOR = "#853692"
# Font size used for UMAMI labels and tick labels
DEFAULT_FONTSIZE = 12
# Highlight colors, in order, for data at or below the 25th, 50th, 75th, and
# 100th percentiles
DEFAULT_COLORSCALE = ['#DA0017', '#FD6A07', '#40A43A', '#2C69A9']
# (width, height) in inches of a single UMAMI row; the height is multiplied by
# the number of rows to size the whole figure
DEFAULT_FIGSIZE = (6.0, 12.0 / 9.0)

class Umami(collections.OrderedDict):
    """Ordered mapping that stores all of the data needed to draw an UMAMI diagram.

    It is keyed by a metric name, and values are UmamiMetric objects which
    contain timestamps (x values) and measurements (y values).  Rows of the
    diagram are drawn in the insertion order of the keys.
    """

    def to_dict(self):
        """Convert this object (and all of its constituent metrics) into a dict.

        Returns:
            dict: keyed by metric name; each value is the ``__dict__`` of the
            corresponding metric object (the live attribute dictionary, not a
            copy).
        """
        return {k: v.__dict__ for k, v in self.items()}

    def _to_dict_for_pandas(self, stringify_key=False):
        """Pivot the stored metrics into a dict of rows keyed by timestamp.

        The result is shaped for ``pandas.DataFrame.from_dict(...,
        orient='index')``: each key is a timestamp and each value is a dict
        mapping metric name to the measurement taken at that timestamp.  The
        other metric attributes (labels, etc) are not expressed.

        Args:
            stringify_key (bool): if True, use ``str(timestamp)`` as the key
                instead of the timestamp object itself (needed for JSON,
                whose object keys must be strings)

        Returns:
            dict: ``{timestamp: {metric_name: value, ...}, ...}``
        """
        to_df = {}
        for metric, measurement in self.items():
            for timestamp, value in zip(measurement.timestamps, measurement.values):
                key = str(timestamp) if stringify_key else timestamp
                to_df.setdefault(key, {})[metric] = value
        return to_df

    def to_json(self):
        """Serialize self into a JSON string

        Returns:
            str: JSON representation of numerical data being plotted
        """
        return json.dumps(self._to_dict_for_pandas(stringify_key=True),
                          indent=4,
                          sort_keys=True)

    def to_dataframe(self):
        """Return a representation of self as pandas.DataFrame

        Returns:
            pandas.DataFrame: numerical representation of the values being
            plotted, indexed by timestamp with one column per metric
        """
        return pandas.DataFrame.from_dict(self._to_dict_for_pandas(), orient='index')

    def plot(self, output_file=None, highlight_index=-1,
             linewidth=DEFAULT_LINEWIDTH, linecolor=DEFAULT_LINECOLOR,
             colorscale=DEFAULT_COLORSCALE, fontsize=DEFAULT_FONTSIZE,
             figsize=DEFAULT_FIGSIZE):
        """Create a graphical representation of the UMAMI object

        Each metric becomes one row made of a timeseries panel (left) and a
        boxplot panel (right) summarizing every measurement except the
        highlighted one.

        Args:
            output_file (str or None): save umami diagram to file of given name
            highlight_index (int): index of measurement to highlight
            linewidth (int): linewidth for both timeseries and boxplot lines
            linecolor (str): color of line in timeseries panels
            colorscale (list of str): colors to use for data below the 25th,
                50th, 75th, and 100th percentiles
            fontsize (int): font size for UMAMI labels
            figsize (tuple of float): x, y dimensions of a single UMAMI row;
                the y dimension is multiplied by the number of rows to
                determine full diagram height

        Returns:
            list: List of matplotlib Axes objects (``fig.axes``) corresponding
            to each panel in the UMAMI diagram

        Raises:
            ValueError: if this object contains no metrics to plot
        """
        rows_to_plot = list(self.keys())
        num_rows = len(rows_to_plot)
        if num_rows == 0:
            # fail before a figure is created rather than deep inside matplotlib
            raise ValueError("cannot plot an Umami object that contains no metrics")

        # import here because of various things that can break matplotlib on
        # import.  gridspec and dates are imported explicitly instead of
        # relying on pyplot to load them as a side effect.
        import matplotlib.pyplot
        import matplotlib.gridspec
        import matplotlib.dates

        fig = matplotlib.pyplot.figure()
        fig.set_size_inches(figsize[0], num_rows * figsize[1])

        # Required to adjust the column widths of our figure (width_ratios)
        grid = matplotlib.gridspec.GridSpec(
            num_rows,               # how many rows to draw
            2,                      # how many columns to draw
            width_ratios=[4, 1])    # ratio of column widths

        # Get the full range of x so we can force all rows to share the same
        # x range in the presence of trailing/leading NaNs
        x_min = min(min(measurement.timestamps) for measurement in self.values())
        x_max = max(max(measurement.timestamps) for measurement in self.values())

        # Draw UMAMI rows
        last_ax_ts = None
        for row_num, measurement in enumerate(self.values()):
            x_val = measurement.timestamps
            y_val = measurement.values

            ### first plot the timeseries of the given variable
            ax_ts = fig.add_subplot(grid[2 * row_num])
            ax_ts.plot(x_val, y_val,
                       linestyle='-',
                       marker='x',
                       linewidth=linewidth,
                       color=linecolor)

            # textwrap.wrap inserts line breaks into each label
            wrapped_label = '\n'.join(textwrap.wrap(text=measurement.label,
                                                    width=15,
                                                    break_on_hyphens=True))
            ax_ts.set_ylabel(wrapped_label,
                             fontsize=fontsize,
                             rotation=0,
                             horizontalalignment='right',
                             verticalalignment='center')
            ax_ts.grid()
            ax_ts.set_xlim(x_min, x_max)

            # blank out the labels for all subplots except the bottom-most one
            if row_num != num_rows - 1:
                ax_ts.set_xticklabels([])
            else:
                last_ax_ts = ax_ts
                # resize and rotate the labels for the timeseries plot.
                # Tick.label was removed in matplotlib 3.8; label1 is the
                # same object and exists in older releases as well.
                for tick in ax_ts.xaxis.get_major_ticks():
                    tick.label1.set_fontsize(fontsize)
                    tick.label1.set_rotation(45)

            # also adjust the font size for the y labels
            for tick in ax_ts.yaxis.get_major_ticks():
                tick.label1.set_fontsize(fontsize)

            ### then plot the boxplot summary of the given variable
            ax_box = fig.add_subplot(grid[2 * row_num + 1])
            # the highlighted measurement is excluded from the distribution
            # it is being compared against, and NaNs are dropped
            y_box_data = numpy.array(y_val)
            y_box_mask = [True] * len(y_box_data)
            y_box_mask[highlight_index] = False
            y_box_data = y_box_data[y_box_mask]
            y_box_data = y_box_data[~numpy.isnan(y_box_data)]
            ax_box.boxplot(y_box_data,
                           widths=0.70,
                           boxprops={'linewidth': linewidth},
                           medianprops={'linewidth': linewidth},
                           whiskerprops={'linewidth': linewidth},
                           capprops={'linewidth': linewidth},
                           flierprops={'linewidth': linewidth},
                           whis=[5, 95])

            # scale the extents of the y ranges a little for clarity
            orig_ylim = ax_ts.get_ylim()
            new_ylim = [orig_ylim[0] * (1 - 0.1), orig_ylim[1] * (1 + 0.1)]
            ax_ts.set_ylim(new_ylim)

            yticks = ax_ts.get_yticks().tolist()
            # the following is a heuristic to determine how close the topmost
            # tick label is to the edge of the plot.  if it's too close, blank
            # it out so it doesn't overlap with the bottom-most tick label
            # of the row above it
            critical_fraction = abs(
                1.0 - (yticks[-1] - new_ylim[0]) / (new_ylim[-1] - new_ylim[0]))
            if row_num > 0 and critical_fraction < 0.01:
                # note that setting one of the yticks to a string resets the
                # formatting so that the tick labels appear as floats.  since
                # we (hopefully) would get integral ticks otherwise, force
                # them to ints.  This will mess things up if the yrange is
                # very narrow and must be expressed as floats.
                yticks = list(map(int, yticks))
                yticks[-1] = " "
                ax_ts.set_yticklabels(yticks)

            # lock in the y range to match the timeseries plot, just in case
            ax_box.set_ylim(ax_ts.get_ylim())

            # determine the color of our highlights based on quartile.  Use
            # the same data as the boxplot so the color agrees with the
            # distribution drawn next to it for any highlight_index.
            percentiles = [numpy.nanpercentile(y_box_data, percentile)
                           for percentile in (25, 50, 75, 100)]
            color_index = 0
            for color_index, percentile in enumerate(percentiles):
                if y_val[highlight_index] <= percentile:
                    break
            if measurement.big_is_good:
                highlight_color = colorscale[color_index]
            else:
                # reverse the scale when smaller values are the good ones
                highlight_color = colorscale[(1 + color_index) * -1]

            # highlight the chosen measurement on the timeseries plot
            y_highlight = y_val[highlight_index]
            x_highlight = matplotlib.dates.date2num(x_val[highlight_index])
            if highlight_index >= 0:
                highlight_pos = highlight_index
            else:
                highlight_pos = len(y_val) + highlight_index
            # only draw the connecting segment if a preceding point exists;
            # otherwise index - 1 would wrap around to the end of the series
            if highlight_pos > 0:
                x_previous = matplotlib.dates.date2num(x_val[highlight_pos - 1])
                ax_ts.plot([x_previous, x_highlight],
                           [y_val[highlight_pos - 1], y_highlight],
                           linestyle='-',
                           color=highlight_color,
                           linewidth=linewidth * 2.0)
            ax_ts.plot([x_highlight], [y_highlight],
                       marker='*',
                       color=highlight_color,
                       markersize=15)

            # where does this highlighted data point lie on the distribution?
            ax_box.plot([0, 2], [y_highlight, y_highlight],
                        linestyle='--',
                        color=highlight_color,
                        linewidth=2.0,
                        zorder=10)

            # blank out all labels
            ax_box.set_yticklabels([""])
            ax_box.set_xticklabels([""])
            ax_box.yaxis.grid()

        fig.subplots_adjust(hspace=0.0, wspace=0.0)
        fig.autofmt_xdate()
        last_ax_ts.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%b %d'))

        if output_file is not None:
            fig.savefig(output_file, bbox_inches="tight")

        return fig.axes
class UmamiMetric(object):
    """A single row of an UMAMI diagram.

    Logically contains timeseries data from a single connector, where the
    `timestamps` attribute is a list of timestamps and the `values` attribute
    is a list of values corresponding to each timestamp.  The number of
    timestamps and values must always be the same.
    """

    def __init__(self, timestamps, values, label, big_is_good=True):
        """Store private list copies of the given timeseries.

        Args:
            timestamps: pandas.Series or other iterable of timestamps
            values: pandas.Series or other iterable of measurements, one per
                timestamp
            label (str): text label associated with this metric
            big_is_good (bool): True if larger values are better

        Raises:
            ValueError: if timestamps and values differ in length
        """
        timestamps = self._as_list(timestamps)
        values = self._as_list(values)
        if len(timestamps) != len(values):
            raise ValueError('timestamps and values must be of equal length')

        self.timestamps = timestamps
        self.values = values
        self.label = label
        self.big_is_good = big_is_good

    @staticmethod
    def _as_list(sequence):
        """Return a new list holding the elements of sequence.

        Always building a real list (rather than slicing the input) keeps
        append() and pop() working for tuples and arrays and guarantees the
        caller's object is never aliased.
        """
        if isinstance(sequence, pandas.Series):
            return sequence.tolist()
        return list(sequence)

    def to_json(self):
        """Create JSON-encoded string representation of self

        Returns:
            str: JSON-encoded representation of values stored in UmamiMetric
        """
        return json.dumps(self.__dict__, default=_serialize_datetime)

    def append(self, timestamp, value):
        """Add one measurement; values can only be added with a timestamp.

        Args:
            timestamp: timestamp of the new measurement
            value: the new measurement
        """
        self.timestamps.append(timestamp)
        self.values.append(value)

    def pop(self):
        """Remove and return the last measurement, like the list .pop() method.

        Returns:
            tuple: (timestamp, value) of the removed measurement
        """
        timestamp = self.timestamps.pop()
        value = self.values.pop()
        return timestamp, value
def _serialize_datetime(obj):
    """JSON ``default`` hook that encodes dates and datetimes as epoch seconds.

    Args:
        obj: object that the json encoder could not serialize natively

    Returns:
        float: seconds between 1970-01-01 00:00:00 and ``obj``.  Naive
        datetimes are compared against a naive epoch as-is, timezone-aware
        datetimes are converted to UTC first, and plain dates are treated as
        midnight at the start of that day.

    Raises:
        TypeError: if ``obj`` is neither a datetime.datetime nor a
            datetime.date
    """
    # datetime.datetime is a subclass of datetime.date, so test it first
    if isinstance(obj, datetime.datetime):
        offset = obj.utcoffset()
        if offset is not None:
            # aware and naive datetimes cannot be subtracted from each other;
            # shift to UTC and drop the tzinfo
            obj = obj.replace(tzinfo=None) - offset
    elif isinstance(obj, datetime.date):
        # date - datetime is not a supported operation; promote to midnight
        obj = datetime.datetime(obj.year, obj.month, obj.day)
    else:
        raise TypeError("Type %s not serializable" % type(obj))
    return (obj - datetime.datetime(1970, 1, 1)).total_seconds()