#!/usr/bin/env python
"""
Class and tools to generate TOKIO UMAMI plots
"""
import json
import datetime
import collections
import textwrap
import numpy
import pandas
# Defaults for the keyword arguments of Umami.plot()

# Line width used for both the timeseries and the boxplot lines
DEFAULT_LINEWIDTH = 1

# Color of the line drawn in the timeseries panels
DEFAULT_LINECOLOR = "#853692"

# Font size of the UMAMI labels
DEFAULT_FONTSIZE = 12

# Highlight colors for data below the 25th, 50th, 75th, and 100th percentiles
DEFAULT_COLORSCALE = [
    "#DA0017",
    "#FD6A07",
    "#40A43A",
    "#2C69A9",
]

# (x, y) dimensions of a single UMAMI row
DEFAULT_FIGSIZE = (
    6.0,
    12.0 / 9.0,
)
class Umami(collections.OrderedDict):
    """Ordered mapping that holds all of the data needed to draw an UMAMI diagram.

    Keys are metric names and values are UmamiMetric-like objects exposing
    ``timestamps`` (x values), ``values`` (y values), ``label`` and
    ``big_is_good`` attributes.  Insertion order determines the order in
    which rows are drawn by :meth:`plot`.
    """

    def to_dict(self):
        """Convert this object into a plain dictionary

        Returns:
            dict: Keyed by metric name; each value is the ``__dict__`` of the
            corresponding metric object (not a copy).
        """
        return {k: v.__dict__ for k, v in self.items()}

    def _to_dict_for_pandas(self, stringify_key=False):
        """Pivot the stored metrics into a dict of dicts keyed by timestamp

        The result has the form ``{timestamp: {metric_name: value}}``, which
        is suitable for ``pandas.DataFrame.from_dict(..., orient='index')``.
        The metric attributes (labels, etc) are not expressed.

        Args:
            stringify_key (bool): if True, use ``str(timestamp)`` as the key
                instead of the timestamp object itself

        Returns:
            dict: values of every metric, grouped by timestamp
        """
        to_df = {}
        for metric, measurement in self.items():
            for timestamp, value in zip(measurement.timestamps, measurement.values):
                key = str(timestamp) if stringify_key else timestamp
                to_df.setdefault(key, {})[metric] = value
        return to_df

    def to_json(self):
        """Serialize self into a JSON string

        Returns:
            str: JSON representation of numerical data being plotted
        """
        return json.dumps(self._to_dict_for_pandas(stringify_key=True),
                          indent=4,
                          sort_keys=True)

    def to_dataframe(self):
        """Return a representation of self as pandas.DataFrame

        Returns:
            pandas.DataFrame: numerical representation of the values being
            plotted, indexed by timestamp with one column per metric
        """
        return pandas.DataFrame.from_dict(self._to_dict_for_pandas(), orient='index')

    def plot(self, output_file=None,
             highlight_index=-1,
             linewidth=DEFAULT_LINEWIDTH,
             linecolor=DEFAULT_LINECOLOR,
             colorscale=DEFAULT_COLORSCALE,
             fontsize=DEFAULT_FONTSIZE,
             figsize=DEFAULT_FIGSIZE):
        """Create a graphical representation of the UMAMI object

        Each metric becomes one row made of a timeseries panel (left) and a
        boxplot summary panel (right).  The measurement at `highlight_index`
        is excluded from the boxplot and drawn as a colored highlight whose
        color reflects the quartile it falls into.

        Args:
            output_file (str or None): save umami diagram to file of given name
            highlight_index (int): index of measurement to highlight
            linewidth (int): linewidth for both timeseries and boxplot lines
            linecolor (str): color of line in timeseries panels
            colorscale (list of str): colors to use for data below the 25th,
                50th, 75th, and 100th percentiles
            fontsize (int): font size for UMAMI labels
            figsize (tuple of float): x, y dimensions of a single UMAMI row;
                multiplied by len(self.keys()) to determine full diagram height

        Returns:
            list: List of matplotlib.axes.Axes objects corresponding to each
            panel in the UMAMI diagram

        Raises:
            ValueError: if this object contains no metrics
            IndexError: if `highlight_index` is out of range for a metric
        """
        # import here because of various things that can break matplotlib on import
        import matplotlib.pyplot
        # imported explicitly rather than relying on pyplot pulling them in
        import matplotlib.dates
        import matplotlib.gridspec

        rows_to_plot = list(self.keys())
        if not rows_to_plot:
            raise ValueError("cannot plot an Umami object that contains no metrics")

        fig = matplotlib.pyplot.figure()
        fig.set_size_inches(figsize[0], len(rows_to_plot) * figsize[1])

        # Required to adjust the column widths of our figure (width_ratios)
        gridspec = matplotlib.gridspec.GridSpec(
            len(rows_to_plot),    # how many rows to draw
            2,                    # how many columns to draw
            width_ratios=[4, 1])  # ratio of column widths

        # Get the full range of x so we can force all rows to share the same
        # x range in the presence of trailing/leading NaNs
        x_min = min(min(measurement.timestamps) for measurement in self.values())
        x_max = max(max(measurement.timestamps) for measurement in self.values())

        # Draw UMAMI rows
        last_ax_ts = None
        for row_num, measurement in enumerate(self.values()):
            x_val = measurement.timestamps
            y_val = measurement.values

            # Resolve negative indices to an absolute position so that
            # "the previous point" never wraps around to the end of the
            # series; raises IndexError when highlight_index is out of range
            highlight_pos = range(len(y_val))[highlight_index]
            y_highlight = y_val[highlight_pos]

            ### first plot the timeseries of the given variable
            ax_ts = fig.add_subplot(gridspec[2 * row_num])
            ax_ts.plot(x_val,
                       y_val,
                       linestyle='-',
                       marker='x',
                       linewidth=linewidth,
                       color=linecolor)

            # textwrap.wrap inserts line breaks into each label
            ax_ts.set_ylabel('\n'.join(textwrap.wrap(text=measurement.label,
                                                     width=15,
                                                     break_on_hyphens=True)),
                             fontsize=fontsize,
                             rotation=0,
                             horizontalalignment='right',
                             verticalalignment='center')
            ax_ts.grid()
            ax_ts.set_xlim(x_min, x_max)

            # blank out the labels for all subplots except the bottom-most one
            if row_num != len(rows_to_plot) - 1:
                ax_ts.set_xticklabels([])
            else:
                last_ax_ts = ax_ts
                # resize and rotate the labels for the timeseries plot.
                # Tick.label was removed from matplotlib; label1 is the
                # attribute it used to alias.
                for tick in ax_ts.xaxis.get_major_ticks():
                    tick.label1.set_fontsize(fontsize)
                    tick.label1.set_rotation(45)

            # also adjust the font size for the y labels
            for tick in ax_ts.yaxis.get_major_ticks():
                tick.label1.set_fontsize(fontsize)

            ### then plot the boxplot summary of the given variable
            ax_box = fig.add_subplot(gridspec[2 * row_num + 1])
            # the distribution excludes the highlighted measurement and NaNs
            y_box_data = numpy.delete(numpy.array(y_val), highlight_pos)
            y_box_data = y_box_data[~numpy.isnan(y_box_data)]
            ax_box.boxplot(y_box_data,
                           widths=0.70,
                           boxprops={'linewidth': linewidth},
                           medianprops={'linewidth': linewidth},
                           whiskerprops={'linewidth': linewidth},
                           capprops={'linewidth': linewidth},
                           flierprops={'linewidth': linewidth},
                           whis=[5, 95])

            # scale the extents of the y ranges a little for clarity.  Pad by
            # 10% of each limit's magnitude so the range always grows, even
            # when a limit is negative.
            orig_ylim = ax_ts.get_ylim()
            new_ylim = [orig_ylim[0] - 0.1 * abs(orig_ylim[0]),
                        orig_ylim[1] + 0.1 * abs(orig_ylim[1])]
            ax_ts.set_ylim(new_ylim)

            yticks = ax_ts.get_yticks().tolist()

            # the following is a heuristic to determine how close the topmost
            # tick label is to the edge of the plot.  if it's too close, blank
            # it out so it doesn't overlap with the bottom-most tick label
            # of the row above it
            critical_fraction = abs(
                1.0 - (yticks[-1] - new_ylim[0]) / (new_ylim[-1] - new_ylim[0]))
            if row_num > 0 and critical_fraction < 0.01:
                # note that setting one of the yticks to a string resets the
                # formatting so that the tick labels appear as floats.  since
                # we (hopefully) would get integral ticks otherwise, force
                # them to ints.  This will mess things up if the yrange is
                # very narrow and must be expressed as floats.
                yticks = list(map(int, yticks))
                yticks[-1] = " "
                ax_ts.set_yticklabels(yticks)

            # lock in the y range to match the timeseries plot, just in case
            ax_box.set_ylim(ax_ts.get_ylim())

            # determine the color of our highlights based on quartile, using
            # the same population that was drawn in the boxplot
            percentiles = [numpy.nanpercentile(y_box_data, percentile)
                           for percentile in (25, 50, 75, 100)]
            color_index = 0
            for color_index, percentile in enumerate(percentiles):
                if y_highlight <= percentile:
                    break
            if measurement.big_is_good:
                highlight_color = colorscale[color_index]
            else:
                # reverse the color scale when small values are desirable
                highlight_color = colorscale[-(color_index + 1)]

            # highlight the chosen measurement on the timeseries plot
            x_highlight = matplotlib.dates.date2num(x_val[highlight_pos])
            if highlight_pos > 0:
                # emphasize the segment leading into the highlighted point
                x_previous = matplotlib.dates.date2num(x_val[highlight_pos - 1])
                ax_ts.plot([x_previous, x_highlight],
                           [y_val[highlight_pos - 1], y_highlight],
                           linestyle='-',
                           color=highlight_color,
                           linewidth=linewidth * 2.0)
            ax_ts.plot([x_highlight], [y_highlight],
                       marker='*',
                       color=highlight_color,
                       markersize=15)

            # where does this highlighted data point lie on the distribution?
            ax_box.plot([0, 2],
                        [y_highlight, y_highlight],
                        linestyle='--',
                        color=highlight_color,
                        linewidth=2.0,
                        zorder=10)

            # blank out all labels
            ax_box.set_yticklabels([""])
            ax_box.set_xticklabels([""])
            ax_box.yaxis.grid()

        fig.subplots_adjust(hspace=0.0, wspace=0.0)
        fig.autofmt_xdate()
        last_ax_ts.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%b %d'))

        if output_file is not None:
            fig.savefig(output_file, bbox_inches="tight")

        return fig.axes
class UmamiMetric(object):
    """A single row of an UMAMI diagram.

    Logically contains timeseries data from a single connector, where the
    `timestamps` attribute is a list of timestamps and the `values` attribute
    is a list of values corresponding to each timestamp.  The number of
    timestamps and values must always be the same.

    Args:
        timestamps: pandas.Series or other iterable of timestamps; copied
            into a new list
        values: pandas.Series or other iterable of measurements; copied into
            a new list
        label (str): human-readable label for this metric
        big_is_good (bool): True if larger values are more desirable

    Raises:
        ValueError: if `timestamps` and `values` differ in length
    """
    def __init__(self, timestamps, values, label, big_is_good=True):
        # Always store independent lists so that append()/pop() work no
        # matter what sequence type was passed in (tuple, numpy array, ...)
        # and so that the caller's data is never mutated.
        self.timestamps = self._as_list(timestamps)
        self.values = self._as_list(values)
        self.label = label
        self.big_is_good = big_is_good

        if len(self.timestamps) != len(self.values):
            raise ValueError('timestamps and values must be of equal length')

    @staticmethod
    def _as_list(sequence):
        """Return a new list holding the elements of `sequence`."""
        if isinstance(sequence, pandas.Series):
            # tolist() already returns a freshly built list
            return sequence.tolist()
        return list(sequence)

    def to_json(self):
        """Create JSON-encoded string representation of self

        Returns:
            str: JSON-encoded representation of values stored in UmamiMetric
        """
        return json.dumps(self.__dict__, default=_serialize_datetime)

    def append(self, timestamp, value):
        """Add a single measurement

        A value can only be added along with its timestamp so that both
        lists stay the same length.

        Args:
            timestamp: x value of the new measurement
            value: y value of the new measurement
        """
        self.timestamps.append(timestamp)
        self.values.append(value)

    def pop(self):
        """Remove and return the most recent measurement

        Analogous to the list .pop() method.

        Returns:
            tuple: (timestamp, value) of the removed measurement
        """
        timestamp = self.timestamps.pop()
        value = self.values.pop()
        return timestamp, value
[docs]def _serialize_datetime(obj):
"""
Special serializer function that converts datetime into something that can
be encoded in json
"""
if isinstance(obj, (datetime.datetime, datetime.date)):
return (obj - datetime.datetime.utcfromtimestamp(0)).total_seconds()
raise TypeError("Type %s not serializable" % type(obj))