# Source code for ensign.visualize

#!/usr/bin/env python
# ENSIGN rights
""" Visualize a tensor decomposition by plotting scores for selected components 
in each mode.
"""

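# Usage: visualize.py <comp> <input_directory> <num weight vectors> [<decomposition_type> <num_components> <use_absolute_value>]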
#
# Requires results to be inside VISUALS_INPUT_DIR and in
#  the format: decomp_mode_i.txt where "i" is the mode.
#
# If comp isn't specified then all components will be visualized.

import multiprocessing as mltprc
import sys

import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.collections import LineCollection
import numpy as np

import ensign.cp_decomp as cpd
import ensign.comp_top_k as ctk
import ensign.ensign_io.ensign_logging as ensign_logging

# Module-level logger obtained from the ENSIGN logging helper.
log = ensign_logging.get_logger()
# log.setLevel('DEBUG') # Developers uncomment this for stacktraces

# Modes with fewer labels than this are drawn as vertical bars; modes with
# this many labels or more are drawn as a line plot.
DISCRETE_COUNT = 1000
# Factor applied to the extreme scores of a mode to derive its y-axis limits.
AXIS_SCALE_OVERHEAD = 1.1
# Fraction of the mode size added as padding at both ends of the x-axis.
X_AXIS_OVERHEAD = 0.01
# Upper bound on the number of modes (subplots) drawn in one figure.
MAX_MODE_PER_FRAME = 8
# Threshold on (max score - min score) of a mode; below it _plot_component
# replaces the maximum with min score + TOLERANCE_VALUE.
TOLERANCE_VALUE = 0.001
# Resolution, in dots per inch, passed to savefig for the PNG output.
DPI = 150

def _parse_args():
    """
    Parse the command-line arguments in sys.argv.

    Exactly 3 or exactly 6 arguments must follow the script name:
    <comp> <input_directory> <num weight vectors>
    [<decomposition_type> <num_components> <use_absolute_value>].
    Only <input_directory>, <decomposition_type> and <num_components> are
    read; the remaining positions are accepted but ignored.

    Returns
    -------
    args : dict
        'decomp_dir' (str) and 'num_components' (int, -1 when not given on
        the command line). 'decomp_type' is present only when the optional
        arguments were supplied.

    Raises
    ------
    SystemExit
        If the argument count is wrong, the decomposition type is not "CP",
        or <num_components> is not an integer. The error message is used as
        the exit argument, so it is written to stderr and the exit status is
        non-zero.
    """
    argc = len(sys.argv)
    if argc not in (4, 7):
        # The original returned an undefined name here, which raised a
        # NameError instead of reporting the usage error.
        sys.exit("ERROR: visualize.py <comp> <input_directory> <num weight vectors> "
                 " [<decomposition_type> <num_components> <use_absolute_value>]")

    args = {}
    args['decomp_dir'] = sys.argv[2]
    args['num_components'] = -1

    if argc == 7:
        if sys.argv[4] != "CP":
            sys.exit("ERROR: Invalid decomposition type: " + sys.argv[4])
        args['decomp_type'] = sys.argv[4]

        try:
            args['num_components'] = int(sys.argv[5])
        except ValueError:
            sys.exit("ERROR: Invalid number of components: " + sys.argv[5])

    return args

def main():
    """
    Command-line entry point.

    Reads the decomposition from the directory given on the command line,
    settles on how many components to draw (never more than the rank of the
    decomposition) and hands off to visualize().
    """
    args = _parse_args()
    decomp = cpd.read_cp_decomp_dir(args['decomp_dir'])

    requested = args['num_components']
    if requested == -1:
        # No explicit count on the command line: draw every component.
        num_components = decomp.rank
    elif requested > decomp.rank:
        print('WARNING: More components requested than rank of the decomposition.')
        print('Using {} instead.'.format(decomp.rank))
        num_components = decomp.rank
    else:
        num_components = requested

    visualize(decomp, num_components=num_components)

def visualize(decomp, top_k=True, num_components=None):
    """
    Visualize the components of a decomposition, one PNG file per component.

    The components are divided among worker processes. Each worker runs
    draw_pdf(), which renders its components with inline=False, so every
    component is saved as '<comp_id>_comp_<comp_id>.png' (a relative path).

    Parameters
    ----------
    decomp : ensign.cp_decomp.CPDecomp
        Decomposition to visualize.
    top_k : bool
        Whether or not to include the top 10 labels of each mode in the
        visuals.
    num_components : int
        Number of components to visualize. If left as None, then the rank of
        the decomposition is used. A value larger than the rank is reduced to
        the rank, with a warning printed.

    Returns
    -------
    exitcode_sum : int
        Sum of the exit codes of the worker processes.
    """
    # Keep the boolean flag and the label lookup under separate names; a
    # falsy flag is handed to the workers unchanged so they skip the labels.
    if top_k:
        top_k_labels = ctk.get_top_k(decomp.factors, decomp.labels,
                                     list(range(decomp.rank)), 10)
    else:
        top_k_labels = top_k

    if num_components is None:
        num_components = decomp.rank
    elif num_components > decomp.rank:
        # Component ids at or beyond the rank do not exist in the decomposition.
        print('WARNING: More components requested than rank of the decomposition.')
        print('Using {} instead.'.format(decomp.rank))
        num_components = decomp.rank

    # Use half of the reported CPU count (an estimate of the physical cores,
    # rather than logical), but always at least one worker.
    num_cores = max(1, mltprc.cpu_count() // 2)

    workers = []
    for comp_range in _partition_components(num_components, num_cores):
        worker = mltprc.Process(target=draw_pdf,
                                args=(decomp, comp_range, top_k_labels))
        worker.start()
        workers.append(worker)

    exitcode_sum = 0
    for worker in workers:
        worker.join()
        exitcode_sum += worker.exitcode

    return exitcode_sum


def _partition_components(num_components, num_workers):
    """
    Split the component ids 0..num_components-1 into contiguous runs, one
    per worker.

    When the ids do not divide evenly, the first (num_components % num_workers)
    workers each take one extra id. No empty runs are produced, so fewer
    than num_workers runs are returned when there are fewer components than
    workers.

    Example: 30 components over 4 workers gives runs of sizes 8, 8, 7, 7:
    [0..7], [8..15], [16..22], [23..29].
    """
    base, extra = divmod(num_components, num_workers)
    partitions = []
    start = 0
    for worker_id in range(min(num_workers, num_components)):
        size = base + (1 if worker_id < extra else 0)
        partitions.append(list(range(start, start + size)))
        start += size
    return partitions

def draw_pdf(decomp, comp_range, top_k):
    """
    Worker body used to parallelize the drawing: renders every component id
    in comp_range through _plot_component() with inline=False and returns 0.
    """
    for component in comp_range:
        _plot_component(decomp, component, top_k=top_k, inline=False)
    return 0

def plot_component(decomp, comp_id, inline=True):
    """
    Visualizes a specific component of a decomposition using blue-line charts.

    The top 10 labels of every mode are computed for this component only and
    then passed to _plot_component(), which does the drawing.

    Parameters
    ----------
    decomp : CPDecomp
        Decomposition object from which to visualize a component.
    comp_id : int
        The ID of the component to visualize.
    inline : bool
        Set to True if using within a Jupyter notebook and False to get PNG
        file as output. The output file is named
        '<comp_id>_comp_<comp_id>.png'.

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure object representing visualization.
    """
    top_k = ctk.get_top_k(decomp.factors, decomp.labels, [comp_id], 10)
    return _plot_component(decomp, comp_id, top_k, inline=inline)

def _plot_component(decomp, comp_id, top_k, inline=True):
    """
    Driver function for plot_component(). Separated so that the Top 10 labels
    can be calculated for every component ahead of time instead of every
    process computing the Top 10 for the components it's responsible for.

    One subplot is drawn per mode, for at most MAX_MODE_PER_FRAME modes.
    Modes with fewer than DISCRETE_COUNT labels are drawn as vertical bars;
    larger modes are drawn as a line.

    Parameters
    ----------
    decomp : CPDecomp
        Decomposition object from which to visualize a component. It is only
        read; none of its attributes are assigned.
    comp_id : int
        The ID of the component to visualize.
    top_k : mapping or falsy value
        Top-K results, indexed as top_k[comp_id][mode_id]; each entry is
        iterated as (label, index, score) tuples. See ensign.comp_top_k.
        A falsy value disables the Top 10 annotations.
    inline : bool
        Set to True if using within a Jupyter notebook and False to get PNG
        file as output. The output file is named
        '<comp_id>_comp_<comp_id>.png'.

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure object representing visualization. The figure has already
        been passed to plt.close() when it is returned.
    """
    # Only the text size differs between notebook and file output.
    fig_width, fig_height, title_size = 14, 10, 14
    text_size = 10 if inline else 9

    # Cap the number of plotted modes in a local variable. Assigning the cap
    # to decomp.order would permanently alter the caller's decomposition.
    order = min(decomp.order, MAX_MODE_PER_FRAME)

    # create figure
    fig, axes = plt.subplots(order, figsize=(fig_width, fig_height))
    fig.canvas.draw()
    if order == 1:
        # plt.subplots returns a bare Axes for a single subplot
        axes = np.array([axes])

    # one (initially empty) line artist per subplot, used for the line plots
    plot_clct = [ax.plot([], [], color='blue')[0] for ax in axes]

    fig.canvas.blit(axes[0].bbox)

    fig.suptitle('Component {}, Weight: {}'.format(comp_id, decomp.weights[comp_id]),
                 fontsize=title_size, y=1.02)

    for mode_id in range(order):
        ax = axes[mode_id]

        # retrieve important values for plotting
        mode_size = decomp.mode_sizes[mode_id]
        scores = decomp.factors[mode_id][:, comp_id]
        max_val = np.max(scores)
        min_val = np.min(scores)

        # Widen a (near-)constant score column before the y-limits are
        # derived from it; applied afterwards the adjustment has no effect.
        if max_val - min_val < TOLERANCE_VALUE:
            max_val = min_val + TOLERANCE_VALUE

        # Anchor the y-axis at zero when all scores share one sign.
        if max_val > 0 and min_val > 0:
            y_min = 0
            y_max = AXIS_SCALE_OVERHEAD * max_val
        elif max_val < 0 and min_val < 0:
            y_min = AXIS_SCALE_OVERHEAD * min_val
            y_max = 0
        else:
            y_min = AXIS_SCALE_OVERHEAD * min_val
            y_max = AXIS_SCALE_OVERHEAD * max_val

        # set up axes
        x_pad = X_AXIS_OVERHEAD * mode_size
        x_lo, x_hi = -x_pad, (mode_size - 1) + x_pad
        ax.axis(xmin=x_lo, xmax=x_hi)
        ax.tick_params(labelsize=text_size)
        ax.get_xaxis().set_major_locator(ticker.MaxNLocator(integer=True))

        # draw the x-axis, as long as it is not too close to y_max or y_min
        smallest_view_increment = (y_max - y_min) / 50.0
        if y_min < -1 * smallest_view_increment and y_max > smallest_view_increment:
            ax.axhline(y=0, color='k', linewidth=0.25)

        ax.set_title(decomp.mode_names[mode_id], size=text_size)
        if (np.asarray(scores) < 0).any():
            ax.set_ylabel("Score ~ [-1, 1]", fontsize=text_size)
        else:
            ax.set_ylabel("Score ~ [0, 1]", fontsize=text_size)
        ax.set_xlabel("'{}' Label Index ~ [0, {})".format(decomp.mode_names[mode_id], decomp.mode_sizes[mode_id]), fontsize=text_size)

        # draw the scores
        if mode_size < DISCRETE_COUNT:
            # discrete plotting: one vertical segment from (index, score) to
            # (index, 0) per label, which together read as a bar chart
            segments = [[(idx, scores[idx]), (idx, 0)] for idx in range(mode_size)]
            bars = LineCollection(segments, colors='blue', linewidth=1.5)
            ax.add_collection(bars)

            ax.axis(ymin=y_min, ymax=y_max, xmin=x_lo, xmax=x_hi)
            ax.draw_artist(bars)
            plot_clct[mode_id].set_data([], [])
            ax.set_yticks(np.linspace(y_min, y_max, num=5))
        else:
            # linear plotting: x positions are the integer label indices
            # 0 .. mode_size - 1, matching the bar plot and the axis label
            plot_clct[mode_id].set_data(np.arange(mode_size), scores)
            ax.relim()
            ax.autoscale()
            ax.axis(xmin=x_lo, xmax=x_hi)
            ax.set_yticks(np.linspace(y_min, y_max, num=5))
            ax.draw_artist(plot_clct[mode_id])

        # draw top k
        if top_k:
            ymin, ymax = ax.get_ylim()
            _, xmax = ax.get_xlim()
            # the annotations start at the right edge of the axes and are
            # stacked downwards in steps of a tenth of the visible height
            text_x = xmax
            text_height = (ymax - ymin) / 10
            fontsize = text_size - (2 + (order - 5))

            for i, tup in enumerate(top_k[comp_id][mode_id]):
                label, idx, score = tup
                label = str(label)
                text_y = ymax - (text_height * (i + 1))
                # shorten long labels to their first and last 15 characters
                label = label[:15] + '...' + label[-15:] if len(label) > 30 else label
                ax.text(x=text_x, y=text_y, s="  {:04d} | {} | {:04f}".format(idx, label, score), fontsize=fontsize)
            ax.text(x=text_x, y=ymax, s="  [Top 10] Index | Label | Score", fontsize=fontsize, color='red')

    # Save the figure
    fig.subplots_adjust(hspace=.99)
    fig.tight_layout(rect=[0, 0, 1, 0.99])

    if not inline:
        fig.savefig('{}_comp_{}.png'.format(comp_id, comp_id), bbox_inches='tight', dpi=DPI)
    # Close only the figure created here; other open figures are left alone.
    plt.close(fig)
    return fig

# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()