# Source code for ensign.comp_top_k

# ENSIGN rights
"""Module for investigating the contents of a tensor decomposition by listing
the highest scoring labels in a component.
"""

import numpy as np

import ensign.ensign_io.ensign_logging as ensign_logging

# Module-level logger; get_top_k uses it to report validation errors and warnings.
log = ensign_logging.get_logger()

def get_top_k(factors, labels, comp_ids, k=10):
    """Computes the top k labels and corresponding scores for each mode for
    each component in the list.

    Entries are ranked by the absolute value of their score, largest
    magnitude first. Entries whose score is exactly zero are dropped, so a
    mode may yield fewer than ``k`` tuples (possibly none).

    Parameters
    ----------
    factors : list of ndarray
        Factor matrices formatted as a list of ndarrays. Each factor matrix
        should be represented as an ndarray and have shape (mode_size, rank).
        The rank is read from the first factor matrix.
    labels : list of lists of str
        List of label maps, each of which formatted as a list of strings.
        e.g. labels[1][5] should get the 5th label of mode 1.
    comp_ids : list of int
        List of component ids. Ids outside ``range(rank)`` are skipped and a
        warning is logged for each of them.
    k : int
        The number of labels and scores to return from each mode for each
        component.

    Returns
    -------
    ret : dict
        Nested dictionaries containing sorted lists of tuples. Keys are
        component ID and mode ID.
        Each tuple contains a label, its index in the mode, and its score
        in the factor matrix. (label, idx, score)
        e.g. Retrieve a tuple of the highest scoring label
        from mode 2 of component 10: ret[10][2][0]

    Raises
    ------
    ValueError
        If ``k`` is less than 1.
    IOError
        If the number of factor matrices differs from the number of label
        maps, or if no factor matrices were passed.
    """
    if k < 1:
        msg = 'The value passed to -k should be a positive integer.'
        log.error(msg)
        raise ValueError(msg)

    if len(factors) != len(labels):
        msg = 'The number of factor matrices (decomp_mode_<x>.txt files) does not equal the number of label maps (map_mode_<x>.txt files).'
        log.error(msg)
        raise IOError(msg)

    order = len(factors)

    if order == 0:
        msg = 'No factor matrices (decomp_mode_<x>.txt files) or label maps (map_mode_<x>.txt files) were passed.'
        log.error(msg)
        raise IOError(msg)

    rank = factors[0].shape[1]

    # Validate input component ids; unknown ids are reported and skipped
    # rather than aborting the whole request.
    valid_ids = range(rank)
    comps = []
    for comp_id in comp_ids:
        if comp_id in valid_ids:
            comps.append(comp_id)
        else:
            msg = "Rank of the decomposition is: {}, Component {} does not exist.".format(rank, comp_id)
            log.warning(msg)

    top_k = {}
    for comp_id in comps:  # Each user-selected (and valid) component
        top_k[comp_id] = {}

        for mode_id in range(order):  # Every mode of the decomposition
            component = factors[mode_id][:, comp_id]  # Column for this component
            mode_labels = labels[mode_id]

            # argsort is ascending, so reverse it and keep the first k indices
            # to get the k entries with the largest absolute score.
            top_idxs = np.argsort(np.abs(component))[::-1][:k]

            # Zero scores carry no information about the component; drop them.
            # Labels are only looked up for the entries that are kept.
            top_k[comp_id][mode_id] = [
                (mode_labels[idx], idx, component[idx])
                for idx in top_idxs
                if component[idx] != 0
            ]

    return top_k