# Source code for ensign.comp_top_k

# ENSIGN rights
"""Module for investigating the contents of a tensor decomposition by listing
the highest scoring labels in a component.
"""

import numpy as np

import ensign.ensign_io.ensign_logging as ensign_logging

log = ensign_logging.get_logger()

def get_top_k(factors, labels, comp_ids, k=10):
    """Compute the top k labels and scores per mode for each component.

    Labels are ranked by the absolute value of their score in the factor
    matrix. Entries whose score is exactly zero are never reported, so a
    mode may yield fewer than ``k`` tuples.

    Parameters
    ----------
    factors : list of ndarray
        Factor matrices formatted as a list of ndarrays. Each factor matrix
        should be represented as an ndarray and have shape
        (mode_size, rank).
    labels : list of lists of str
        List of label maps, each of which formatted as a list of strings.
        e.g. labels[1][5] should get the 5th label of mode 1.
    comp_ids : list of int
        List of component ids. Ids outside ``range(rank)`` are skipped with
        a logged warning, where ``rank`` is the number of columns of
        ``factors[0]``.
    k : int
        The number of labels and scores to return from each mode for each
        component.

    Returns
    -------
    ret : dict
        Nested dictionaries containing sorted lists of tuples. Keys are
        component ID and mode ID. Each tuple contains a label, its index in
        the mode, and its score in the factor matrix:
        (label, idx, score)
        e.g. Retrieve a tuple of the highest scoring label from mode 2 of
        component 10: ret[10][2][0]

    Raises
    ------
    ValueError
        If ``k`` is less than 1.
    IOError
        If the number of factor matrices differs from the number of label
        maps, or if no factor matrices were passed at all.
    """
    if k < 1:
        msg = 'The value passed to -k should be a positive integer.'
        log.error(msg)
        raise ValueError(msg)

    if len(factors) != len(labels):
        msg = ('The number of factor matrices (decomp_mode_<x>.txt files) does not '
               'equal the number of label maps (map_mode_<x>.txt files).')
        log.error(msg)
        raise IOError(msg)

    order = len(factors)
    if order == 0:
        msg = ('No factor matrices (decomp_mode_<x>.txt files) or label maps '
               '(map_mode_<x>.txt files) were passed.')
        log.error(msg)
        raise IOError(msg)

    rank = factors[0].shape[1]

    # Validate the requested component ids. The range is built once instead
    # of materializing a fresh list for every id.
    valid_ids = range(rank)
    comps = []
    for comp_id in comp_ids:
        if comp_id not in valid_ids:
            msg = "Rank of the decomposition is: {}, Component {} does not exist.".format(rank, comp_id)
            log.warning(msg)
        else:
            comps.append(comp_id)

    top_k = {}
    for comp_id in comps:
        top_k[comp_id] = {}
        for mode_id in range(order):
            component = factors[mode_id][:, comp_id]  # column of this component
            mode_labels = labels[mode_id]

            # argsort is ascending, so reverse it to rank by descending
            # magnitude and keep the first k indices.
            top_idxs = np.argsort(np.abs(component))[::-1][:k]

            # Zero scores carry no information and are dropped. Labels are
            # only looked up for the entries that are actually reported.
            top_k[comp_id][mode_id] = [
                (mode_labels[idx], idx, component[idx])
                for idx in top_idxs
                if component[idx] != 0
            ]

    return top_k