# Source code for ensign.comp_top_k
# ENSIGN rights
"""Module for investigating the contents of a tensor decomposition by listing
the highest scoring labels in a component.
"""
import numpy as np
import ensign.ensign_io.ensign_logging as ensign_logging
# Module-level logger; get_top_k reports validation errors and warnings through it.
log = ensign_logging.get_logger()
def get_top_k(factors, labels, comp_ids, k=10):
    """Compute the top k labels and scores per mode for each listed component.

    Entries are ranked by the absolute value of their score in the factor
    matrix, largest first. Entries whose score is exactly zero are dropped,
    so a mode may yield fewer than ``k`` tuples.

    Parameters
    ----------
    factors : list of ndarray
        Factor matrices formatted as a list of ndarrays. Each factor matrix
        should be represented as an ndarray and have shape (mode_size, rank).
    labels : list of lists of str
        List of label maps, each of which formatted as a list of strings.
        e.g. labels[1][5] should get the 5th label of mode 1.
    comp_ids : list of int
        List of component ids. Ids outside ``range(rank)`` are skipped with
        a logged warning rather than raising.
    k : int
        The number of labels and scores to return from each mode for each
        component.

    Returns
    -------
    ret : dict
        Nested dictionaries containing sorted lists of tuples. Keys are
        component ID and mode ID.
        Each tuple contains a label, its index in the mode, and its score
        in the factor matrix. (label, idx, score)
        e.g. Retrieve a tuple of the highest scoring label
        from mode 2 of component 10: ret[10][2][0]

    Raises
    ------
    ValueError
        If ``k`` is less than 1.
    IOError
        If the number of factor matrices differs from the number of label
        maps, or if no factor matrices were passed.
    """
    if k < 1:
        msg = 'The value passed to -k should be a positive integer.'
        log.error(msg)
        raise ValueError(msg)

    if len(factors) != len(labels):
        msg = 'The number of factor matrices (decomp_mode_<x>.txt files) does not equal the number of label maps (map_mode_<x>.txt files).'
        log.error(msg)
        raise IOError(msg)

    order = len(factors)
    if order == 0:
        msg = 'No factor matrices (decomp_mode_<x>.txt files) or label maps (map_mode_<x>.txt files) were passed.'
        log.error(msg)
        raise IOError(msg)

    # The rank is taken from the first factor matrix's column count.
    rank = factors[0].shape[1]

    # Validate input component ids; unknown ids are dropped with a warning.
    valid_ids = range(rank)  # built once instead of a fresh list per id
    comps = []
    for comp_id in comp_ids:
        if comp_id in valid_ids:
            comps.append(comp_id)
        else:
            msg = "Rank of the decomposition is: {}, Component {} does not exist.".format(rank, comp_id)
            log.warning(msg)

    top_k = {}
    for comp_id in comps:  # Each user-selected component
        top_k[comp_id] = {}
        for mode_id in range(order):  # Every mode of the decomposition
            component = factors[mode_id][:, comp_id]  # Column for this component
            mode_labels = labels[mode_id]

            # argsort is ascending, so reverse it and keep the first k
            # indices to get the largest-magnitude entries first.
            top_idxs = np.argsort(np.abs(component))[::-1][:k]

            # Build (label, idx, score) tuples, skipping exact-zero scores.
            top_k[comp_id][mode_id] = [
                (mode_labels[idx], idx, component[idx])
                for idx in top_idxs
                if component[idx] != 0
            ]
    return top_k