Source code for ensign.decomp_diff

#!/usr/bin/env python
# ENSIGN rights

import numpy as np
import scipy.spatial.distance as ssd
import ensign.distance as ed
from ensign.constants import DTYPE
from ensign import cp_decomp as cpd
import ensign.ensign_io.ensign_logging as logging

logger = logging.get_logger()

DEFAULT_THRESHOLD = 0.6
IGNORE_WEIGHTS = False
SUPPORTED_SCIPY_METRICS = ['braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine',
                           'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon', 'kulsinski',
                           'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'sokalmichener',
                           'sokalsneath', 'sqeuclidean', 'yule']

DEFAULT_METRIC = "cosine"
SUPPORTED_ANALYSIS_TYPES = ['similarity', 'mapping', 'number']
DEFAULT_ANALYSIS_TYPE = "similarity"

# Error strings (used in exceptions and logging)
ERROR_DEFAULT = 'An error occurred with decomp_diff'
ERROR_INCOMPARABLE = 'Decompositions are not comparable: {error}'
ERROR_PARAM = 'Invalid parameter value: {param}={value}, {error}'
ERROR_CSR = 'Invalid syntax for comma-separated range value: {csr}'
ERROR_DECOMP_READ = 'Unable to read decomposition data from {path}: {error}'
ERROR_INVALID_ANALYSIS_TYPE = 'Invalid analysis type: {type}'
ERROR_UNSUPPORTED_METRIC = 'Unsupported metric: {metric}'
ERROR_INVALID_THRESHOLD = 'Invalid value for threshold: {value}. Threshold must be a non-negative float value.'
ERROR_INSUFFICIENT_DECOMPS = 'Two decompositions are required for comparison'
ERROR_INSUFFICIENT_COMPONENTS = 'Number of component specifiers does not match number of decompositions'
ERROR_MODE_COUNT = 'Incompatible number of modes ({actual}) for decomposition in {id}; should be {expected}'
ERROR_MODE_SIZE = 'Incompatible size({actual}) for mode {mode} in {id}; should be {expected}'
ERROR_MODE_SHAPE = 'Incompatible shape ({actual}) for mode {mode} in {id}; should be {expected}'
ERROR_MODE_OUT_OF_BOUNDS = 'Mode {mode} ID is greater than or equal to tensor rank {order} in tensor ID {id} (from files {filenames})'
ERROR_COMPONENT_OUT_OF_BOUNDS = 'Component ID {selection} is invalid for tensor ID {id} with tensor rank {rank} (from files {filenames}). Skipping.'


class DiffCPDecomp:
    """
    An organizational helper class that associates a CPDecomp with its ID, the subset
    of components and modes used to compare it to another, and the n-th root of the decomp weights.
    """

    def __init__(self):
        self.decomp = None
        self.decomp_id = None
        self.diff_modes = []
        self.diff_components = []
        self.nth_root_weights = []

# TODO: Consider expanding this custom exception hierarchy to make it usable across all ensign tools.


class DecompDiffError(Exception):
    """
    The base exception for the Decomposition Diff module. Sub-classes should redefine 'message'
    """
    message = ERROR_DEFAULT

    def __init__(self, **kwargs):
        msg = self.message.format(**kwargs)
        super().__init__(msg)
        self.kwargs = kwargs


class DecompDiffIncomparableError(DecompDiffError):
    """ Exception raised when decompositions are not comparable.
    """
    message = ERROR_INCOMPARABLE


class DecompDiffParameterError(DecompDiffError):
    """ Exception raised when an invalid parameter value is provided.
    """
    message = ERROR_PARAM


class DecompDiffCsrSyntaxError(DecompDiffParameterError):
    """ Exception raised when a syntactically invalid comma-separated range value is detected
    """
    message = ERROR_CSR


class DecompDiffReadError(DecompDiffError):
    """ Exception raised when reading a decomposition.
    """
    message = ERROR_DECOMP_READ

def _validate_analyses(analyses):
    if not analyses:
        logger.error("No analysis types given")
        raise DecompDiffParameterError(param='analyses', value=analyses, error='No analysis types given')
    for analysis in analyses:
        if analysis not in SUPPORTED_ANALYSIS_TYPES:
            errmsg = ERROR_INVALID_ANALYSIS_TYPE.format(type=analysis)
            logger.error(errmsg)
            raise DecompDiffParameterError(param='analyses', value=analyses, error=errmsg)


def _validate_metric(metric):
    if metric not in SUPPORTED_SCIPY_METRICS + ed.SUPPORTED_ENSIGN_METRICS:
        errmsg = ERROR_UNSUPPORTED_METRIC.format(metric=metric)
        logger.error(errmsg)
        raise DecompDiffParameterError(param='metric', value=metric, error=errmsg)


def _validate_threshold(threshold):
    try:
        thresh_float_val = float(threshold)
        if thresh_float_val < 0.0:
            errmsg = ERROR_INVALID_THRESHOLD.format(value=threshold)
            logger.error(errmsg)
            raise DecompDiffParameterError(param='threshold', value=threshold, error=errmsg)
    except ValueError:
        errmsg = ERROR_INVALID_THRESHOLD.format(value=threshold)
        logger.error(errmsg)
        raise DecompDiffParameterError(param='threshold', value=threshold, error=errmsg)

def _validate_mode_selection(mode_selection, ddc):
    if isinstance(mode_selection, int) and not isinstance(mode_selection, list):
        mode_selection = [mode_selection]
    for mode_id in mode_selection:
        if mode_id >= ddc.decomp.order or mode_id < 0:
            errmsg = ERROR_MODE_OUT_OF_BOUNDS.format(mode=mode_id, order=ddc.decomp.order, id=ddc.decomp_id, filenames=ddc.decomp.filenames)
            logger.error(errmsg)
            raise DecompDiffParameterError(param='modes', value=mode_selection, error=errmsg)
    ddc.diff_modes = mode_selection

def _validate_component_selection(component_selection, ddc):
    if isinstance(component_selection, int) and not isinstance(component_selection, list):
        component_selection = [component_selection]
    valid_components = []
    for component_id in component_selection:
        if component_id >= ddc.decomp.rank or component_id < 0:
            errmsg = ERROR_COMPONENT_OUT_OF_BOUNDS.format(selection=component_id, rank=ddc.decomp.rank,
                                                          id=ddc.decomp_id, filenames = ddc.decomp.filenames)
            logger.warning(errmsg)
        else:
            valid_components.append(component_id)
    ddc.diff_components = valid_components

def _assert_comparability(decomps):
    if len(decomps) != 2:
        errmsg = ERROR_INSUFFICIENT_DECOMPS
        logger.error(errmsg)
        raise DecompDiffIncomparableError(error=errmsg)

    decomp_0_mode_sizes = decomps[0].decomp.mode_sizes
    decomp_0_factors = decomps[0].decomp.factors
    decomp_0_diff_modes = decomps[0].diff_modes
    for ddc in decomps[1:]:
        # The number of modes used in the comparison must be same for all decompositions
        expected = len(decomp_0_diff_modes)
        actual = len(ddc.diff_modes)
        if expected != actual:
            errmsg = ERROR_MODE_COUNT.format(actual=actual, id=ddc.decomp_id, expected=expected)
            logger.error(errmsg)
            raise DecompDiffIncomparableError(error=errmsg)

        for mode_id in range(0, len(decomps[0].diff_modes)):
            # The mode size of each mode must be the same for all decompositions
            expected = decomp_0_mode_sizes[decomp_0_diff_modes[mode_id]]
            actual = ddc.decomp.mode_sizes[ddc.diff_modes[mode_id]]
            if expected != actual:
                errmsg = ERROR_MODE_SIZE.format(actual=actual, mode=ddc.diff_modes[mode_id], id=ddc.decomp_id,
                                                expected=expected)
                logger.error(errmsg)
                raise DecompDiffIncomparableError(error=errmsg)

            # The shape of each factor matrix must be the same for all decompositions, only in the number of values.
            expected = decomp_0_factors[decomp_0_diff_modes[mode_id]].shape[0]
            actual = ddc.decomp.factors[ddc.diff_modes[mode_id]].shape[0]
            if expected != actual:
                errmsg = ERROR_MODE_SHAPE.format(actual=actual, mode=ddc.diff_modes[mode_id],
                                                 id=ddc.decomp_id, expected=expected)
                logger.error(errmsg)
                raise DecompDiffIncomparableError(error=errmsg)

def _compute_similarity_matrix(decomps, metric, ignore_weights):
    component_subsets = [len(ddc.diff_components) for ddc in decomps]
    sm = np.zeros(component_subsets)

    metric_fn = ssd.cosine
    if metric in SUPPORTED_SCIPY_METRICS:
        metric_fn = getattr(ssd, metric)
    elif metric in ed.SUPPORTED_ENSIGN_METRICS:
        metric_fn = getattr(ed, metric)

    # Future Work: Generalize this to compare the components of N decompositions
    d0 = decomps[0]
    d1 = decomps[1]
    for i, comp_0 in enumerate(d0.diff_components, 0):
        for j, comp_1 in enumerate(d1.diff_components, 0):
            comp_vector_0 = np.concatenate(
                [d0.decomp.factors[mode_id][:, comp_0] for mode_id in d0.diff_modes])
            comp_vector_1 = np.concatenate(
                [d1.decomp.factors[mode_id][:, comp_1] for mode_id in d1.diff_modes])
            if not ignore_weights:
                comp_vector_0 = comp_vector_0 * d0.nth_root_weights[comp_0]
                comp_vector_1 = comp_vector_1 * d1.nth_root_weights[comp_1]
            sm[i, j] = metric_fn(comp_vector_0, comp_vector_1)
    return sm


def _compute_similarity_mappings(similarity_matrix, threshold):
    # Find components from decomp_1 (cols) that are 'similar' to components in decomp_0 (rows) and save
    # their indexes and distance. Also, find unique components in decomp_1 - those that have no 'similar'
    # components in decomp_0.
    similar_component_mappings = []
    unique_components = [True] * similarity_matrix.shape[1]
    for i in range(0, similarity_matrix.shape[0]):
        similar_comp_found = False
        for j in range(0, similarity_matrix.shape[1]):
            if similarity_matrix[i, j] <= threshold:
                similar_comp_found = True
                similar_component_mappings.append([i, j, similarity_matrix[i, j]])
                unique_components[j] = False
        if not similar_comp_found:
            similar_component_mappings.append([i, -1, 0])

    # Add in the unique components from decomp 1.
    for j, is_unique in enumerate(unique_components, 0):
        if is_unique:
            similar_component_mappings.append([-1, j, 0])

    return similar_component_mappings


def _compute_similarity_number(decomps, metric='cosine', ignore_weights=False):
    # The maximum number of components to be used for any decomposition is the minimum number of
    # components specified, for all decompositions.
    max_comps_per_decomp = min(len(ddc.diff_components) for ddc in decomps)

    # Create a single vector for each decomposition that consists of all the specified modes and components
    decomp_vectors = []
    for ddc in decomps:
        # Restrict the components for this decomposition to the maximum allowed. In this way, we get the
        # same sized vector used in comparisons between decompositions.
        components_for_decomp = ddc.diff_components[0:max_comps_per_decomp]

        # Construct the subset of factors for this decomp based on the subset of modes and the subset of components.
        # Similar to decomp.factors, this is a list of np.array objects. The difference being there are only factor
        # matrices for the modes specified, and the rank is reduced to only those components that are specified.
        factors_subset = [ddc.decomp.factors[mode_id][:, components_for_decomp] for mode_id in ddc.diff_modes]
        if not ignore_weights:
            for factor_sub_matrix in factors_subset:
                nth_root_weights = ddc.decomp.weights[components_for_decomp]
                factor_sub_matrix.__imul__(nth_root_weights)
        decomp_vectors.append(np.concatenate(factors_subset).flatten())

    # Compute metric between every pair of vectors.
    metric_fn = metric
    if metric in SUPPORTED_SCIPY_METRICS:
        metric_fn = metric
    elif metric in ed.SUPPORTED_ENSIGN_METRICS:
        metric_fn = getattr(ed, metric)
    distances = ssd.pdist(np.array(decomp_vectors, dtype=DTYPE), metric=metric_fn)

    # Future Work: We're currently assuming only two decompositions are being compared by this tool. This fn can handle
    # an arbitrary number, in which case it could return the condensed distance matrix from pdist().
    return distances[0]


[docs]def decomp_diff(decomps, analyses=None, modes=None, components=None, threshold=DEFAULT_THRESHOLD, ignore_weights=IGNORE_WEIGHTS, metric=DEFAULT_METRIC): """ Determine differences(similarities) between decompositions. Parameters ---------- decomps : [ensign.CPDecomp] A list of CPDecomp objects, one for each decomposition to be compared. analyses : [string] A list of analysis types. One or more of: 'number', 'mapping', 'similarity' modes : [int] A list of mode identifiers to be used in the comparison. Default: all modes are compared. components : [[int]] One per decomposition, a list of component identifiers to be used in comparison. Default: all components are compared for every decomposition. threshold : float The threshold value that determines similarity when determining similar components ignore_weights : bool Either ignore (True) or use (False) decomposition weights when comparing decompositions. Default: False metric : string The name of metric to be used in comparisons. Can be either a metric from scipy.spatial.distance, or a custom metric defined in ensign.distance. Returns ------- result : dict The key to the dictionary is the analysis type, the value is the result of that analysis. 'number' : float - The result of comparing each decomposition 'mapping' : dict - For each component in decomp[0] show the most similar component(s) in " "decomp[1] within the given threshold. A unique component in decomp[0] will " "map to an empty list. Also, map components from decomp[1] with no similar " "component in decomp[0], within the threshold. 'similarity' : numpy.ndarray - The similarity matrix showing comparision between all specified components of decomp[0] and all specified components of decomp[1] """ logger.debug('Starting decomp_diff') logger.debug(' decomps: ' + str(decomps)) logger.debug(' analyses: ' + str(analyses)) logger.debug(' modes: ' + str(modes)) logger.debug(' components: ' + str(components)) logger.debug(' threshold: ' + str(threshold)) logger.debug(' ignore_weights: ' + str(ignore_weights)) logger.debug(' metric: ' + metric) logger.debug('Validating parameters') # The default list of analyses to perform. This is a mutable default argument. # See https://docs.python-guide.org/writing/gotchas/ if analyses is None: logger.debug("Using Default Analysis type: {type}".format(type=DEFAULT_ANALYSIS_TYPE)) analyses = [DEFAULT_ANALYSIS_TYPE] # Future work: Expand ability of this tool to compare more than two decompositions if not decomps or len(decomps) != 2: errmsg = ERROR_INSUFFICIENT_DECOMPS logger.error(errmsg) raise DecompDiffParameterError(param='decomps', value=decomps, error=errmsg) # A default value of None for components means that all components of each decomposition will be compared. if components is None: logger.debug("Using all components in each decomposition") components = [list(range(0, decomp.rank)) for decomp in decomps] # A default value of None for modes means that all modes of each decomposition will be compared. if modes is None: logger.debug("Using all modes in each decomposition") modes = list(range(0, decomps[0].order)) # If specified, the number of component strings must be the same as the number of decomps. if components is not None and len(decomps) != len(components): errmsg = ERROR_INSUFFICIENT_COMPONENTS logger.error(errmsg) raise DecompDiffParameterError(param='components', value=components, error=errmsg) _validate_analyses(analyses) _validate_metric(metric) _validate_threshold(threshold) logger.debug('Reading decompositions, determining modes and components') diff_decomps = [] for decomp_id, (decomp, decomp_component) in enumerate(zip(decomps, components)): ddc = DiffCPDecomp() ddc.decomp = decomp ddc.decomp_id = decomp_id _validate_mode_selection(modes, ddc) logger.debug(" Selected modes: {modes}".format(modes=str(ddc.diff_modes))) _validate_component_selection(decomp_component, ddc) logger.debug(" Selected components: {comps}".format(comps=str(ddc.diff_components))) if not ignore_weights: ddc.nth_root_weights = [w ** (1.0 / ddc.decomp.rank) for w in ddc.decomp.weights] diff_decomps.append(ddc) logger.debug('Determining comparability of decompositions') _assert_comparability(diff_decomps) result = {} if 'similarity' in analyses or 'mapping' in analyses: logger.debug('computing similarity matrix') similarity_matrix = _compute_similarity_matrix(diff_decomps, metric, ignore_weights) if 'similarity' in analyses: result['similarity'] = similarity_matrix if 'mapping' in analyses: logger.debug('Computing similarity mappings') result['mapping'] = _compute_similarity_mappings(similarity_matrix, threshold) if 'number' in analyses: logger.debug('Computing similarity between decompositions') result['number'] = _compute_similarity_number(diff_decomps, metric, ignore_weights) logger.debug('Finished decomp_diff') return result