# Source code for ensign.decomp_diff

#!/usr/bin/env python
# ENSIGN rights

import numpy as np
import scipy.spatial.distance as ssd
import ensign.distance as ed
from ensign.constants import DTYPE
from ensign import cp_decomp as cpd
import ensign.ensign_io.ensign_logging as logging

# Module-level logger shared by every function in this file.
logger = logging.get_logger()

# Default for the `threshold` parameter of decomp_diff(). When building mappings, two
# components are treated as similar if their distance is less than or equal to the threshold.
DEFAULT_THRESHOLD = 0.6
# Default for the `ignore_weights` parameter of decomp_diff().
IGNORE_WEIGHTS = False
# Metric names that are resolved against scipy.spatial.distance, either by attribute lookup
# or by passing the name to pdist().
# NOTE(review): some of these names (e.g. 'kulsinski', 'matching') may not exist in every
# SciPy release - confirm against the SciPy version this package is pinned to.
SUPPORTED_SCIPY_METRICS = ['braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine',
                           'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon', 'kulsinski',
                           'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'sokalmichener',
                           'sokalsneath', 'sqeuclidean', 'yule']

# Default for the `metric` parameter of decomp_diff().
DEFAULT_METRIC = "cosine"
# Analysis names accepted in the `analyses` parameter of decomp_diff().
SUPPORTED_ANALYSIS_TYPES = ['similarity', 'mapping', 'number']
# Analysis performed when the caller does not request any.
DEFAULT_ANALYSIS_TYPE = "similarity"

# Error message templates (used in exceptions and logging). Each {placeholder} must be
# supplied as a keyword argument to str.format() by the code that uses the template.
ERROR_DEFAULT = 'An error occurred with decomp_diff'
ERROR_INCOMPARABLE = 'Decompositions are not comparable: {error}'
ERROR_PARAM = 'Invalid parameter value: {param}={value}, {error}'
ERROR_CSR = 'Invalid syntax for comma-separated range value: {csr}'
ERROR_DECOMP_READ = 'Unable to read decomposition data from {path}: {error}'
ERROR_INVALID_ANALYSIS_TYPE = 'Invalid analysis type: {type}'
ERROR_UNSUPPORTED_METRIC = 'Unsupported metric: {metric}'
ERROR_INVALID_THRESHOLD = 'Invalid value for threshold: {value}. Threshold must be a non-negative float value.'
ERROR_INSUFFICIENT_DECOMPS = 'Two decompositions are required for comparison'
ERROR_INSUFFICIENT_COMPONENTS = 'Number of component specifiers does not match number of decompositions'
ERROR_MODE_COUNT = 'Incompatible number of modes ({actual}) for decomposition in {id}; should be {expected}'
ERROR_MODE_SIZE = 'Incompatible size({actual}) for mode {mode} in {id}; should be {expected}'
ERROR_MODE_SHAPE = 'Incompatible shape ({actual}) for mode {mode} in {id}; should be {expected}'
ERROR_MODE_OUT_OF_BOUNDS = 'Mode {mode} ID is greater than or equal to tensor rank {order} in tensor ID {id} (from files {filenames})'
ERROR_COMPONENT_OUT_OF_BOUNDS = 'Component ID {selection} is invalid for tensor ID {id} with tensor rank {rank} (from files {filenames}). Skipping.'


class DiffCPDecomp:
    """
    Plain record that bundles one decomposition with the bookkeeping needed to diff it.

    Attributes
    ----------
    decomp
        The decomposition object being compared (None until assigned).
    decomp_id
        Identifier of the decomposition within the comparison (None until assigned).
    diff_modes : list
        Modes of the decomposition that take part in the comparison.
    diff_components : list
        Components of the decomposition that take part in the comparison.
    nth_root_weights : list
        Per-component roots of the decomposition weights.
    """

    def __init__(self):
        # Nothing is known about the decomposition yet; callers fill these in.
        self.decomp = self.decomp_id = None
        # Three distinct empty lists, so instances (and attributes) never share state.
        self.diff_modes, self.diff_components, self.nth_root_weights = [], [], []

# TODO: Consider expanding this custom exception hierarchy to make it usable across all ensign tools.


class DecompDiffError(Exception):
    """
    Root of the Decomposition Diff exception hierarchy.

    The class attribute ``message`` is a str.format() template. Keyword arguments
    given to the constructor are substituted into it to build the exception text,
    and are also kept on the instance as ``kwargs``. Sub-classes customise the
    text by overriding ``message``.
    """
    message = ERROR_DEFAULT

    def __init__(self, **kwargs):
        # Render the (possibly overridden) template with the caller's fields.
        super().__init__(self.message.format(**kwargs))
        # Keep the raw fields so handlers can inspect them without parsing the text.
        self.kwargs = kwargs


class DecompDiffIncomparableError(DecompDiffError):
    """
    Exception raised when decompositions are not comparable.

    The message template is ERROR_INCOMPARABLE, which takes a single ``error``
    keyword describing why the decompositions cannot be compared.
    """
    message = ERROR_INCOMPARABLE


class DecompDiffParameterError(DecompDiffError):
    """
    Exception raised when an invalid parameter value is provided.

    The message template is ERROR_PARAM, which takes the keywords ``param``
    (parameter name), ``value`` (the offending value) and ``error`` (explanation).
    """
    message = ERROR_PARAM


class DecompDiffCsrSyntaxError(DecompDiffParameterError):
    """
    Exception raised when a syntactically invalid comma-separated range value is detected.

    The message template is ERROR_CSR. Unlike its parent class, it takes a single
    ``csr`` keyword (the offending comma-separated range text).
    """
    message = ERROR_CSR


class DecompDiffReadError(DecompDiffError):
    """
    Exception raised when reading a decomposition fails.

    The message template is ERROR_DECOMP_READ, which takes the keywords ``path``
    (where the decomposition was read from) and ``error`` (explanation).
    """
    message = ERROR_DECOMP_READ

def _validate_analyses(analyses):
    """
    Check that at least one analysis was requested and that each one is supported.

    Parameters
    ----------
    analyses : iterable of str
        The requested analysis type names.

    Raises
    ------
    DecompDiffParameterError
        If `analyses` is empty/None, or on the first entry that is not listed in
        SUPPORTED_ANALYSIS_TYPES. The problem is logged before raising.
    """
    if not analyses:
        nothing_requested = 'No analysis types given'
        logger.error("No analysis types given")
        raise DecompDiffParameterError(param='analyses', value=analyses, error=nothing_requested)

    for requested in analyses:
        if requested in SUPPORTED_ANALYSIS_TYPES:
            continue
        # Stop at the first unsupported entry; later entries are not inspected.
        errmsg = ERROR_INVALID_ANALYSIS_TYPE.format(type=requested)
        logger.error(errmsg)
        raise DecompDiffParameterError(param='analyses', value=analyses, error=errmsg)


def _validate_metric(metric):
    """
    Check that `metric` names a supported distance metric.

    A metric is supported when it appears in SUPPORTED_SCIPY_METRICS or in
    ensign.distance.SUPPORTED_ENSIGN_METRICS.

    Raises
    ------
    DecompDiffParameterError
        If the metric is in neither list. The problem is logged before raising.
    """
    known_metrics = SUPPORTED_SCIPY_METRICS + ed.SUPPORTED_ENSIGN_METRICS
    if metric in known_metrics:
        return

    errmsg = ERROR_UNSUPPORTED_METRIC.format(metric=metric)
    logger.error(errmsg)
    raise DecompDiffParameterError(param='metric', value=metric, error=errmsg)


def _validate_threshold(threshold):
    """
    Check that `threshold` can be interpreted as a non-negative float.

    Parameters
    ----------
    threshold : float or any value accepted by float()
        The similarity threshold to validate.

    Raises
    ------
    DecompDiffParameterError
        If `threshold` cannot be converted to a float (including None and other
        non-numeric objects), or if the converted value is negative or NaN.
        The problem is logged before raising.
    """
    try:
        thresh_float_val = float(threshold)
    except (TypeError, ValueError, OverflowError):
        # float() raises TypeError for None/containers, ValueError for unparsable
        # text and OverflowError for integers too large for a float. All of them
        # are reported uniformly as an invalid parameter below.
        thresh_float_val = None

    # "not (x >= 0.0)" rejects negative values and also NaN, for which every
    # ordered comparison is False and which would otherwise slip through.
    if thresh_float_val is None or not thresh_float_val >= 0.0:
        errmsg = ERROR_INVALID_THRESHOLD.format(value=threshold)
        logger.error(errmsg)
        raise DecompDiffParameterError(param='threshold', value=threshold, error=errmsg)

def _validate_mode_selection(mode_selection, ddc):
    """
    Validate the selected modes against a decomposition and store them on `ddc`.

    Parameters
    ----------
    mode_selection : int or iterable of int
        A single mode ID (Python int or NumPy integer scalar) or a collection of
        mode IDs.
    ddc : DiffCPDecomp
        The decomposition wrapper. `ddc.decomp.order` bounds the valid mode IDs.
        On success `ddc.diff_modes` is set to a new list of the selected modes.

    Raises
    ------
    DecompDiffParameterError
        If any mode ID is negative or not less than `ddc.decomp.order`. The
        problem is logged before raising and `ddc.diff_modes` is left untouched.
    """
    if isinstance(mode_selection, (int, np.integer)):
        mode_selection = [mode_selection]
    else:
        # Copy so that the wrapper never aliases a list owned by the caller
        # (the same selection is applied to every decomposition being compared).
        mode_selection = list(mode_selection)

    for mode_id in mode_selection:
        if mode_id >= ddc.decomp.order or mode_id < 0:
            errmsg = ERROR_MODE_OUT_OF_BOUNDS.format(mode=mode_id, order=ddc.decomp.order, id=ddc.decomp_id,
                                                     filenames=ddc.decomp.filenames)
            logger.error(errmsg)
            raise DecompDiffParameterError(param='modes', value=mode_selection, error=errmsg)
    ddc.diff_modes = mode_selection

def _validate_component_selection(component_selection, ddc):
    """
    Filter the selected components against a decomposition and store them on `ddc`.

    Unlike mode validation, an out-of-range component is not an error: it is
    logged as a warning and skipped.

    Parameters
    ----------
    component_selection : int or iterable of int
        A single component ID (Python int or NumPy integer scalar) or a
        collection of component IDs.
    ddc : DiffCPDecomp
        The decomposition wrapper. `ddc.decomp.rank` bounds the valid component
        IDs. `ddc.diff_components` is set to the list of in-range IDs, in the
        order they were given.
    """
    if isinstance(component_selection, (int, np.integer)):
        component_selection = [component_selection]

    valid_components = []
    for component_id in component_selection:
        if component_id >= ddc.decomp.rank or component_id < 0:
            errmsg = ERROR_COMPONENT_OUT_OF_BOUNDS.format(selection=component_id, rank=ddc.decomp.rank,
                                                          id=ddc.decomp_id, filenames=ddc.decomp.filenames)
            # Deliberately a warning only: the remaining components are still compared.
            logger.warning(errmsg)
        else:
            valid_components.append(component_id)
    ddc.diff_components = valid_components

def _assert_comparability(decomps):
    """
    Verify that the wrapped decompositions can be compared mode-by-mode.

    The first entry is the reference. Every other entry must select the same number
    of modes, and each selected mode must pair up with the reference's mode at the
    same position in both its mode size and the row count of its factor matrix.

    Parameters
    ----------
    decomps : list of DiffCPDecomp
        Exactly two wrapped decompositions with `diff_modes` already populated.

    Raises
    ------
    DecompDiffIncomparableError
        If there are not exactly two decompositions, or on the first mismatch
        found. The problem is logged before raising.
    """
    def _reject(errmsg):
        # Log and raise in one place so each check below stays a single branch.
        logger.error(errmsg)
        raise DecompDiffIncomparableError(error=errmsg)

    if len(decomps) != 2:
        _reject(ERROR_INSUFFICIENT_DECOMPS)

    reference = decomps[0]
    ref_sizes = reference.decomp.mode_sizes
    ref_factors = reference.decomp.factors
    ref_modes = reference.diff_modes

    for other in decomps[1:]:
        # Both sides must select the same number of modes.
        expected = len(ref_modes)
        actual = len(other.diff_modes)
        if expected != actual:
            _reject(ERROR_MODE_COUNT.format(actual=actual, id=other.decomp_id, expected=expected))

        # Modes are paired by position in the selection, not by mode ID.
        for ref_mode, other_mode in zip(ref_modes, other.diff_modes):
            # Paired modes must have the same size.
            expected = ref_sizes[ref_mode]
            actual = other.decomp.mode_sizes[other_mode]
            if expected != actual:
                _reject(ERROR_MODE_SIZE.format(actual=actual, mode=other_mode, id=other.decomp_id,
                                               expected=expected))

            # Paired factor matrices must have the same number of rows; the number
            # of columns is allowed to differ.
            expected = ref_factors[ref_mode].shape[0]
            actual = other.decomp.factors[other_mode].shape[0]
            if expected != actual:
                _reject(ERROR_MODE_SHAPE.format(actual=actual, mode=other_mode,
                                                id=other.decomp_id, expected=expected))

def _component_vector(ddc, component, ignore_weights):
    """Concatenate one component's column from every selected mode, optionally scaled by its weight root."""
    vector = np.concatenate([ddc.decomp.factors[mode_id][:, component] for mode_id in ddc.diff_modes])
    if not ignore_weights:
        vector = vector * ddc.nth_root_weights[component]
    return vector


def _compute_similarity_matrix(decomps, metric, ignore_weights):
    """
    Compute the pairwise distance between the selected components of two decompositions.

    Parameters
    ----------
    decomps : list of DiffCPDecomp
        The two wrapped decompositions; only the first two entries are compared.
    metric : str
        Name of the metric. Names in SUPPORTED_SCIPY_METRICS are looked up on
        scipy.spatial.distance, names in ensign.distance.SUPPORTED_ENSIGN_METRICS
        on ensign.distance. Any other name falls back to the SciPy cosine distance.
    ignore_weights : bool
        When False each component vector is multiplied by the matching entry of
        `nth_root_weights` before the metric is applied.

    Returns
    -------
    numpy.ndarray
        Array whose shape is the number of selected components of each decomposition.
        Entry [i, j] is the metric between the i-th selected component of the first
        decomposition and the j-th selected component of the second.
    """
    component_subsets = [len(ddc.diff_components) for ddc in decomps]
    sm = np.zeros(component_subsets)

    metric_fn = ssd.cosine
    if metric in SUPPORTED_SCIPY_METRICS:
        metric_fn = getattr(ssd, metric)
    elif metric in ed.SUPPORTED_ENSIGN_METRICS:
        metric_fn = getattr(ed, metric)

    # With no components on either side there is nothing to fill in.
    if sm.size == 0:
        return sm

    # Future Work: Generalize this to compare the components of N decompositions
    d0 = decomps[0]
    d1 = decomps[1]

    # Each component vector depends on one decomposition only, so build the second
    # decomposition's vectors once up front and the first's once per row, instead of
    # rebuilding both for every (i, j) pair.
    d1_vectors = [_component_vector(d1, comp_1, ignore_weights) for comp_1 in d1.diff_components]
    for i, comp_0 in enumerate(d0.diff_components):
        comp_vector_0 = _component_vector(d0, comp_0, ignore_weights)
        for j, comp_vector_1 in enumerate(d1_vectors):
            sm[i, j] = metric_fn(comp_vector_0, comp_vector_1)
    return sm


def _compute_similarity_mappings(similarity_matrix, threshold):
    """
    Turn a similarity matrix into a list of component mappings.

    Rows of the matrix stand for components of the first decomposition, columns for
    components of the second. A row/column pair is 'similar' when its matrix value
    is less than or equal to `threshold`.

    Returns
    -------
    list of [row, col, distance]
        * one ``[i, j, value]`` entry for every similar pair, in row-major order;
        * ``[i, -1, 0]`` for every row without a similar column, placed where that
          row's pairs would have been;
        * finally ``[-1, j, 0]`` for every column that no row is similar to.
    """
    row_count, col_count = similarity_matrix.shape[0], similarity_matrix.shape[1]
    mappings = []
    matched_columns = set()

    for row in range(row_count):
        entries_before = len(mappings)
        for col in range(col_count):
            distance = similarity_matrix[row, col]
            if distance <= threshold:
                mappings.append([row, col, distance])
                matched_columns.add(col)
        # Nothing appended for this row means it has no counterpart.
        if len(mappings) == entries_before:
            mappings.append([row, -1, 0])

    # Columns never matched by any row are unique to the second decomposition.
    mappings.extend([-1, col, 0] for col in range(col_count) if col not in matched_columns)

    return mappings


def _compute_similarity_number(decomps, metric='cosine', ignore_weights=False):
    """
    Compute a single distance between two decompositions.

    Each decomposition is reduced to one vector built from its selected modes and
    components, and the metric is evaluated between those vectors.

    Parameters
    ----------
    decomps : list of DiffCPDecomp
        The wrapped decompositions to compare.
    metric : str
        Name of the metric. Names in ensign.distance.SUPPORTED_ENSIGN_METRICS (and
        not in SUPPORTED_SCIPY_METRICS) are resolved to the ensign.distance
        function; every other name is handed to scipy's pdist() as a string.
    ignore_weights : bool
        When False each selected factor column is scaled by its decomposition weight.

    Returns
    -------
    float
        The first entry of the condensed distance matrix, i.e. the distance between
        the first two decompositions.
    """
    # The maximum number of components to be used for any decomposition is the minimum number of
    # components specified, for all decompositions.
    max_comps_per_decomp = min(len(ddc.diff_components) for ddc in decomps)

    # Create a single vector for each decomposition that consists of all the specified modes and components
    decomp_vectors = []
    for ddc in decomps:
        # Restrict the components for this decomposition to the maximum allowed. In this way, we get the
        # same sized vector used in comparisons between decompositions.
        components_for_decomp = ddc.diff_components[0:max_comps_per_decomp]

        # Construct the subset of factors for this decomp based on the subset of modes and the subset of components.
        # Similar to decomp.factors, this is a list of np.array objects. The difference being there are only factor
        # matrices for the modes specified, and the rank is reduced to only those components that are specified.
        # Indexing with a list of components yields new arrays, so the scaling below never touches decomp.factors.
        factors_subset = [ddc.decomp.factors[mode_id][:, components_for_decomp] for mode_id in ddc.diff_modes]
        if not ignore_weights:
            # NOTE(review): this scales by the raw decomposition weights, whereas
            # _compute_similarity_matrix scales by ddc.nth_root_weights - confirm which is intended.
            component_weights = ddc.decomp.weights[components_for_decomp]
            for factor_sub_matrix in factors_subset:
                factor_sub_matrix *= component_weights
        decomp_vectors.append(np.concatenate(factors_subset).flatten())

    # Compute metric between every pair of vectors. pdist() accepts SciPy metric names directly;
    # only ENSIGN-specific metrics need to be resolved to a callable.
    metric_fn = metric
    if metric not in SUPPORTED_SCIPY_METRICS and metric in ed.SUPPORTED_ENSIGN_METRICS:
        metric_fn = getattr(ed, metric)
    distances = ssd.pdist(np.array(decomp_vectors, dtype=DTYPE), metric=metric_fn)

    # Future Work: We're currently assuming only two decompositions are being compared by this tool. This fn can handle
    # an arbitrary number, in which case it could return the condensed distance matrix from pdist().
    return distances[0]


def decomp_diff(decomps, analyses=None, modes=None,
                components=None, threshold=DEFAULT_THRESHOLD,
                ignore_weights=IGNORE_WEIGHTS, metric=DEFAULT_METRIC):
    """
    Determine differences (similarities) between decompositions.

    Parameters
    ----------
    decomps : [ensign.CPDecomp]
        A list of exactly two CPDecomp objects to be compared.
    analyses : [string]
        A list of analysis types. One or more of: 'number', 'mapping', 'similarity'.
        Default: ['similarity'].
    modes : [int]
        A list of mode identifiers to be used in the comparison. The same list is applied to
        every decomposition. Default: all modes of decomps[0] are compared.
    components : [[int]]
        One per decomposition, a list of component identifiers to be used in comparison.
        Out-of-range identifiers are logged and skipped.
        Default: all components are compared for every decomposition.
    threshold : float
        Two components are considered similar when their distance is less than or equal to
        this value. Must be convertible to a non-negative float.
    ignore_weights : bool
        Either ignore (True) or use (False) decomposition weights when comparing decompositions. Default: False
    metric : string
        The name of metric to be used in comparisons. Can be either a metric from scipy.spatial.distance, or a custom
        metric defined in ensign.distance.

    Returns
    -------
    result : dict
        The key to the dictionary is the analysis type, the value is the result of that analysis.
            'number' : float - The distance between the two decompositions, each flattened
                               into a single vector.
            'mapping' : list - A list of [i, j, distance] entries, where i and j are positions
                               in the selected component lists of decomps[0] and decomps[1].
                               There is one entry for every pair within the threshold. A
                               component of decomps[0] with no similar component appears as
                               [i, -1, 0]; a component of decomps[1] with no similar component
                               appears as [-1, j, 0].
            'similarity' : numpy.ndarray - The similarity matrix showing comparison between all specified
                                           components of decomps[0] and all specified components of decomps[1]

    Raises
    ------
    DecompDiffParameterError
        If the number of decompositions or component lists is wrong, or if an analysis type,
        metric, threshold or mode identifier is invalid.
    DecompDiffIncomparableError
        If the selected modes of the two decompositions do not match in size.
    """
    logger.debug('Starting decomp_diff')
    logger.debug('  decomps: ' + str(decomps))
    logger.debug('  analyses: ' + str(analyses))
    logger.debug('  modes: ' + str(modes))
    logger.debug('  components: ' + str(components))
    logger.debug('  threshold: ' + str(threshold))
    logger.debug('  ignore_weights: ' + str(ignore_weights))
    # str() so that an invalid (non-string) metric reaches validation instead of failing here.
    logger.debug('  metric: ' + str(metric))

    logger.debug('Validating parameters')
    # None stands in for the default list of analyses so that no mutable default argument
    # is shared between calls. See https://docs.python-guide.org/writing/gotchas/
    if analyses is None:
        logger.debug("Using Default Analysis type: {type}".format(type=DEFAULT_ANALYSIS_TYPE))
        analyses = [DEFAULT_ANALYSIS_TYPE]

    # Future work: Expand ability of this tool to compare more than two decompositions
    if not decomps or len(decomps) != 2:
        errmsg = ERROR_INSUFFICIENT_DECOMPS
        logger.error(errmsg)
        raise DecompDiffParameterError(param='decomps', value=decomps, error=errmsg)

    # A default value of None for components means that all components of each decomposition will be compared.
    if components is None:
        logger.debug("Using all components in each decomposition")
        components = [list(range(decomp.rank)) for decomp in decomps]

    # A default value of None for modes means that all modes of each decomposition will be compared.
    if modes is None:
        logger.debug("Using all modes in each decomposition")
        modes = list(range(decomps[0].order))

    # The number of component lists must be the same as the number of decomps.
    if len(decomps) != len(components):
        errmsg = ERROR_INSUFFICIENT_COMPONENTS
        logger.error(errmsg)
        raise DecompDiffParameterError(param='components', value=components, error=errmsg)

    _validate_analyses(analyses)
    _validate_metric(metric)
    _validate_threshold(threshold)
    # Validation accepts anything float() can parse (e.g. "0.5"); normalise it so the
    # later numeric comparison against matrix entries cannot fail on a string.
    threshold = float(threshold)

    logger.debug('Reading decompositions, determining modes and components')
    diff_decomps = []
    for decomp_id, (decomp, decomp_component) in enumerate(zip(decomps, components)):
        ddc = DiffCPDecomp()
        ddc.decomp = decomp
        ddc.decomp_id = decomp_id
        _validate_mode_selection(modes, ddc)
        logger.debug("    Selected modes: {modes}".format(modes=str(ddc.diff_modes)))
        _validate_component_selection(decomp_component, ddc)
        logger.debug("    Selected components: {comps}".format(comps=str(ddc.diff_components)))
        if not ignore_weights:
            # NOTE(review): the root taken here is 1/rank (number of components). If the intent is to
            # spread each weight across the per-mode factor vectors, 1/order may be meant - confirm.
            ddc.nth_root_weights = [w ** (1.0 / ddc.decomp.rank) for w in ddc.decomp.weights]
        diff_decomps.append(ddc)

    logger.debug('Determining comparability of decompositions')
    _assert_comparability(diff_decomps)

    result = {}
    # The mapping analysis is derived from the similarity matrix, so compute it once for both.
    if 'similarity' in analyses or 'mapping' in analyses:
        logger.debug('computing similarity matrix')
        similarity_matrix = _compute_similarity_matrix(diff_decomps, metric, ignore_weights)
        if 'similarity' in analyses:
            result['similarity'] = similarity_matrix
        if 'mapping' in analyses:
            logger.debug('Computing similarity mappings')
            result['mapping'] = _compute_similarity_mappings(similarity_matrix, threshold)
    if 'number' in analyses:
        logger.debug('Computing similarity between decompositions')
        result['number'] = _compute_similarity_number(diff_decomps, metric, ignore_weights)

    logger.debug('Finished decomp_diff')
    return result