#!/usr/bin/env python
# ENSIGN rights
import numpy as np
import scipy.spatial.distance as ssd
import ensign.distance as ed
from ensign.constants import DTYPE
from ensign import cp_decomp as cpd
import ensign.ensign_io.ensign_logging as logging
logger = logging.get_logger()
DEFAULT_THRESHOLD = 0.6
IGNORE_WEIGHTS = False
SUPPORTED_SCIPY_METRICS = ['braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine',
'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon', 'kulsinski',
'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'sokalmichener',
'sokalsneath', 'sqeuclidean', 'yule']
DEFAULT_METRIC = "cosine"
SUPPORTED_ANALYSIS_TYPES = ['similarity', 'mapping', 'number']
DEFAULT_ANALYSIS_TYPE = "similarity"
# Error strings (used in exceptions and logging)
ERROR_DEFAULT = 'An error occurred with decomp_diff'
ERROR_INCOMPARABLE = 'Decompositions are not comparable: {error}'
ERROR_PARAM = 'Invalid parameter value: {param}={value}, {error}'
ERROR_CSR = 'Invalid syntax for comma-separated range value: {csr}'
ERROR_DECOMP_READ = 'Unable to read decomposition data from {path}: {error}'
ERROR_INVALID_ANALYSIS_TYPE = 'Invalid analysis type: {type}'
ERROR_UNSUPPORTED_METRIC = 'Unsupported metric: {metric}'
ERROR_INVALID_THRESHOLD = 'Invalid value for threshold: {value}. Threshold must be a non-negative float value.'
ERROR_INSUFFICIENT_DECOMPS = 'Two decompositions are required for comparison'
ERROR_INSUFFICIENT_COMPONENTS = 'Number of component specifiers does not match number of decompositions'
ERROR_MODE_COUNT = 'Incompatible number of modes ({actual}) for decomposition in {id}; should be {expected}'
ERROR_MODE_SIZE = 'Incompatible size({actual}) for mode {mode} in {id}; should be {expected}'
ERROR_MODE_SHAPE = 'Incompatible shape ({actual}) for mode {mode} in {id}; should be {expected}'
ERROR_MODE_OUT_OF_BOUNDS = 'Mode {mode} ID is greater than or equal to tensor rank {order} in tensor ID {id} (from files {filenames})'
ERROR_COMPONENT_OUT_OF_BOUNDS = 'Component ID {selection} is invalid for tensor ID {id} with tensor rank {rank} (from files {filenames}). Skipping.'
class DiffCPDecomp:
"""
An organizational helper class that associates a CPDecomp with its ID, the subset
of components and modes used to compare it to another, and the n-th root of the decomp weights.
"""
def __init__(self):
self.decomp = None
self.decomp_id = None
self.diff_modes = []
self.diff_components = []
self.nth_root_weights = []
# TODO: Consider expanding this custom exception hierarchy to make it usable across all ensign tools.
class DecompDiffError(Exception):
"""
The base exception for the Decomposition Diff module. Sub-classes should redefine 'message'
"""
message = ERROR_DEFAULT
def __init__(self, **kwargs):
msg = self.message.format(**kwargs)
super().__init__(msg)
self.kwargs = kwargs
class DecompDiffIncomparableError(DecompDiffError):
""" Exception raised when decompositions are not comparable.
"""
message = ERROR_INCOMPARABLE
class DecompDiffParameterError(DecompDiffError):
""" Exception raised when an invalid parameter value is provided.
"""
message = ERROR_PARAM
class DecompDiffCsrSyntaxError(DecompDiffParameterError):
""" Exception raised when a syntactically invalid comma-separated range value is detected
"""
message = ERROR_CSR
class DecompDiffReadError(DecompDiffError):
""" Exception raised when reading a decomposition.
"""
message = ERROR_DECOMP_READ
def _validate_analyses(analyses):
if not analyses:
logger.error("No analysis types given")
raise DecompDiffParameterError(param='analyses', value=analyses, error='No analysis types given')
for analysis in analyses:
if analysis not in SUPPORTED_ANALYSIS_TYPES:
errmsg = ERROR_INVALID_ANALYSIS_TYPE.format(type=analysis)
logger.error(errmsg)
raise DecompDiffParameterError(param='analyses', value=analyses, error=errmsg)
def _validate_metric(metric):
if metric not in SUPPORTED_SCIPY_METRICS + ed.SUPPORTED_ENSIGN_METRICS:
errmsg = ERROR_UNSUPPORTED_METRIC.format(metric=metric)
logger.error(errmsg)
raise DecompDiffParameterError(param='metric', value=metric, error=errmsg)
def _validate_threshold(threshold):
try:
thresh_float_val = float(threshold)
if thresh_float_val < 0.0:
errmsg = ERROR_INVALID_THRESHOLD.format(value=threshold)
logger.error(errmsg)
raise DecompDiffParameterError(param='threshold', value=threshold, error=errmsg)
except ValueError:
errmsg = ERROR_INVALID_THRESHOLD.format(value=threshold)
logger.error(errmsg)
raise DecompDiffParameterError(param='threshold', value=threshold, error=errmsg)
def _validate_mode_selection(mode_selection, ddc):
if isinstance(mode_selection, int) and not isinstance(mode_selection, list):
mode_selection = [mode_selection]
for mode_id in mode_selection:
if mode_id >= ddc.decomp.order or mode_id < 0:
errmsg = ERROR_MODE_OUT_OF_BOUNDS.format(mode=mode_id, order=ddc.decomp.order, id=ddc.decomp_id, filenames=ddc.decomp.filenames)
logger.error(errmsg)
raise DecompDiffParameterError(param='modes', value=mode_selection, error=errmsg)
ddc.diff_modes = mode_selection
def _validate_component_selection(component_selection, ddc):
if isinstance(component_selection, int) and not isinstance(component_selection, list):
component_selection = [component_selection]
valid_components = []
for component_id in component_selection:
if component_id >= ddc.decomp.rank or component_id < 0:
errmsg = ERROR_COMPONENT_OUT_OF_BOUNDS.format(selection=component_id, rank=ddc.decomp.rank,
id=ddc.decomp_id, filenames = ddc.decomp.filenames)
logger.warning(errmsg)
else:
valid_components.append(component_id)
ddc.diff_components = valid_components
def _assert_comparability(decomps):
if len(decomps) != 2:
errmsg = ERROR_INSUFFICIENT_DECOMPS
logger.error(errmsg)
raise DecompDiffIncomparableError(error=errmsg)
decomp_0_mode_sizes = decomps[0].decomp.mode_sizes
decomp_0_factors = decomps[0].decomp.factors
decomp_0_diff_modes = decomps[0].diff_modes
for ddc in decomps[1:]:
# The number of modes used in the comparison must be same for all decompositions
expected = len(decomp_0_diff_modes)
actual = len(ddc.diff_modes)
if expected != actual:
errmsg = ERROR_MODE_COUNT.format(actual=actual, id=ddc.decomp_id, expected=expected)
logger.error(errmsg)
raise DecompDiffIncomparableError(error=errmsg)
for mode_id in range(0, len(decomps[0].diff_modes)):
# The mode size of each mode must be the same for all decompositions
expected = decomp_0_mode_sizes[decomp_0_diff_modes[mode_id]]
actual = ddc.decomp.mode_sizes[ddc.diff_modes[mode_id]]
if expected != actual:
errmsg = ERROR_MODE_SIZE.format(actual=actual, mode=ddc.diff_modes[mode_id], id=ddc.decomp_id,
expected=expected)
logger.error(errmsg)
raise DecompDiffIncomparableError(error=errmsg)
# The shape of each factor matrix must be the same for all decompositions, only in the number of values.
expected = decomp_0_factors[decomp_0_diff_modes[mode_id]].shape[0]
actual = ddc.decomp.factors[ddc.diff_modes[mode_id]].shape[0]
if expected != actual:
errmsg = ERROR_MODE_SHAPE.format(actual=actual, mode=ddc.diff_modes[mode_id],
id=ddc.decomp_id, expected=expected)
logger.error(errmsg)
raise DecompDiffIncomparableError(error=errmsg)
def _compute_similarity_matrix(decomps, metric, ignore_weights):
component_subsets = [len(ddc.diff_components) for ddc in decomps]
sm = np.zeros(component_subsets)
metric_fn = ssd.cosine
if metric in SUPPORTED_SCIPY_METRICS:
metric_fn = getattr(ssd, metric)
elif metric in ed.SUPPORTED_ENSIGN_METRICS:
metric_fn = getattr(ed, metric)
# Future Work: Generalize this to compare the components of N decompositions
d0 = decomps[0]
d1 = decomps[1]
for i, comp_0 in enumerate(d0.diff_components, 0):
for j, comp_1 in enumerate(d1.diff_components, 0):
comp_vector_0 = np.concatenate(
[d0.decomp.factors[mode_id][:, comp_0] for mode_id in d0.diff_modes])
comp_vector_1 = np.concatenate(
[d1.decomp.factors[mode_id][:, comp_1] for mode_id in d1.diff_modes])
if not ignore_weights:
comp_vector_0 = comp_vector_0 * d0.nth_root_weights[comp_0]
comp_vector_1 = comp_vector_1 * d1.nth_root_weights[comp_1]
sm[i, j] = metric_fn(comp_vector_0, comp_vector_1)
return sm
def _compute_similarity_mappings(similarity_matrix, threshold):
# Find components from decomp_1 (cols) that are 'similar' to components in decomp_0 (rows) and save
# their indexes and distance. Also, find unique components in decomp_1 - those that have no 'similar'
# components in decomp_0.
similar_component_mappings = []
unique_components = [True] * similarity_matrix.shape[1]
for i in range(0, similarity_matrix.shape[0]):
similar_comp_found = False
for j in range(0, similarity_matrix.shape[1]):
if similarity_matrix[i, j] <= threshold:
similar_comp_found = True
similar_component_mappings.append([i, j, similarity_matrix[i, j]])
unique_components[j] = False
if not similar_comp_found:
similar_component_mappings.append([i, -1, 0])
# Add in the unique components from decomp 1.
for j, is_unique in enumerate(unique_components, 0):
if is_unique:
similar_component_mappings.append([-1, j, 0])
return similar_component_mappings
def _compute_similarity_number(decomps, metric='cosine', ignore_weights=False):
# The maximum number of components to be used for any decomposition is the minimum number of
# components specified, for all decompositions.
max_comps_per_decomp = min(len(ddc.diff_components) for ddc in decomps)
# Create a single vector for each decomposition that consists of all the specified modes and components
decomp_vectors = []
for ddc in decomps:
# Restrict the components for this decomposition to the maximum allowed. In this way, we get the
# same sized vector used in comparisons between decompositions.
components_for_decomp = ddc.diff_components[0:max_comps_per_decomp]
# Construct the subset of factors for this decomp based on the subset of modes and the subset of components.
# Similar to decomp.factors, this is a list of np.array objects. The difference being there are only factor
# matrices for the modes specified, and the rank is reduced to only those components that are specified.
factors_subset = [ddc.decomp.factors[mode_id][:, components_for_decomp] for mode_id in ddc.diff_modes]
if not ignore_weights:
for factor_sub_matrix in factors_subset:
nth_root_weights = ddc.decomp.weights[components_for_decomp]
factor_sub_matrix.__imul__(nth_root_weights)
decomp_vectors.append(np.concatenate(factors_subset).flatten())
# Compute metric between every pair of vectors.
metric_fn = metric
if metric in SUPPORTED_SCIPY_METRICS:
metric_fn = metric
elif metric in ed.SUPPORTED_ENSIGN_METRICS:
metric_fn = getattr(ed, metric)
distances = ssd.pdist(np.array(decomp_vectors, dtype=DTYPE), metric=metric_fn)
# Future Work: We're currently assuming only two decompositions are being compared by this tool. This fn can handle
# an arbitrary number, in which case it could return the condensed distance matrix from pdist().
return distances[0]
[docs]def decomp_diff(decomps, analyses=None, modes=None,
components=None, threshold=DEFAULT_THRESHOLD,
ignore_weights=IGNORE_WEIGHTS, metric=DEFAULT_METRIC):
"""
Determine differences(similarities) between decompositions.
Parameters
----------
decomps : [ensign.CPDecomp]
A list of CPDecomp objects, one for each decomposition to be compared.
analyses : [string]
A list of analysis types. One or more of: 'number', 'mapping', 'similarity'
modes : [int]
A list of mode identifiers to be used in the comparison. Default: all modes are compared.
components : [[int]]
One per decomposition, a list of component identifiers to be used in comparison.
Default: all components are compared for every decomposition.
threshold : float
The threshold value that determines similarity when determining similar components
ignore_weights : bool
Either ignore (True) or use (False) decomposition weights when comparing decompositions. Default: False
metric : string
The name of metric to be used in comparisons. Can be either a metric from scipy.spatial.distance, or a custom
metric defined in ensign.distance.
Returns
-------
result : dict
The key to the dictionary is the analysis type, the value is the result of that analysis.
'number' : float - The result of comparing each decomposition
'mapping' : dict - For each component in decomp[0] show the most similar component(s) in "
"decomp[1] within the given threshold. A unique component in decomp[0] will "
"map to an empty list. Also, map components from decomp[1] with no similar "
"component in decomp[0], within the threshold.
'similarity' : numpy.ndarray - The similarity matrix showing comparision between all specified
components of decomp[0] and all specified components of decomp[1]
"""
logger.debug('Starting decomp_diff')
logger.debug(' decomps: ' + str(decomps))
logger.debug(' analyses: ' + str(analyses))
logger.debug(' modes: ' + str(modes))
logger.debug(' components: ' + str(components))
logger.debug(' threshold: ' + str(threshold))
logger.debug(' ignore_weights: ' + str(ignore_weights))
logger.debug(' metric: ' + metric)
logger.debug('Validating parameters')
# The default list of analyses to perform. This is a mutable default argument.
# See https://docs.python-guide.org/writing/gotchas/
if analyses is None:
logger.debug("Using Default Analysis type: {type}".format(type=DEFAULT_ANALYSIS_TYPE))
analyses = [DEFAULT_ANALYSIS_TYPE]
# Future work: Expand ability of this tool to compare more than two decompositions
if not decomps or len(decomps) != 2:
errmsg = ERROR_INSUFFICIENT_DECOMPS
logger.error(errmsg)
raise DecompDiffParameterError(param='decomps', value=decomps, error=errmsg)
# A default value of None for components means that all components of each decomposition will be compared.
if components is None:
logger.debug("Using all components in each decomposition")
components = [list(range(0, decomp.rank)) for decomp in decomps]
# A default value of None for modes means that all modes of each decomposition will be compared.
if modes is None:
logger.debug("Using all modes in each decomposition")
modes = list(range(0, decomps[0].order))
# If specified, the number of component strings must be the same as the number of decomps.
if components is not None and len(decomps) != len(components):
errmsg = ERROR_INSUFFICIENT_COMPONENTS
logger.error(errmsg)
raise DecompDiffParameterError(param='components', value=components, error=errmsg)
_validate_analyses(analyses)
_validate_metric(metric)
_validate_threshold(threshold)
logger.debug('Reading decompositions, determining modes and components')
diff_decomps = []
for decomp_id, (decomp, decomp_component) in enumerate(zip(decomps, components)):
ddc = DiffCPDecomp()
ddc.decomp = decomp
ddc.decomp_id = decomp_id
_validate_mode_selection(modes, ddc)
logger.debug(" Selected modes: {modes}".format(modes=str(ddc.diff_modes)))
_validate_component_selection(decomp_component, ddc)
logger.debug(" Selected components: {comps}".format(comps=str(ddc.diff_components)))
if not ignore_weights:
ddc.nth_root_weights = [w ** (1.0 / ddc.decomp.rank) for w in ddc.decomp.weights]
diff_decomps.append(ddc)
logger.debug('Determining comparability of decompositions')
_assert_comparability(diff_decomps)
result = {}
if 'similarity' in analyses or 'mapping' in analyses:
logger.debug('computing similarity matrix')
similarity_matrix = _compute_similarity_matrix(diff_decomps, metric, ignore_weights)
if 'similarity' in analyses:
result['similarity'] = similarity_matrix
if 'mapping' in analyses:
logger.debug('Computing similarity mappings')
result['mapping'] = _compute_similarity_mappings(similarity_matrix, threshold)
if 'number' in analyses:
logger.debug('Computing similarity between decompositions')
result['number'] = _compute_similarity_number(diff_decomps, metric, ignore_weights)
logger.debug('Finished decomp_diff')
return result