# ENSIGN rights
"""CANDECOMP-PARAFAC (CP) sparse tensor decomposition tools.
This module contains functions for reading, writing, and performing CP
decompositions along with a class for representing decomposition results.
"""
from ctypes import *
import functools
import json
import multiprocessing as mltprc
import os
import random
import re
import string
import sys
import numpy as np
import pandas as pd
from ensign.constants import *
import ensign.constants as C
import ensign.ensign_ctypes.ensign_types as et
import ensign.ensign_io.ensign_logging as ensign_logging
import ensign.ensign_io.decomp_io as dio
import ensign.ensign_io.sptensor_io as sio
import ensign.sptensor as spt
# Shared library loaded once, when this module is imported.
# NOTE(review): presumably exposes the native decomposition entry points - confirm.
API = cdll.LoadLibrary('libapi.so')
# Used when calculating backtrack data
# Determines which values (relative to max in mode) should be considered 0
REL_CHOP_POINT=1e-4
# Size in bytes of one element of NumPy's default float dtype.
DOUBLE_BYTES = np.dtype(float).itemsize
# Integer identifiers for the available decomposition algorithms.
# NOTE(review): presumably these mirror an enum in the native library - confirm.
CP_ALS = 0
CP_ALS_NN = 1
CP_APR = 2
CP_APR_PDNR = 3
CP_APR_PQNR = 4
# NOTE(review): looks like a status code reported on allocation failure - confirm.
MALLOC_ERROR = 3
# Logger shared by every function in this module.
logger = ensign_logging.get_logger()
class StreamingDatalessFit:
    """Quantities needed to compute the fit of a streaming decomposition
    without access to the base tensor's data values.

    Attributes
    ----------
    norm : float
        L2 norm of the base tensor.
    residual_norm : float
        Residual L2 norm from fitting the base tensor to the base tensor
        decomposition.
    inner_product : numpy.ndarray (float)
        Terms constituting the inner product of the base tensor and the
        tensor reconstructed from the factor matrices of the base tensor
        decomposition.
    """

    def __init__(self, norm, res_norm, inner_product):
        """Store the three dataless-fit quantities.

        Parameters
        ----------
        norm : float
            Stored as ``self.norm``.
        res_norm : float
            Stored as ``self.residual_norm``.
        inner_product : array-like
            Stored as ``self.inner_product``.
        """
        self.inner_product = inner_product
        self.residual_norm = res_norm
        self.norm = norm

    def __eq__(self, other):
        """Return True only for an object of the exact same type whose norm,
        residual norm and inner-product terms all match exactly."""
        if type(self) != type(other):
            return False
        scalars_match = (self.norm == other.norm
                         and self.residual_norm == other.residual_norm)
        return bool(scalars_match
                    and np.array_equal(self.inner_product, other.inner_product))

    def __str__(self):
        """Return a three-line, human-readable summary of the stored values."""
        return f'Norm: {self.norm}\nResidual-Norm: {self.residual_norm}\nInner-Product: {self.inner_product}'
class CPDecomp:
    """Represents the results of a CP decomposition.

    Attributes
    ----------
    rank : int
        Rank of decomposition.
    order : int
        Order of decomposed tensor.
    sptensor : SPTensor
        SPTensor that was decomposed.
    weights : numpy.ndarray
        1D array of length ``rank`` containing weights for each component of
        the decomposition. ``None`` until a decomposition is loaded/computed.
    factors : list of numpy.ndarray
        List of 2D factor matrices accessed by ``mode_id``. Each factor matrix
        has ``shape = (mode_size, rank)`` and ``dtype = "float64"``. All values
        in a factor matrix lie inside the closed interval ``[-1, 1]``. Each
        column of a factor matrix represents a single component.
    factors_init : list of numpy.ndarray
        List of 2D factor matrices as initialized for the decomposition. Only
        saved if requested by the decomposition call.
    labels : list of list of str, optional
        An optional list of per-mode index labels accessed by ``mode_id``. Each
        list of labels corresponds to the labels for each index of the mode.
        Suppose we have a mode with ``mode_id = 3`` and ``"Yes"``, ``"No"``,
        ``"Maybe"`` mapped to indices 0-2. For this mode, ``labels[3]`` is
        equal to ``["Yes", "No", "Maybe"]``. Decompositions are not required to
        have labels.
    mode_names : list of str, optional
        Each mode's name accessed by ``mode_id``. Decompositions are not
        required to have mode names.
    mode_sizes : list of int
        The number of indices in each mode accessed by ``mode_id``.
    streaming_dataless_fit : StreamingDatalessFit
        Used by decomposition algorithms in a streaming setting to determine
        final fit.
    metrics : dict
        Dictionary with keys: [time, fit, cosine_sim, norm_scaling, cp_total_iter]
        representing the time taken to run the decomposition, the final fit of
        the decomposition, the cosine similarity of the decomposition to the
        original tensor, the scale factor used to normalize the decompositions,
        and the number of optimization steps used to reach the final
        decomposition values, respectively.
    cpd_backtrack : list of lists of int
        One entry per component. Each list contains tensor entries that
        contributed to the corresponding component. If CPDecomp object has two
        components, and the first one tracks to tensor entries 0 and 2 and the
        second tracks to tensor entries 1 and 3, then the backtrack will be
        [[0,2], [1,3]]. ``None`` unless requested in the decomposition call or
        computed with ``compute_cpd_backtrack``.

    See also
    --------
    ensign.sptensor.SPTensor : Sparse tensor class
    """

    def __init__(self):
        """Create an empty decomposition (rank 0, order 0, no data)."""
        self.filenames = None
        self.rank = 0
        self.order = 0
        self.sptensor = None
        self.weights = None
        self.streaming_dataless_fit = None
        self.factors = []
        self.labels = []
        self.mode_names = []
        self.mode_sizes = []
        self.factors_init = None
        self.metrics = None
        # Always present so that consumers can test ``is None`` instead of
        # hitting AttributeError on decompositions without backtrack data.
        self.cpd_backtrack = None

    def __eq__(self, d):
        """Exact equality of rank, order, weights and factor matrices.

        The streaming dataless fit is compared only when both objects carry
        one. Objects of a different type are never equal.
        """
        if type(self) != type(d):
            return False
        if self.rank != d.rank or self.order != d.order:
            return False
        if len(self.factors) != len(d.factors):
            return False
        # Weights may legitimately be None on an empty decomposition.
        if self.weights is None or d.weights is None:
            if self.weights is not d.weights:
                return False
        elif not np.array_equal(self.weights, d.weights):
            return False
        if (self.streaming_dataless_fit is not None
                and d.streaming_dataless_fit is not None
                and not (self.streaming_dataless_fit == d.streaming_dataless_fit)):
            return False
        # np.array_equal also rejects factor matrices whose shapes differ.
        return all(np.array_equal(mine, theirs)
                   for mine, theirs in zip(self.factors, d.factors))

    def __ne__(self, d):
        """Logical negation of ``__eq__``."""
        return not self.__eq__(d)

    def __str__(self):
        """Return a JSON string with the rank, order and weights."""
        weights = None if self.weights is None else self.weights.tolist()
        ret = {'rank': self.rank, 'order': self.order, 'weights': weights}
        return json.dumps(ret)

    def compute_cpd_backtrack(self, out_dir=None):
        """Computes CP Decomposition backtracking information.

        The result is stored in ``self.cpd_backtrack``.

        Parameters
        ----------
        out_dir : str
            Directory to save backtracking information in. Nothing is written
            when omitted or empty.
        """
        self.cpd_backtrack = _compute_cpd_backtrack(self)
        if out_dir:
            dio.write_decomp_backtrack(out_dir, self.cpd_backtrack)
def _get_sparse_co_tensor(py_sptensor):
    """Build the ctypes ``et.C_SparseCoTensor`` struct for a Python sparse tensor.

    Parameters
    ----------
    py_sptensor : SPTensor
        Object providing ``order``, ``nnz``, ``mode_sizes`` and an ``entries``
        table whose leading columns are coordinates and whose column at
        position ``order`` holds the entry values.

    Returns
    -------
    sparse_co_tensor : et.C_SparseCoTensor
        Struct populated with the order, nonzero count, mode sizes, value
        pointer and coordinate pointer built below.
    """
    order = c_long(py_sptensor.order)
    nnz = c_long(py_sptensor.nnz)
    mode_sizes = (c_long * py_sptensor.order)(*py_sptensor.mode_sizes)
    # Values: column ``order`` of the entries table, converted to float64.
    py_entries = py_sptensor.entries.values[:, py_sptensor.order].astype(np.float64)
    entries = py_entries.ctypes.data_as(POINTER(c_double))
    # Coordinates: every column but the last, converted to int64 and flattened
    # row by row into a single 1D buffer.
    py_coord_index = py_sptensor.entries.values[:, :-1].astype(np.int64).ravel()
    # Array of ``nnz`` pointer slots; only slot 0 is filled, with the address
    # of the flattened coordinate buffer.
    # NOTE(review): presumably the native side derives the remaining row
    # pointers from slot 0 - confirm. With nnz == 0 the array has no slot 0.
    index = cast((POINTER(c_long) * py_sptensor.nnz)(), POINTER(POINTER(c_long)))
    index[0] = py_coord_index.ctypes.data_as(POINTER(c_long))
    # NOTE(review): the NumPy buffers above are locals; confirm the returned
    # struct keeps them alive for as long as the native code uses it.
    sparse_co_tensor = et.C_SparseCoTensor(nModes=order, nnz=nnz, size=mode_sizes, NNZ=entries, SM=index)
    return sparse_co_tensor
def _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options):
    """Convert native decomposition results into a ``CPDecomp`` object.

    Components are reordered so that weights are in descending order; the
    factor-matrix columns and the streaming inner-product terms are permuted
    the same way.

    Parameters
    ----------
    ktensor : ctypes structure
        Provides the ``lambda`` weights and the per-mode ``matrices``.
    sptensor : SPTensor
        Tensor that was decomposed; supplies order, mode sizes, names, labels.
    rank : int
        Number of components.
    metrics : ctypes structure
        Provides ``time``, ``fit``, ``cosine_sim``, ``norm_scaling`` and
        ``cp_total_iter``.
    options : ctypes structure
        Provides the ``streaming_dataless_fit`` pointer.

    Returns
    -------
    d : CPDecomp
        Populated decomposition object.
    """
    result = CPDecomp()
    result.rank = rank
    result.order = sptensor.order

    # 'lambda' is a Python keyword, hence the getattr access.
    raw_weights = np.array([getattr(ktensor, 'lambda')[comp] for comp in range(rank)])
    by_descending_weight = np.argsort(raw_weights)[::-1]
    result.weights = raw_weights[by_descending_weight]

    result.factors = []
    for mode_id in range(result.order):
        native_view = np.ctypeslib.as_array(
            ktensor.matrices[mode_id][0].data[0],
            shape=(sptensor.mode_sizes[mode_id], rank))
        # Fancy indexing reorders the columns and yields a new array.
        result.factors.append(native_view[:, by_descending_weight])

    inner_product_terms = [options.streaming_dataless_fit.contents.ip[comp]
                           for comp in by_descending_weight]
    result.streaming_dataless_fit = StreamingDatalessFit(
        options.streaming_dataless_fit.contents.norm,
        options.streaming_dataless_fit.contents.res_norm,
        np.array(inner_product_terms),
    )

    result.mode_names = sptensor.mode_names
    result.mode_sizes = sptensor.mode_sizes
    result.labels = sptensor.labels
    result.sptensor = sptensor

    result.metrics = {
        'time': metrics.time,
        'fit': metrics.fit,
        'cosine_sim': metrics.cosine_sim,
        'norm_scaling': metrics.norm_scaling,
        'cp_total_iter': metrics.cp_total_iter,
    }
    return result
def _get_init_guess(py_decomp, options):
""" Adds initial guesses for factor matrices to CPDecomp object
Copies initial factor matrix guesses from options to CPDecomp object.
Parameters
----------
py_decomp : CPDecomp
CPDecomp object defined in this module.
options : decomp_options
decomp_options struct as defined in 'ensign_types.h'
"""
py_decomp.factors_init = [np.zeros((py_decomp.mode_sizes[i], py_decomp.rank)) for i in range(py_decomp.order)]
for i in range(py_decomp.order):
for j in range(py_decomp.mode_sizes[i]):
for k in range(py_decomp.rank):
py_decomp.factors_init[i][j][k] = options.factor_matrices_initial_guess[i][j * py_decomp.rank + k]
def _get_ctype_decomp_options_obj(seed=0, verbose=False, memory_limit=2e9,
                                  factor_matrices_initial_guess=POINTER(POINTER(c_double))(),
                                  streaming_dataless_fit=POINTER(et.C_StreamingDatalessFit)(),
                                  output_init_guess=False,
                                  cp_als_memory_limit=C.CP_ALS_MEM_LIMIT_GB*(1024**3),
                                  cp_als_max_iter=C.CP_ALS_MAX_ITER,
                                  cp_als_stop_tol=C.CP_ALS_STOP_TOL,
                                  cp_als_nn_max_iter=C.CP_ALS_NN_MAX_ITER,
                                  cp_als_nn_memory_limit=C.CP_ALS_MEM_LIMIT_GB*(1024**3),
                                  cp_als_nn_stop_tol=C.CP_ALS_NN_STOP_TOL,
                                  cp_apr_memory_limit=C.CP_APR_MEM_LIMIT_GB*(1024**3),
                                  cp_apr_max_outer_iter=C.CP_APR_MAX_OUTER_ITER,
                                  cp_apr_max_inner_iter=C.CP_APR_MAX_INNER_ITER,
                                  cp_apr_stop_tol=C.CP_APR_STOP_TOL,
                                  cp_apr_pqnr_memory_limit=C.CP_APR_PQNR_MEM_LIMIT_GB*(1024**3),
                                  cp_apr_pqnr_max_outer_iter=C.CP_APR_PQNR_MAX_OUTER_ITER,
                                  cp_apr_pqnr_max_inner_iter=C.CP_APR_PQNR_MAX_INNER_ITER,
                                  cp_apr_pqnr_stop_tol=C.CP_APR_PQNR_STOP_TOL,
                                  cp_apr_pqnr_is_inexact=C.CP_APR_PQNR_IS_INEXACT,
                                  cp_apr_pqnr_lbfgs_m=C.CP_APR_PQNR_LBFGS_M,
                                  cp_apr_pqnr_skooch_mode=C.CP_APR_PQNR_SKOOCH_MODE,
                                  cp_apr_pdnr_memory_limit=C.CP_APR_PDNR_MEM_LIMIT_GB*(1024**3),
                                  cp_apr_pdnr_max_outer_iter=C.CP_APR_PDNR_MAX_OUTER_ITER,
                                  cp_apr_pdnr_max_inner_iter=C.CP_APR_PDNR_MAX_INNER_ITER,
                                  cp_apr_pdnr_stop_tol=C.CP_APR_PDNR_STOP_TOL,
                                  cp_apr_pdnr_is_inexact=C.CP_APR_PDNR_IS_INEXACT):
    """Pack decomposition options into an ``et.C_DecompOptions`` struct.

    Every argument is wrapped in the ctypes type used below and handed to the
    struct constructor positionally, so the order of the groups and of the
    items inside each group must not change. Memory limits are truncated to
    integers before being wrapped.

    Returns
    -------
    et.C_DecompOptions
        Struct initialised from the given values.
    """
    # Options shared by all algorithms.
    common_fields = (
        c_int(seed),
        c_int(verbose),
        c_long(int(memory_limit)),
        c_int(output_init_guess),
        factor_matrices_initial_guess,
        streaming_dataless_fit,
    )
    als_fields = (
        c_long(int(cp_als_memory_limit)),
        c_long(cp_als_max_iter),
        c_double(cp_als_stop_tol),
    )
    # Note: max_iter precedes memory_limit in this group.
    als_nn_fields = (
        c_long(cp_als_nn_max_iter),
        c_long(int(cp_als_nn_memory_limit)),
        c_double(cp_als_nn_stop_tol),
    )
    apr_fields = (
        c_long(int(cp_apr_memory_limit)),
        c_long(cp_apr_max_outer_iter),
        c_long(cp_apr_max_inner_iter),
        c_double(cp_apr_stop_tol),
    )
    apr_pqnr_fields = (
        c_long(int(cp_apr_pqnr_memory_limit)),
        c_long(cp_apr_pqnr_max_outer_iter),
        c_long(cp_apr_pqnr_max_inner_iter),
        c_double(cp_apr_pqnr_stop_tol),
        c_long(cp_apr_pqnr_is_inexact),
        c_long(cp_apr_pqnr_lbfgs_m),
        c_long(int(cp_apr_pqnr_skooch_mode)),
    )
    apr_pdnr_fields = (
        c_long(int(cp_apr_pdnr_memory_limit)),
        c_long(cp_apr_pdnr_max_outer_iter),
        c_long(cp_apr_pdnr_max_inner_iter),
        c_double(cp_apr_pdnr_stop_tol),
        c_long(cp_apr_pdnr_is_inexact),
    )
    return et.C_DecompOptions(*common_fields,
                              *als_fields,
                              *als_nn_fields,
                              *apr_fields,
                              *apr_pqnr_fields,
                              *apr_pdnr_fields)
def _get_decomp_metrics():
    """Return a new ``et.C_DecompMetrics`` struct with every field zeroed."""
    zeroed_fields = (0.0, 0.0, 0.0, 0.0, 0)
    return et.C_DecompMetrics(*zeroed_fields)
def _get_k_tensor(order, rank):
    """Build an ``et.C_KTensor`` struct for ``order`` modes and ``rank`` components.

    Parameters
    ----------
    order : int
        Number of modes; also the number of factor-matrix pointer slots.
    rank : int
        Number of components; also the length of the weight buffer.

    Returns
    -------
    et.C_KTensor
        Struct whose weights are all 1.0, whose factor-matrix slots point at a
        default-constructed ``et.C_FactorMatrix`` and whose scratch-matrix
        pointer is null.
    """
    num_modes = c_long(order)
    num_factors = c_long(rank)
    # Weight buffer of length ``rank`` initialised to 1.0.
    weights = POINTER(c_double)((c_double * rank)(*([1.0] * rank)))
    # Array of pointers
    matrices = (POINTER(et.C_FactorMatrix) * order)
    # Instantiate array of pointers
    # NOTE(review): the list is built by repeating ONE pointer object, so every
    # slot initially refers to the same C_FactorMatrix instance. Presumably the
    # native code replaces each slot with its own matrix - confirm.
    matrices = matrices(*([pointer(et.C_FactorMatrix())] * order))
    # Cast to double pointer
    matrices = POINTER(POINTER(et.C_FactorMatrix))(matrices)
    # Null pointer; no scratch matrix is allocated here.
    scratch_matrix = POINTER(et.C_FactorMatrix)()
    return et.C_KTensor(num_modes, matrices, weights, num_factors, scratch_matrix)
def _validate_params(sptensor, rank, max_iter, stop_tol, mem_lim, outer_iter=1, inner_iter=1):
    """Validate the arguments common to the decomposition entry points.

    Each failed check is logged through the module logger and raised as a
    ``TypeError`` (the exception type is kept for every check so existing
    callers continue to work).

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor object or tensor filename.
    rank : int
        Must be an int and at least 1.
    max_iter : number
        Must be at least 1.
    stop_tol : number
        Must be greater than 0.
    mem_lim : number
        Memory limit in GB; must be greater than 0.
    outer_iter, inner_iter : number, optional
        Must be at least 1.

    Returns
    -------
    int
        Always 1 when every check passes.

    Raises
    ------
    TypeError
        If any check fails.
    """
    def _reject(msg):
        logger.error(msg)
        raise TypeError(msg)

    # Test for None first and by identity: placed after the isinstance check
    # this branch could never run, and ``==`` may dispatch to a custom __eq__.
    if sptensor is None:
        _reject("sptensor is none")
    if not isinstance(sptensor, (spt.SPTensor, str)):
        _reject("sptensor is not of class SPTensor or an SPTensor filename.")
    if not isinstance(rank, int):
        _reject("Rank must be an int.")
    if rank <= 0:
        _reject("Rank of decomposition cannot be less than 1.")
    if max_iter < 1:
        _reject("Maximum iterations cannot be less than 1.")
    if outer_iter < 1:
        _reject("Maximum outer iterations cannot be less than 1.")
    if inner_iter < 1:
        _reject("Maximum inner iterations cannot be less than 1.")
    if stop_tol <= 0.0:
        _reject("Stop tolerance cannot be less than or equal to 0.")
    if mem_lim <= 0:
        _reject("Memory Limit (GB) cannot be less than or equal to 0.")
    return 1
def reconstruct_into(decomp, tensor, comp_ids=None, orig_entries=False):
    """ Reconstructs a CP decomposition 'into' a sparse tensor.

    Reconstructs a tensor from a decomposition ``decomp`` by computing and summing
    selected outer products. If a list of component IDs is given in ``comp_ids``
    then only components in this list will contribute to the reconstruction.
    Only nonzero indices of ``tensor`` are included in the reconstructed tensor.

    Parameters
    ----------
    decomp : CPDecomp
        Decomposition to reconstruct using sum-of-outer-products.
    tensor : SPTensor
        The sparse tensor that contains the nonzero entries to reconstruct into.
    comp_ids : list, optional
        A list of components to reconstruct the SPTensor from. If ``None`` (default),
        all components are used.
    orig_entries : bool, optional
        This parameter controls the value of entries in the reconstructed tensor.
        When ``True``, entries keep the same value as in ``tensor`` regardless
        of the reconstructed value. When ``False`` (default) entries are given
        the value produced by the sum-of-outer-products reconstruction.

    Returns
    -------
    reconstructed_spt : SPTensor
        The sparse tensor produced by reconstructing selected components of
        ``decomp`` into nonzero values of ``tensor``.

    Raises
    ------
    ValueError
        If any argument fails validation.
    """
    def _reject(msg):
        logger.error(msg)
        raise ValueError(msg)

    # Validate decomposition and sparse tensor
    if decomp is None:
        _reject("Parameter 'decomp' must be a CPDecomp object (not 'None')")
    if tensor is None:
        _reject("Parameter 'tensor' must be a SPTensor object (not 'None')")
    # Reject when EITHER argument has the wrong type (an ``and`` between the
    # two negated checks would only reject when both are wrong).
    if not (isinstance(decomp, CPDecomp) and isinstance(tensor, spt.SPTensor)):
        _reject("Parameters 'decomp' and 'tensor' must be, respectively, CPDecomp and SPTensor objects")
    if decomp.order != tensor.order:
        _reject("Parameters 'decomp' and 'tensor' must have the same order")
    if len(decomp.mode_sizes) != len(tensor.mode_sizes):
        _reject("Parameters 'decomp' and 'tensor' must have the same order")
    for mode_id in range(decomp.order):
        if decomp.mode_sizes[mode_id] != tensor.mode_sizes[mode_id]:
            _reject("Parameters 'decomp' and 'tensor' must have identical mode sizes")

    # Validate component list. The converted ids are collected so that
    # one-shot iterables and integer-like values (e.g. "3") work later on.
    selected = None
    if comp_ids is not None:
        selected = []
        for comp_id in comp_ids:
            try:
                comp_index = int(comp_id)
            except (TypeError, ValueError):
                _reject("All items in parameter 'comp_ids' must be integers")
            if comp_index >= len(decomp.weights) or comp_index < 0:
                _reject("All items in parameter 'comp_ids' must be on the interval [0, 'd.rank')")
            selected.append(comp_index)

    # Validate original entries flag
    if (orig_entries != True and orig_entries != False):
        _reject("Flag 'orig_entries' must be 'True' or 'False'")

    # Copy original sparse tensor
    reconstructed_spt = spt.SPTensor(tensor.order, tensor.nnz, tensor.mode_sizes,
                                     tensor.entries.copy())
    if orig_entries == True:
        # The copy already carries the original value of every entry.
        return reconstructed_spt

    # Weight distributes to every dot product calculation for a component.
    # Components that are not selected get an effective weight of zero.
    base_product = np.array(decomp.weights, dtype=np.float64)
    if selected is not None:
        selection_mask = np.zeros(decomp.rank)
        selection_mask[np.asarray(selected, dtype=np.intp)] = 1.0
        base_product = selection_mask * decomp.weights

    # For every index tuple in the original sparse tensor, multiply the
    # weights by that tuple's row of every factor matrix and sum over the
    # components. Rows are processed in bounded chunks so the temporary
    # (chunk, rank) array stays small for large tensors.
    coords = tensor.entries.values[:, :decomp.order].astype("int64")
    n_entries = coords.shape[0]
    reconstructed_values = np.empty(n_entries, dtype=np.float64)
    chunk_rows = 1 << 16
    for start in range(0, n_entries, chunk_rows):
        block = coords[start:start + chunk_rows]
        products = np.empty((block.shape[0], base_product.shape[0]), dtype=np.float64)
        products[:] = base_product
        for mode_id in range(decomp.order):
            products *= decomp.factors[mode_id][block[:, mode_id]]
        reconstructed_values[start:start + chunk_rows] = products.sum(axis=1)

    if n_entries:
        reconstructed_spt.entries.iloc[:, decomp.order] = reconstructed_values
    return reconstructed_spt
def get_fit_per_entry(decomp, tensor, top_k=None):
    """ Calculates fit for each entry of a reconstructed tensor present in the original tensor.

    This function evaluates how well each entry of the tensor reconstructed
    from ``decomp`` matches the entry at the same index in ``tensor``. This
    evaluation is calculated as fit, a floating point number on the interval
    (-infinity, 1.0]. A fit of 1.0 is an exact match between the reconstructed
    and original entry. Output is sorted by fit (ascending); ties are ordered
    by the index columns, first mode first.

    Lower fit values indicate a reconstructed entry is different than the original
    entry. Low fit values may be indicative of anomalous data.

    Parameters
    ----------
    decomp : CPDecomp
        Decomposition used to calculate per-element fit. A sparse tensor is
        reconstructed from ``decomp`` using sum-of-outer-products and the
        results are compared to ``tensor``.
    tensor : SPTensor
        The sparse tensor that is the basis for comparison.
    top_k : int, optional
        The number of elements to return. If ``top_k`` is ``None`` (default),
        fit values for all entries are returned. If ``top_k`` is positive, the
        ``top_k`` lowest fit elements are returned. If ``top_k`` is negative the
        ``abs(top_k)`` highest fit elements are returned. If ``top_k`` is zero
        an empty ``ndarray`` is returned.

    Returns
    -------
    fit_vals : numpy.ndarray
        A ``numpy.ndarray`` of the same shape and format as ``tensor.entries``. The
        first ``tensor.order`` columns of ``fit_vals`` are indices into
        each mode of ``tensor``. The final column of ``fit_vals`` contains the
        entry's fit value.

    Raises
    ------
    ValueError
        If any argument fails validation.
    """
    def _reject(msg):
        logger.error(msg)
        raise ValueError(msg)

    # Validate decomposition and sparse tensor
    if decomp is None:
        _reject("Parameter 'decomp' must be a CPDecomp object (not 'None')")
    if tensor is None:
        _reject("Parameter 'tensor' must be a SPTensor object (not 'None')")
    if not (isinstance(decomp, CPDecomp) and isinstance(tensor, spt.SPTensor)):
        _reject("Parameters 'decomp' and 'tensor' must be, respectively, CPDecomp and SPTensor objects")
    if decomp.order != tensor.order:
        _reject("Parameters 'decomp' and 'tensor' must have the same order")
    if len(decomp.mode_sizes) != len(tensor.mode_sizes):
        _reject("Parameters 'decomp' and 'tensor' must have the same order")
    for mode_id in range(decomp.order):
        if decomp.mode_sizes[mode_id] != tensor.mode_sizes[mode_id]:
            _reject("Parameters 'decomp' and 'tensor' must have identical mode sizes")

    # Validate top_k and normalise it to an int once, so the slices below
    # never see an unconverted value.
    if top_k is not None:
        try:
            top_k = int(top_k)
        except (TypeError, ValueError):
            _reject("Parameter 'top_k' must be integer or None")
        if top_k == 0:
            return np.empty(shape=(0,0))

    # Reconstruct decomposition into original tensor
    reconstructed = reconstruct_into(decomp, tensor)
    value_col = decomp.order

    # Compute difference between original and reconstructed nonzero values
    reconstructed_vals = reconstructed.entries.values[:, value_col]
    orig_vals = tensor.entries.values[:, value_col]
    diff_vals = orig_vals - reconstructed_vals

    # Fit of original value in reconstruction is 1 - |diff / orig|
    # NOTE: Negative values indicate bad fit, and larger magnitude is worse
    #       e.g., -1 is bad, -100 is worse
    fit_orig = 1.0 - np.absolute(diff_vals / orig_vals)

    # Copy original entries (as float64) and replace each value with its fit
    fit_vals = np.array(tensor.entries.values, dtype=np.float64)
    fit_vals[:, value_col] = fit_orig

    # Sort entries by fit (ascending). Smallest fit values are anomalies.
    # np.lexsort treats the LAST key as primary: fit first, then mode 0,
    # mode 1, ... to break ties.
    sort_keys = [fit_vals[:, col] for col in reversed(range(value_col))]
    sort_keys.append(fit_vals[:, value_col])
    fit_vals = fit_vals[np.lexsort(sort_keys)]

    # Return the sorted ndarray
    if top_k is None:
        return fit_vals
    if top_k > 0:
        return fit_vals[:top_k, :]
    return fit_vals[top_k:, :]
def are_close(decomp0, decomp1, rtol=1e-5, atol=1e-08):
    """Checks if weights and factor matrices are close according to tolerance.

    Uses the following formula element-wise on weights and factor matrices:

        abs(d0 - d1) <= (atol + rtol * abs(d1))

    If this inequality is ``True`` for all elements then ``True`` is returned.
    Two ``None`` arguments are considered close; a single ``None`` is not.
    Decompositions whose rank, order or array shapes differ are never close.

    Parameters
    ----------
    decomp0, decomp1 : CPDecomp
        Decompositions to compare for closeness
    rtol : float, optional
        Relative tolerance
    atol : float, optional
        Absolute tolerance

    Returns
    -------
    are_close : bool
        True if the decomposition weights and factor matrices are element-wise
        equal within a tolerance.
    """
    # Presence check: both missing counts as close, exactly one does not.
    if decomp0 is None or decomp1 is None:
        return decomp0 is None and decomp1 is None

    # Structural compatibility must hold before any numeric comparison.
    same_layout = (decomp0.rank == decomp1.rank
                   and decomp0.order == decomp1.order
                   and decomp0.weights.shape == decomp1.weights.shape
                   and len(decomp0.factors) == len(decomp1.factors))
    if same_layout:
        same_layout = all(left.shape == right.shape
                          for left, right in zip(decomp0.factors, decomp1.factors))
    if not same_layout:
        return False

    if not np.allclose(decomp0.weights, decomp1.weights, rtol=rtol, atol=atol):
        return False
    for left, right in zip(decomp0.factors, decomp1.factors):
        if not np.allclose(left, right, rtol=rtol, atol=atol):
            return False
    return True
def read_cp_decomp_dir(decomp_dir):
    """Reads a CP decomposition from the filesystem.

    Parameters
    ----------
    decomp_dir : str
        Path to the directory containing CP decomposition.

    Returns
    -------
    decomposition : CPDecomp
        The decomposition in ``decomp_dir``.

    Raises
    ------
    TypeError
        If ``decomp_dir`` is not a string.
    ValueError
        If ``decomp_dir`` is an empty string.
    IOError
        If the directory holds no ``decomp_mode_<i>.txt`` files.
    Exception
        If the decomposition cannot be read or is not well formed.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    """
    if not isinstance(decomp_dir, str):
        msg = "decomp_dir is not of type String."
        logger.error(msg)
        raise TypeError(msg)
    if len(decomp_dir) < 1:
        msg = "decomp_dir is not a valid directory."
        logger.error(msg)
        raise ValueError(msg)

    if decomp_dir[-1] != '/':
        decomp_dir += '/'

    # List the directory once and reuse the listing for every lookup below.
    dir_listing = os.listdir(decomp_dir)

    def _mode_files(prefix):
        """Return '<prefix>_<i>.txt' filenames ordered by the integer <i>."""
        pattern = re.compile(prefix + r'_([0-9]+)\.txt')
        hits = [m for m in map(pattern.fullmatch, dir_listing) if m is not None]
        # Numeric ordering: a plain string sort would place mode 10 before
        # mode 2 and scramble the factor matrices of high-order tensors.
        hits.sort(key=lambda m: (int(m.group(1)), m.group(0)))
        return [m.group(0) for m in hits]

    # Initialize decomposition object and build list of decomp_mode_<i>.txt files
    decomp = CPDecomp()
    decomp_mode_fns = _mode_files('decomp_mode')
    if len(decomp_mode_fns) == 0:
        msg = 'No decomp_mode_<i>.txt (factor matrices) files found in {}.'.format(decomp_dir)
        logger.error(msg)
        raise IOError(msg)

    # Assign decomposition specific values
    decomp.order = len(decomp_mode_fns)
    decomp.weights, decomp.rank = dio.read_weights(os.path.join(decomp_dir, 'weights.txt'))
    decomp.factors = [dio.read_factor_matrix(os.path.join(decomp_dir, fn))[0]
                      for fn in decomp_mode_fns]
    decomp.mode_sizes = [factor.shape[0] for factor in decomp.factors]
    if 'streaming.txt' in dir_listing:
        decomp.streaming_dataless_fit = dio.read_streaming(
            os.path.join(decomp_dir, 'streaming.txt'))

    # Assign sptensor specific values
    mode_map_fns = _mode_files('map_mode')
    if 'tensor_data.txt' in dir_listing:
        decomp.sptensor = spt.read_sptensor(decomp_dir)
        decomp.labels = decomp.sptensor.labels
        decomp.mode_names = decomp.sptensor.mode_names
    elif mode_map_fns:
        decomp.mode_names, decomp.labels = sio.read_many_labels(decomp_dir, mode_map_fns)
    else:
        # No tensor and no label maps: synthesize placeholder names/labels.
        decomp.mode_names = ['mode_' + str(mode_id)
                             for mode_id in range(len(decomp.mode_sizes))]
        decomp.labels = [['label_{}-{}'.format(str(i), str(x)) for x in range(mode_size)]
                         for i, mode_size in enumerate(decomp.mode_sizes)]

    # Read backtracking data if available
    decomp.cpd_backtrack = dio.read_decomp_backtrack(decomp_dir)
    return decomp
def write_cp_decomp_dir(decomp_dir, decomp, write_tensor=False):
    """Writes a CP decomposition to the filesystem.

    Parameters
    ----------
    decomp_dir : str
        Path to the destination directory of the CP decomposition. This
        directory will be created if nonexistent. If it exists, previously
        written decomposition files in it (and, when ``write_tensor`` is set,
        tensor and label-map files) **will be erased and overwritten**.
    decomp : CPDecomp
        The decomposition to be written.
    write_tensor : bool
        Write the tensor_data.txt and map_mode_<x>.txt files.

    Raises
    ------
    Exception
        If the decomposition could not be written.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    """
    if not os.path.exists(decomp_dir):
        # exist_ok guards against another process creating it in between.
        os.makedirs(decomp_dir, exist_ok=True)
    else:
        # Remove stale output so files from an earlier, larger decomposition
        # cannot linger next to the new ones.
        stale_patterns = [r'decomp_mode_[0-9]+\.txt', r'initial_guess_[0-9]+\.txt',
                          r'streaming\.txt', r'weights\.txt']
        if write_tensor:
            stale_patterns += [r'map_mode_[0-9]+\.txt', r'tensor_data\.txt']
        fn_pattern = re.compile('|'.join(stale_patterns))
        for fn in os.listdir(decomp_dir):
            if fn_pattern.fullmatch(fn) is not None:
                os.remove(os.path.join(decomp_dir, fn))

    # Scores
    for mode_id, factors in enumerate(decomp.factors):
        dio.write_factor_matrix(decomp_dir, mode_id, factors)

    # Weights
    dio.write_weights(decomp_dir, decomp.weights)

    # Streaming dataless fit
    if decomp.streaming_dataless_fit is not None:
        dio.write_streaming(decomp_dir, decomp.streaming_dataless_fit)

    # Initial Score Guesses
    if decomp.factors_init is not None:
        for mode_id, factors in enumerate(decomp.factors_init):
            dio.write_factor_matrix(decomp_dir, mode_id, factors, 'initial_guess')

    # Labels
    if write_tensor:
        for mode_id, labels in enumerate(decomp.labels):
            sio.write_labels(decomp_dir, mode_id, decomp.mode_names[mode_id], labels)

    # Tensor
    if decomp.sptensor is not None and write_tensor:
        sio.write_sptensor_entries(decomp_dir, decomp.sptensor.mode_sizes,
                                   decomp.sptensor.entries.values)

    # Backtracking information. getattr: the attribute may never have been
    # set on decompositions for which no backtrack was requested.
    cpd_backtrack = getattr(decomp, 'cpd_backtrack', None)
    backtrack_path = os.path.join(decomp_dir, 'cpd_backtrack.txt')
    if cpd_backtrack is not None:
        dio.write_decomp_backtrack(decomp_dir, cpd_backtrack)
    elif os.path.isfile(backtrack_path):
        os.remove(backtrack_path)
def _filter_cartesian_product(index_lists, valid_indices):
"""
Computes the component backtrack info from a component of a decompositon
to the tensor by finding the intersection of valid_indices with the Cartesian
product of the lists in index_lists. index_lists can be of arbitary
length.
Parameters
----------
index_lists : list of lists
A list containing one list for each decomposition mode. Each list
contains indices that are hot in the component in question.
valid_indices : iterable of tuples
All index tuples appearing in the tensor
Returns
-------
product : list of tuples
All tuples in Cartesian product of index_lists. These correspond to
the tensor indices that are hot in the component.
"""
index_list_lengths = list(map(len, index_lists))
# if elements in cart. prod. were to reside in a multi-d array, the ith
# element of flat_lengths is the number of elements in a slice where the
# first i indices of that multi-d array are specified
flat_lengths = [functools.reduce(lambda x,y:x*y, index_list_lengths[i:])
for i in range(len(index_lists))]
n_entries_in_cart_prod = flat_lengths.pop(0)
product = []
for i in range(n_entries_in_cart_prod):
idx = i
# compute which indices should be chosen in constructing the current
# element of the cart. prod.
element_indices = ()
for length in flat_lengths:
element_indices += (idx // length,)
idx %= length
element_indices += (idx,)
# construct the element given the indices that should be chosen
element = tuple(index_lists[j][element_indices[j]] for j in range(len(index_lists)))
# only store the element if it is specified as valid
if element in valid_indices:
product.append(element)
return product
def _forwardtrack_decomp_component(decomp, comp_id):
    """
    Computes the component backtrack from a component of a decomposition to the
    tensor by looking "forward" from the tensor to the decomposition to
    check which indices appear in the component. This method is faster than
    _filter_cartesian_product if the component has more hot indices than
    the original tensor.

    An index is hot in a mode when its score in the component is strictly
    greater than ``REL_CHOP_POINT`` times the largest score of that mode,
    the same rule ``_compute_cpd_backtrack`` uses to select hot indices.

    Parameters
    ----------
    decomp : CPDecomp
        Decomposition object in question.
    comp_id : int
        The component whose backtrack info is being calculated.

    Returns
    -------
    forward_track : list of tuples
        All indices (as tuples of int) in the tensor that are hot in the
        component, in tensor order.
    """
    coords = decomp.sptensor.entries.values[:, :-1].astype(np.int64)
    if coords.shape[0] == 0:
        return []

    # An entry survives only if it is hot in every mode.
    is_hot = np.ones(coords.shape[0], dtype=bool)
    for mode_id, factor_matrix in zip(range(coords.shape[1]), decomp.factors):
        mode_vec = factor_matrix[:, comp_id]
        # The threshold depends only on the mode, so compute it once per mode
        # rather than once per tensor entry.
        threshold = REL_CHOP_POINT * mode_vec.max()
        is_hot &= mode_vec[coords[:, mode_id]] > threshold
    return [tuple(row) for row in coords[is_hot].tolist()]
def _compute_cpd_backtrack(decomp):
    """
    Computes the backtrack info from each component of a decomposition to
    the original tensor. Sets the ``cpd_backtrack`` field in the CPDecomp
    object.

    For every component, the hot indices of each mode are the rows whose
    factor value is strictly greater than ``REL_CHOP_POINT`` times the
    maximum of that mode's component column. Depending on whether the
    Cartesian product of the hot indices is larger than the number of tensor
    entries, the matching tensor entries are found either with
    _forwardtrack_decomp_component or with _filter_cartesian_product.

    Parameters
    ----------
    decomp : CPDecomp
        Decomposition object in question. Its ``sptensor``, ``factors``,
        ``rank`` and ``order`` attributes are read and its ``cpd_backtrack``
        attribute is overwritten.

    Returns
    -------
    cpd_backtrack : list of list of int
        For each component, the line numbers (row positions in
        ``decomp.sptensor.entries``) of the tensor entries that are hot in
        that component.
    """
    entries = decomp.sptensor.entries
    indices = [tuple(map(int, row)) for row in entries.values[:, :-1]]
    # save a map from indices to tensor line numbers
    line_nums = {index: line_num for line_num, index in enumerate(indices)}
    n_tensor_entries = len(entries)

    cpd_backtrack = []
    # for each comp keep track of hot indices in each mode
    for comp_id in range(decomp.rank):
        non_zero_indices = []
        for mode_id in range(decomp.order):
            mode_vec = decomp.factors[mode_id][:, comp_id]
            mode_max = mode_vec.max()
            hot_mode_indices = list(
                np.argwhere(mode_vec > REL_CHOP_POINT * mode_max).ravel())
            non_zero_indices.append(hot_mode_indices)

        # Size of the Cartesian product of the hot indices. It is multiplied
        # out with Python ints on purpose: a fixed-width numpy product can
        # wrap around for high-order tensors and would then pick the
        # (enormous) Cartesian-product path by mistake.
        num_comp_entries = functools.reduce(
            lambda total, hot: total * len(hot), non_zero_indices, 1)

        # check if there are more hot indices in the component than in tensor
        if num_comp_entries > n_tensor_entries:
            indices_to_track = _forwardtrack_decomp_component(decomp, comp_id)
        else:
            indices_to_track = _filter_cartesian_product(non_zero_indices, line_nums)

        # convert from indices to line numbers
        cpd_backtrack.append([line_nums[index] for index in indices_to_track])

    decomp.cpd_backtrack = cpd_backtrack
    return cpd_backtrack
def cp_als(sptensor, rank, out_dir=None, seed=CP_ALS_SEED, max_iter=CP_ALS_MAX_ITER,
           stop_tol=CP_ALS_STOP_TOL, mem_limit_gb=CP_ALS_MEM_LIMIT_GB,
           output_init_guess=CP_ALS_OUTPUT_INIT_GUESS,
           verbose=CP_ALS_VERBOSE, gen_backtrack=CP_ALS_BACKTRACK):
    """Performs a CP decomposition using the alternating least squares (ALS) method. [1]_ [2]_

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor to decompose. If sptensor is of type str, it should contain a
        path to a directory containing an SPTensor or directly to a
        tensor_data.txt file.
    rank : int
        Rank of the decomposition.
    out_dir : str
        Path to directory where decomposition results will be written. If not
        set, the decomposition will not be written to disk. The directory is
        created if it does not exist.

    Returns
    -------
    decomp : CPDecomp
        Results of the decomposition.

    Other Parameters
    ----------------
    seed : int, optional
        Random seed used to initialize decomposition.
        Default value is ``0``.
    max_iter : int, optional
        Maximum iterations before stopping decomposition.
        Default value is ``100``.
    stop_tol : float, optional
        Minimum change in fit required for convergence.
        Default value is ``1.0e-5``.
    mem_limit_gb : int, optional
        Maximum memory used during decomposition, in GiB.
        Default value is ``2``.
    output_init_guess : bool, optional
        Write the initial guess of decomposition to ``out_dir/input_<mode_id>.txt``
        Default value is ``False``.
    verbose : bool, optional
        Output verbose decomposition progress to ``out_dir/output.txt``
        Default value is ``False``.
    gen_backtrack : bool, optional
        Output component-wise backtracking information to ``out_dir/cpd_backtrack.txt``
        Default value is ``False``.

    Raises
    ------
    TypeError
        If the parameters fail validation.
    IOError
        If sptensor is a str that is neither an existing directory nor an
        existing file.
    SystemExit
        With code ``MALLOC_ERROR`` if the native routine reports an
        allocation failure.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    ensign.sptensor.SPTensor : Sparse tensor class

    References
    ----------
    .. [1] Kolda, T., Bader, B., "Tensor Decompositions and Applications,"
       SIAM Review, 51(3), pp. 455-500, 2009.
    .. [2] Baskaran, M., Meister, B., Vasilache, N., Lethin, R., "Efficient and
       Scalable Computations with Sparse Tensors," IEEE HPEC, 2012.
    """
    if not _validate_params(sptensor, rank, max_iter, stop_tol, mem_limit_gb):
        msg = "Invalid Parameters"
        logger.error(msg)
        raise TypeError(msg)

    # Accept a path in place of an in-memory tensor.
    if isinstance(sptensor, str):
        if os.path.isdir(sptensor):
            sptensor = spt.read_sptensor(sptensor)
        elif os.path.isfile(sptensor):
            # A bare file name has an empty dirname; fall back to the current
            # directory so that appending '/' does not point at the
            # filesystem root.
            tensor_dir = os.path.dirname(sptensor) or os.curdir
            sptensor = spt.read_sptensor(tensor_dir + '/', os.path.basename(sptensor))
        else:
            msg = '{} is not a valid sptensor file or directory.'.format(sptensor)
            logger.error(msg)
            raise IOError(msg)

    # Build the ctypes structures handed to the native library.
    mem_limit_bytes = mem_limit_gb*(1024**3)
    tensor = _get_sparse_co_tensor(sptensor)
    options = _get_ctype_decomp_options_obj(
        seed=seed,
        cp_als_max_iter=max_iter,
        cp_als_stop_tol=stop_tol,
        cp_als_memory_limit=mem_limit_bytes,
        verbose=verbose,
        memory_limit=mem_limit_bytes,
        output_init_guess=output_init_guess
    )
    ktensor = _get_k_tensor(sptensor.order, rank)
    metrics = _get_decomp_metrics()

    native_cp_als = API.cp_als
    native_cp_als.argtypes = [POINTER(et.C_SparseCoTensor), c_int,
                              POINTER(et.C_DecompOptions), POINTER(et.C_KTensor),
                              POINTER(et.C_DecompMetrics)]
    ret = native_cp_als(pointer(tensor), rank, pointer(options), pointer(ktensor),
                        pointer(metrics))
    if ret == MALLOC_ERROR:
        logger.error('MALLOC_ERROR. Try increasing the memory limit by using the mem_limit_gb argument.')
        # sys.exit rather than the site-provided exit(), which is intended
        # for interactive use and is not guaranteed to be defined.
        sys.exit(MALLOC_ERROR)

    py_decomp = _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options)
    if output_init_guess:
        _get_init_guess(py_decomp, options)
    if gen_backtrack:
        _compute_cpd_backtrack(py_decomp)
    else:
        py_decomp.cpd_backtrack = None

    if out_dir:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(out_dir, exist_ok=True)
        write_cp_decomp_dir(out_dir, py_decomp)
    return py_decomp
def cp_als_nn(sptensor, rank, out_dir=None, seed=CP_ALS_NN_SEED, max_iter=CP_ALS_NN_MAX_ITER,
              stop_tol=CP_ALS_NN_STOP_TOL, mem_limit_gb=CP_ALS_NN_MEM_LIMIT_GB,
              output_init_guess=CP_ALS_NN_OUTPUT_INIT_GUESS,
              verbose=CP_ALS_NN_VERBOSE, gen_backtrack=CP_ALS_NN_BACKTRACK):
    """Performs a nonnegative CP decomposition using the alternating least squares (ALS) method.

    Nonnegativity constraints are on output factor matrices. Input tensor
    entries must be nonnegative. [1]_ [2]_

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor to decompose. If sptensor is of type str, it should contain a
        path to a directory containing an SPTensor or directly to a
        tensor_data.txt file.
    rank : int
        Rank of the decomposition.
    out_dir : str
        Path to directory where decomposition results will be written. If not
        set, the decomposition will not be written to disk. The directory is
        created if it does not exist.

    Returns
    -------
    decomp : CPDecomp
        Results of the decomposition.

    Other Parameters
    ----------------
    seed : int, optional
        Random seed used to initialize decomposition.
        Default value is ``0``.
    max_iter : int, optional
        Maximum iterations before stopping decomposition.
        Default value is ``100``.
    stop_tol : float, optional
        Minimum change in fit required for convergence.
        Default value is ``1.0e-5``.
    mem_limit_gb : int, optional
        Maximum memory used during decomposition, in GiB.
        Default value is ``2``.
    output_init_guess : bool, optional
        Write the initial guess of decomposition to ``out_dir/input_<mode_id>.txt``
        Default value is ``False``.
    verbose : bool, optional
        Output verbose decomposition progress to ``out_dir/output.txt``
        Default value is ``False``.
    gen_backtrack : bool, optional
        Output component-wise backtracking information to ``out_dir/cpd_backtrack.txt``
        Default value is ``False``.

    Raises
    ------
    TypeError
        If the parameters fail validation.
    IOError
        If sptensor is a str that is neither an existing directory nor an
        existing file.
    SystemExit
        With code ``MALLOC_ERROR`` if the native routine reports an
        allocation failure.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    ensign.sptensor.SPTensor : Sparse tensor class

    References
    ----------
    .. [1] Kolda, T., Bader, B., "Tensor Decompositions and Applications,"
       SIAM Review, 51(3), pp. 455-500, 2009.
    .. [2] Baskaran, M., Meister, B., Vasilache, N., Lethin, R., "Efficient and
       Scalable Computations with Sparse Tensors," IEEE HPEC, 2012.
    """
    if not _validate_params(sptensor, rank, max_iter, stop_tol, mem_limit_gb):
        msg = "Invalid Parameters"
        logger.error(msg)
        raise TypeError(msg)

    # Accept a path in place of an in-memory tensor.
    if isinstance(sptensor, str):
        if os.path.isdir(sptensor):
            sptensor = spt.read_sptensor(sptensor)
        elif os.path.isfile(sptensor):
            # A bare file name has an empty dirname; fall back to the current
            # directory so that appending '/' does not point at the
            # filesystem root.
            tensor_dir = os.path.dirname(sptensor) or os.curdir
            sptensor = spt.read_sptensor(tensor_dir + '/', os.path.basename(sptensor))
        else:
            msg = '{} is not a valid sptensor file or directory.'.format(sptensor)
            logger.error(msg)
            raise IOError(msg)

    # Build the ctypes structures handed to the native library.
    mem_limit_bytes = mem_limit_gb*(1024**3)
    tensor = _get_sparse_co_tensor(sptensor)
    # NOTE(review): the plain cp_als_* keys are set alongside the cp_als_nn_*
    # ones, and the memory limit is only passed as cp_als_memory_limit /
    # memory_limit. Presumably the native routine shares those fields with
    # cp_als -- confirm against _get_ctype_decomp_options_obj.
    options = _get_ctype_decomp_options_obj(
        seed=seed,
        cp_als_max_iter=max_iter,
        cp_als_stop_tol=stop_tol,
        cp_als_memory_limit=mem_limit_bytes,
        verbose=verbose,
        cp_als_nn_max_iter=max_iter,
        cp_als_nn_stop_tol=stop_tol,
        memory_limit=mem_limit_bytes,
        output_init_guess=output_init_guess
    )
    ktensor = _get_k_tensor(sptensor.order, rank)
    metrics = _get_decomp_metrics()

    native_cp_als_nn = API.cp_als_nn
    native_cp_als_nn.argtypes = [POINTER(et.C_SparseCoTensor), c_int,
                                 POINTER(et.C_DecompOptions), POINTER(et.C_KTensor),
                                 POINTER(et.C_DecompMetrics)]
    ret = native_cp_als_nn(pointer(tensor), rank, pointer(options), pointer(ktensor),
                           pointer(metrics))
    if ret == MALLOC_ERROR:
        logger.error('MALLOC_ERROR. Try increasing the memory limit by using the mem_limit_gb argument.')
        # sys.exit rather than the site-provided exit(), which is intended
        # for interactive use and is not guaranteed to be defined.
        sys.exit(MALLOC_ERROR)

    py_decomp = _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options)
    if output_init_guess:
        _get_init_guess(py_decomp, options)
    if gen_backtrack:
        _compute_cpd_backtrack(py_decomp)
    else:
        py_decomp.cpd_backtrack = None

    if out_dir:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(out_dir, exist_ok=True)
        write_cp_decomp_dir(out_dir, py_decomp)
    return py_decomp
def cp_apr(sptensor, rank, out_dir=None, seed=CP_APR_SEED,
           max_outer_iter=CP_APR_MAX_OUTER_ITER,
           max_inner_iter=CP_APR_MAX_INNER_ITER,
           stop_tol=CP_APR_STOP_TOL, mem_limit_gb=CP_APR_MEM_LIMIT_GB,
           output_init_guess=CP_APR_OUTPUT_INIT_GUESS, verbose=CP_APR_VERBOSE,
           gen_backtrack=CP_APR_BACKTRACK):
    """Performs a CP decomposition using the alternating Poisson regression (APR) method. [1]_ [2]_

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor to decompose. If sptensor is of type str, it should contain a
        path to a directory containing an SPTensor or directly to a
        tensor_data.txt file.
    rank : int
        Rank of the decomposition.
    out_dir : str
        Path to directory where decomposition results will be written. If not
        set, the decomposition will not be written to disk. The directory is
        created if it does not exist.

    Returns
    -------
    decomp : CPDecomp
        Results of the decomposition.

    Other Parameters
    ----------------
    seed : int, optional
        Random seed used to initialize decomposition.
        Default value is ``0``.
    max_outer_iter : int, optional
        Maximum iterations before stopping decomposition.
        Default value is ``200``.
    max_inner_iter : int, optional
        Maximum inner iterations per outer iteration.
        Default value is ``10``.
    stop_tol : float, optional
        Minimum change in KKT violation required for convergence.
        Default value is ``1.0e-4``.
    mem_limit_gb : int, optional
        Maximum memory used during decomposition, in GiB.
        Default value is ``2``.
    output_init_guess : bool, optional
        Write the initial guess of decomposition to ``out_dir/input_<mode_id>.txt``
        Default value is ``False``.
    verbose : bool, optional
        Output verbose decomposition progress to ``out_dir/output.txt``
        Default value is ``False``.
    gen_backtrack : bool, optional
        Output component-wise backtracking information to ``out_dir/cpd_backtrack.txt``
        Default value is ``False``.

    Raises
    ------
    TypeError
        If the parameters fail validation.
    IOError
        If sptensor is a str that is neither an existing directory nor an
        existing file.
    SystemExit
        With code ``MALLOC_ERROR`` if the native routine reports an
        allocation failure.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    ensign.sptensor.SPTensor : Sparse tensor class

    References
    ----------
    .. [1] Chi, E., Kolda, T., "On Tensors, Sparsity, and Nonnegative
       Factorizations," SIAM Journal on Matrix Analysis and Applications
       33.4, pp. 1272-1299, 2012.
    .. [2] Baskaran, M., Meister, B., Vasilache, N., Lethin, R., "Efficient and
       Scalable Computations with Sparse Tensors," IEEE HPEC, 2012.
    """
    # The literal 1 fills the third positional slot of _validate_params; the
    # APR iteration limits are passed through the trailing arguments.
    if not _validate_params(
        sptensor, rank, 1, stop_tol,
        mem_limit_gb, max_outer_iter, max_inner_iter
    ):
        msg = "Invalid Parameters"
        logger.error(msg)
        # Carry the message on the exception as well as in the log.
        raise TypeError(msg)

    # Accept a path in place of an in-memory tensor.
    if isinstance(sptensor, str):
        if os.path.isdir(sptensor):
            sptensor = spt.read_sptensor(sptensor)
        elif os.path.isfile(sptensor):
            # A bare file name has an empty dirname; fall back to the current
            # directory so that appending '/' does not point at the
            # filesystem root.
            tensor_dir = os.path.dirname(sptensor) or os.curdir
            sptensor = spt.read_sptensor(
                tensor_dir + '/',
                os.path.basename(sptensor)
            )
        else:
            msg = '{} is not a valid sptensor file or directory.'.format(sptensor)
            logger.error(msg)
            raise IOError(msg)

    # Build the ctypes structures handed to the native library.
    mem_limit_bytes = mem_limit_gb*(1024**3)
    tensor = _get_sparse_co_tensor(sptensor)
    options = _get_ctype_decomp_options_obj(
        seed=seed,
        cp_apr_max_outer_iter=max_outer_iter,
        cp_apr_stop_tol=stop_tol,
        cp_apr_max_inner_iter=max_inner_iter,
        cp_apr_memory_limit=mem_limit_bytes,
        verbose=verbose,
        memory_limit=mem_limit_bytes,
        output_init_guess=output_init_guess
    )
    ktensor = _get_k_tensor(sptensor.order, rank)
    metrics = _get_decomp_metrics()

    native_cp_apr = API.cp_apr
    native_cp_apr.argtypes = [POINTER(et.C_SparseCoTensor), c_int,
                              POINTER(et.C_DecompOptions), POINTER(et.C_KTensor),
                              POINTER(et.C_DecompMetrics)]
    ret = native_cp_apr(pointer(tensor), rank, pointer(options), pointer(ktensor),
                        pointer(metrics))
    if ret == MALLOC_ERROR:
        logger.error('MALLOC_ERROR. Try increasing the memory limit by using the mem_limit_gb argument.')
        # sys.exit rather than the site-provided exit(), which is intended
        # for interactive use and is not guaranteed to be defined.
        sys.exit(MALLOC_ERROR)

    py_decomp = _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options)
    if output_init_guess:
        _get_init_guess(py_decomp, options)
    if gen_backtrack:
        _compute_cpd_backtrack(py_decomp)
    else:
        py_decomp.cpd_backtrack = None

    if out_dir:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(out_dir, exist_ok=True)
        write_cp_decomp_dir(out_dir, py_decomp)
    return py_decomp
def cp_apr_pdnr(sptensor, rank, out_dir=None, seed=CP_APR_SEED,
                max_outer_iter=CP_APR_PDNR_MAX_OUTER_ITER,
                max_inner_iter=CP_APR_PDNR_MAX_INNER_ITER,
                stop_tol=CP_APR_PDNR_STOP_TOL, mem_limit_gb=CP_APR_PDNR_MEM_LIMIT_GB,
                output_init_guess=CP_APR_PDNR_OUTPUT_INIT_GUESS,
                load_balance=CP_APR_PDNR_LOAD_BALANCE,
                verbose=CP_APR_PDNR_VERBOSE, gen_backtrack=CP_APR_PDNR_BACKTRACK):
    """Performs a CP decomposition using alternating Poisson regression (APR) with projected damped Newton row (PDNR) subproblem. [1]_

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor to decompose. If sptensor is of type str, it should contain a
        path to a directory containing an SPTensor or directly to a
        tensor_data.txt file.
    rank : int
        Rank of the decomposition.
    out_dir : str
        Path to directory where decomposition results will be written. If not
        set, the decomposition will not be written to disk. The directory is
        created if it does not exist.

    Returns
    -------
    decomp : CPDecomp
        Results of the decomposition.

    Other Parameters
    ----------------
    seed : int, optional
        Random seed used to initialize decomposition.
        Default value is ``0``.
    max_outer_iter : int, optional
        Maximum iterations before stopping decomposition.
        Default value is ``200``.
    max_inner_iter : int, optional
        Maximum inner iterations per outer iteration.
        Default value is ``10``.
    stop_tol : float, optional
        Minimum change in KKT violation required for convergence.
        Default value is ``1.0e-4``.
    mem_limit_gb : int, optional
        Maximum memory used during decomposition, in GiB.
        Default value is ``2``.
    output_init_guess : bool, optional
        Write the initial guess of decomposition to ``out_dir/input_<mode_id>.txt``
        Default value is ``False``.
    load_balance : int, optional
        The load balancing technique to use for parallel execution.
        ``ensign.constants.NO_LOAD_BALANCE`` (default) is no load balancing,
        ``ensign.constants.LOAD_BALANCE`` is standard load balancing, and
        ``ensign.constants.LOAD_BALANCE_UNSORTED`` is unsorted load balancing.
    verbose : bool, optional
        Output verbose decomposition progress to ``out_dir/output.txt``
        Default value is ``False``.
    gen_backtrack : bool, optional
        Output component-wise backtracking information to ``out_dir/cpd_backtrack.txt``
        Default value is ``False``.

    Raises
    ------
    TypeError
        If the parameters fail validation.
    IOError
        If sptensor is a str that is neither an existing directory nor an
        existing file.
    SystemExit
        With code ``MALLOC_ERROR`` if the native routine reports an
        allocation failure.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    ensign.sptensor.SPTensor : Sparse tensor class

    References
    ----------
    .. [1] Hansen, S., Plantenga, T., Kolda, T., "Newton-based Optimization for
       Kullback-Leibler Nonnegative Tensor Factorizations," Optimization
       Methods and Software 30(5), pp. 1002-1029, 2015.
    """
    # NOTE(review): the seed default is CP_APR_SEED, whereas every other
    # default here is a CP_APR_PDNR_* constant -- confirm this is intended.

    # The literal 1 fills the third positional slot of _validate_params; the
    # APR iteration limits are passed through the trailing arguments.
    if not _validate_params(sptensor, rank, 1, stop_tol, mem_limit_gb, max_outer_iter, max_inner_iter):
        msg = 'Invalid Parameters'
        logger.error(msg)
        raise TypeError(msg)

    # Accept a path in place of an in-memory tensor.
    if isinstance(sptensor, str):
        if os.path.isdir(sptensor):
            sptensor = spt.read_sptensor(sptensor)
        elif os.path.isfile(sptensor):
            # A bare file name has an empty dirname; fall back to the current
            # directory so that appending '/' does not point at the
            # filesystem root.
            tensor_dir = os.path.dirname(sptensor) or os.curdir
            sptensor = spt.read_sptensor(tensor_dir + '/', os.path.basename(sptensor))
        else:
            msg = '{} is not a valid sptensor file or directory.'.format(sptensor)
            # Log before raising, as the other decomposition drivers do.
            logger.error(msg)
            raise IOError(msg)

    # Build the ctypes structures handed to the native library.
    mem_limit_bytes = mem_limit_gb*(1024**3)
    tensor = _get_sparse_co_tensor(sptensor)
    # NOTE(review): load_balance is accepted and documented but is not
    # forwarded to the options object here, so it currently has no effect in
    # this function -- confirm the expected option key and wire it through.
    options = _get_ctype_decomp_options_obj(
        seed=seed,
        cp_apr_pdnr_max_outer_iter=max_outer_iter,
        cp_apr_pdnr_stop_tol=stop_tol,
        cp_apr_pdnr_max_inner_iter=max_inner_iter,
        cp_apr_pdnr_memory_limit=mem_limit_bytes,
        verbose=verbose,
        memory_limit=mem_limit_bytes,
        output_init_guess=output_init_guess
    )
    ktensor = _get_k_tensor(sptensor.order, rank)
    metrics = _get_decomp_metrics()

    native_cp_apr_pdnr = API.cp_apr_pdnr
    native_cp_apr_pdnr.argtypes = [POINTER(et.C_SparseCoTensor), c_int,
                                   POINTER(et.C_DecompOptions), POINTER(et.C_KTensor),
                                   POINTER(et.C_DecompMetrics)]
    ret = native_cp_apr_pdnr(pointer(tensor), rank, pointer(options), pointer(ktensor),
                             pointer(metrics))
    if ret == MALLOC_ERROR:
        logger.error('MALLOC_ERROR. Try increasing the memory limit by using the mem_limit_gb argument.')
        # sys.exit rather than the site-provided exit(), which is intended
        # for interactive use and is not guaranteed to be defined.
        sys.exit(MALLOC_ERROR)

    py_decomp = _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options)
    if output_init_guess:
        _get_init_guess(py_decomp, options)
    if gen_backtrack:
        _compute_cpd_backtrack(py_decomp)
    else:
        py_decomp.cpd_backtrack = None

    if out_dir:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(out_dir, exist_ok=True)
        write_cp_decomp_dir(out_dir, py_decomp)
    return py_decomp
def cp_apr_pqnr(sptensor, rank, out_dir=None, seed=CP_APR_PQNR_SEED,
                max_outer_iter=CP_APR_PQNR_MAX_OUTER_ITER,
                max_inner_iter=CP_APR_PQNR_MAX_INNER_ITER,
                stop_tol=CP_APR_PQNR_STOP_TOL, mem_limit_gb=CP_APR_PQNR_MEM_LIMIT_GB,
                output_init_guess=CP_APR_PQNR_OUTPUT_INIT_GUESS,
                load_balance=CP_APR_PQNR_LOAD_BALANCE,
                verbose=CP_APR_PQNR_VERBOSE, gen_backtrack=CP_APR_PQNR_BACKTRACK):
    """Performs a CP decomposition using alternating Poisson regression (APR) with projected quasi-Newton row (PQNR) subproblem. [1]_

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor to decompose. If sptensor is of type str, it should contain a
        path to a directory containing an SPTensor or directly to a
        tensor_data.txt file.
    rank : int
        Rank of the decomposition.
    out_dir : str
        Path to directory where decomposition results will be written. If not
        set, the decomposition will not be written to disk. The directory is
        created if it does not exist.

    Returns
    -------
    decomp : CPDecomp
        Results of the decomposition.

    Other Parameters
    ----------------
    seed : int, optional
        Random seed used to initialize decomposition.
        Default value is ``0``.
    max_outer_iter : int, optional
        Maximum iterations before stopping decomposition.
        Default value is ``200``.
    max_inner_iter : int, optional
        Maximum inner iterations per outer iteration.
        Default value is ``10``.
    stop_tol : float, optional
        Minimum change in KKT violation required for convergence.
        Default value is ``1.0e-4``.
    mem_limit_gb : int, optional
        Maximum memory used during decomposition, in GiB.
        Default value is ``2``.
    output_init_guess : bool, optional
        Write the initial guess of decomposition to ``out_dir/input_<mode_id>.txt``
        Default value is ``False``.
    load_balance : int, optional
        The load balancing technique to use for parallel execution.
        ``ensign.constants.NO_LOAD_BALANCE`` (default) is no load balancing,
        ``ensign.constants.LOAD_BALANCE`` is standard load balancing, and
        ``ensign.constants.LOAD_BALANCE_UNSORTED`` is unsorted load balancing.
    verbose : bool, optional
        Output verbose decomposition progress to ``out_dir/output.txt``
        Default value is ``False``.
    gen_backtrack : bool, optional
        Output component-wise backtracking information to ``out_dir/cpd_backtrack.txt``
        Default value is ``False``.

    Raises
    ------
    TypeError
        If the parameters fail validation.
    IOError
        If sptensor is a str that is neither an existing directory nor an
        existing file.
    SystemExit
        With code ``MALLOC_ERROR`` if the native routine reports an
        allocation failure.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    ensign.sptensor.SPTensor : Sparse tensor class

    References
    ----------
    .. [1] Hansen, S., Plantenga, T., Kolda, T., "Newton-based Optimization for
       Kullback-Leibler Nonnegative Tensor Factorizations," Optimization
       Methods and Software 30(5), pp. 1002-1029, 2015.
    """
    # The literal 1 fills the third positional slot of _validate_params; the
    # APR iteration limits are passed through the trailing arguments.
    if not _validate_params(sptensor, rank, 1, stop_tol, mem_limit_gb, max_outer_iter, max_inner_iter):
        msg = "Invalid Parameters"
        logger.error(msg)
        raise TypeError(msg)

    # Accept a path in place of an in-memory tensor.
    if isinstance(sptensor, str):
        if os.path.isdir(sptensor):
            sptensor = spt.read_sptensor(sptensor)
        elif os.path.isfile(sptensor):
            # A bare file name has an empty dirname; fall back to the current
            # directory so that appending '/' does not point at the
            # filesystem root.
            tensor_dir = os.path.dirname(sptensor) or os.curdir
            sptensor = spt.read_sptensor(tensor_dir + '/', os.path.basename(sptensor))
        else:
            msg = '{} is not a valid sptensor file or directory.'.format(sptensor)
            logger.error(msg)
            raise IOError(msg)

    # Build the ctypes structures handed to the native library.
    mem_limit_bytes = mem_limit_gb*(1024**3)
    tensor = _get_sparse_co_tensor(sptensor)
    # NOTE(review): load_balance is accepted and documented but is not
    # forwarded to the options object here, so it currently has no effect in
    # this function -- confirm the expected option key and wire it through.
    options = _get_ctype_decomp_options_obj(
        seed=seed,
        cp_apr_pqnr_max_outer_iter=max_outer_iter,
        cp_apr_pqnr_stop_tol=stop_tol,
        cp_apr_pqnr_max_inner_iter=max_inner_iter,
        cp_apr_pqnr_memory_limit=mem_limit_bytes,
        verbose=verbose,
        memory_limit=mem_limit_bytes,
        output_init_guess=output_init_guess
    )
    ktensor = _get_k_tensor(sptensor.order, rank)
    metrics = _get_decomp_metrics()

    native_cp_apr_pqnr = API.cp_apr_pqnr
    native_cp_apr_pqnr.argtypes = [POINTER(et.C_SparseCoTensor), c_int,
                                   POINTER(et.C_DecompOptions), POINTER(et.C_KTensor),
                                   POINTER(et.C_DecompMetrics)]
    ret = native_cp_apr_pqnr(pointer(tensor), rank, pointer(options), pointer(ktensor),
                             pointer(metrics))
    if ret == MALLOC_ERROR:
        logger.error('MALLOC_ERROR. Try increasing the memory limit by using the mem_limit_gb argument.')
        # sys.exit rather than the site-provided exit(), which is intended
        # for interactive use and is not guaranteed to be defined.
        sys.exit(MALLOC_ERROR)

    py_decomp = _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options)
    if output_init_guess:
        _get_init_guess(py_decomp, options)
    if gen_backtrack:
        _compute_cpd_backtrack(py_decomp)
    else:
        py_decomp.cpd_backtrack = None

    if out_dir:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(out_dir, exist_ok=True)
        write_cp_decomp_dir(out_dir, py_decomp)
    return py_decomp