Source code for ensign.cp_decomp

# ENSIGN rights
"""CANDECOMP-PARAFAC (CP) sparse tensor decomposition tools.

This module contains functions for reading, writing, and performing CP 
decompositions along with a class for representing decomposition results.
"""
from ctypes import *
import functools
import json
import multiprocessing as mltprc
import os
import random
import re
import string
import sys

import numpy as np
import pandas as pd

from ensign.constants import *
import ensign.constants as C
import ensign.ensign_ctypes.ensign_types as et
import ensign.ensign_io.ensign_logging as ensign_logging
import ensign.ensign_io.decomp_io as dio
import ensign.ensign_io.sptensor_io as sio
import ensign.sptensor as spt

API = cdll.LoadLibrary('libapi.so')

# Used when calculating backtrack data
# Determines which values (relative to max in mode) should be considered 0
REL_CHOP_POINT = 1e-4
DOUBLE_BYTES = np.dtype(float).itemsize

CP_ALS = 0
CP_ALS_NN = 1
CP_APR = 2
CP_APR_PDNR = 3
CP_APR_PQNR = 4
MALLOC_ERROR = 3

logger = ensign_logging.get_logger()

class StreamingDatalessFit:
    """
    Values necessary for calculating the fit of streaming decompositions 
    without using the base tensor data values.

    Attributes
    ----------
    norm : float
        L2 norm of the base tensor.
    residual_norm : float
        Residual L2 norm from fitting the base tensor to the base tensor 
        decomposition.
    inner_product : numpy.ndarray (float)
        List of terms constituting the inner product of the base tensor and
        the tensor reconstructed from the factor matrices of the base tensor
        decomposition.
    """
    def __init__(self, norm, res_norm, inner_product):
        """ Constructor for CPDecomp
        """
        self.norm = norm
        self.residual_norm = res_norm
        self.inner_product = inner_product

    def __eq__(self, other):
        if type(self) != type(other):
            return False

        if self.norm != other.norm:
            return False
        
        if self.residual_norm != other.residual_norm:
            return False

        if not np.array_equal(self.inner_product, other.inner_product):
            return False

        return True

    def __str__(self):
        return f'Norm: {self.norm}\nResidual-Norm: {self.residual_norm}\nInner-Product: {self.inner_product}'

class CPDecomp:
    """Represents the results of a CP decomposition.

    Attributes
    ----------
    rank : int
        Rank of the decomposition.
    order : int
        Order of the decomposed tensor.
    sptensor : SPTensor
        SPTensor that was decomposed.
    weights : numpy.ndarray
        1D array of length ``rank`` containing weights for each component of
        the decomposition.
    factors : list of numpy.ndarray
        List of 2D factor matrices accessed by ``mode_id``. Each factor
        matrix has ``shape = (mode_size, rank)`` and ``dtype = "float64"``.
        All values in a factor matrix lie inside the closed interval
        ``[-1, 1]``. Each column of a factor matrix represents a single
        component.
    factors_init : list of numpy.ndarray
        List of 2D factor matrices as initialized for the decomposition.
        Only saved if requested by the decomposition call.
    labels : list of list of str, optional
        An optional list of per-mode index labels accessed by ``mode_id``.
        Each list of labels corresponds to the labels for each index of the
        mode. Suppose we have a mode with ``mode_id = 3`` and ``"Yes"``,
        ``"No"``, ``"Maybe"`` mapped to indices 0-2. For this mode,
        ``labels[3]`` is equal to ``["Yes", "No", "Maybe"]``. Decompositions
        are not required to have labels.
    mode_names : list of str, optional
        Each mode's name accessed by ``mode_id``. Decompositions are not
        required to have mode names.
    mode_sizes : list of int
        The number of indices in each mode accessed by ``mode_id``.
    streaming_dataless_fit : StreamingDatalessFit
        Used by decomposition algorithms in a streaming setting to determine
        the final fit.
    metrics : dict
        Dictionary with keys ``time``, ``fit``, ``cosine_sim``,
        ``norm_scaling``, and ``cp_total_iter``, representing, respectively,
        the time taken to run the decomposition, the final fit of the
        decomposition, the cosine similarity of the decomposition to the
        original tensor, the scale factor used to normalize the
        decomposition, and the number of optimization steps used to reach
        the final decomposition values.
    cpd_backtrack : list of list of int
        One entry per component. Each list contains the tensor entries that
        contributed to the corresponding component. If a CPDecomp object has
        two components, where the first tracks to tensor entries 0 and 2 and
        the second tracks to tensor entries 1 and 3, the backtrack is
        ``[[0, 2], [1, 3]]``. Only exists if requested in the decomposition
        call.

    See also
    --------
    ensign.sptensor.SPTensor : Sparse tensor class
    """
    def __init__(self):
        self.filenames = None
        self.rank = 0
        self.order = 0
        self.sptensor = None
        self.weights = None
        self.streaming_dataless_fit = None
        self.factors = []
        self.labels = []
        self.mode_names = []
        self.mode_sizes = []
        self.factors_init = None
        self.metrics = None
        # Not populated unless backtracking is requested; initialized here so
        # consumers such as write_cp_decomp_dir can test it safely.
        self.cpd_backtrack = None
    def __eq__(self, d):
        are_equal = True
        if type(self) != type(d):
            return False

        # First deal with the basics. Are both decomps there? The same size/shape?
        if self is None and d is None:
            return True
        elif self is None or d is None:
            return False
        are_equal = are_equal and self.rank == d.rank
        are_equal = are_equal and self.order == d.order
        are_equal = are_equal and self.weights.shape == d.weights.shape
        are_equal = are_equal and len(self.factors) == len(d.factors)
        if not are_equal:
            return False

        # Check weights
        are_equal = are_equal and np.array_equal(self.weights, d.weights)

        # Check streaming dataless fit
        if self.streaming_dataless_fit is not None and d.streaming_dataless_fit is not None:
            are_equal = are_equal and (self.streaming_dataless_fit == d.streaming_dataless_fit)
        if not are_equal:
            return False

        # Check factor matrices
        for i in range(len(self.factors)):
            if not are_equal:
                break
            are_equal = are_equal and np.array_equal(self.factors[i], d.factors[i])

        return are_equal

    def __ne__(self, d):
        return not self.__eq__(d)

    def __str__(self):
        ret = {'rank': self.rank, 'order': self.order,
               'weights': self.weights.tolist()}
        return json.dumps(ret)
    def compute_cpd_backtrack(self, out_dir=None):
        """Computes CP decomposition backtracking information.

        Parameters
        ----------
        out_dir : str, optional
            Directory in which to save backtracking information.
        """
        self.cpd_backtrack = _compute_cpd_backtrack(self)
        if out_dir:
            dio.write_decomp_backtrack(out_dir, self.cpd_backtrack)
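# Illustrative sketch (not part of the module): inspecting a CPDecomp result.
# 'tensor_dir/' and the rank are hypothetical; 'decomp' would come from one of
# the cp_* functions defined later in this module.
#
#     decomp = cp_als('tensor_dir/', rank=10)
#     for comp_id in range(3):  # weights are sorted descending
#         # Columns of each factor matrix are components; the largest scores
#         # mark the indices that dominate a component.
#         top_idx = decomp.factors[0][:, comp_id].argmax()
#         print(comp_id, decomp.weights[comp_id], decomp.labels[0][top_idx])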
def _get_sparse_co_tensor(py_sptensor):
    order = c_long(py_sptensor.order)
    nnz = c_long(py_sptensor.nnz)
    mode_sizes = (c_long * py_sptensor.order)(*py_sptensor.mode_sizes)
    py_entries = py_sptensor.entries.values[:, py_sptensor.order].astype(np.float64)
    entries = py_entries.ctypes.data_as(POINTER(c_double))
    py_coord_index = py_sptensor.entries.values[:, :-1].astype(np.int64).ravel()
    index = cast((POINTER(c_long) * py_sptensor.nnz)(), POINTER(POINTER(c_long)))
    index[0] = py_coord_index.ctypes.data_as(POINTER(c_long))
    sparse_co_tensor = et.C_SparseCoTensor(nModes=order, nnz=nnz,
                                           size=mode_sizes, NNZ=entries,
                                           SM=index)
    return sparse_co_tensor

def _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options):
    d = CPDecomp()
    d.rank = rank
    d.order = sptensor.order

    weights = np.array([getattr(ktensor, 'lambda')[i] for i in range(rank)])
    sorted_weight_idxs = np.argsort(weights)[::-1]
    d.weights = weights[sorted_weight_idxs]

    d.factors = []
    for i in range(d.order):
        factor = np.ctypeslib.as_array(
            ktensor.matrices[i][0].data[0],
            shape=(sptensor.mode_sizes[i], rank))
        factor = factor[:, sorted_weight_idxs]
        d.factors.append(factor)

    streaming_inner_product = \
        [options.streaming_dataless_fit.contents.ip[i] for i in sorted_weight_idxs]
    d.streaming_dataless_fit = StreamingDatalessFit(
        options.streaming_dataless_fit.contents.norm,
        options.streaming_dataless_fit.contents.res_norm,
        np.array(streaming_inner_product)
    )

    d.mode_names = sptensor.mode_names
    d.mode_sizes = sptensor.mode_sizes
    d.labels = sptensor.labels
    d.sptensor = sptensor

    metrics_dict = {}
    metrics_dict['time'] = metrics.time
    metrics_dict['fit'] = metrics.fit
    metrics_dict['cosine_sim'] = metrics.cosine_sim
    metrics_dict['norm_scaling'] = metrics.norm_scaling
    metrics_dict['cp_total_iter'] = metrics.cp_total_iter
    d.metrics = metrics_dict

    return d

def _get_init_guess(py_decomp, options):
    """Adds initial guesses for factor matrices to a CPDecomp object.

    Copies initial factor matrix guesses from options to the CPDecomp object.

    Parameters
    ----------
    py_decomp : CPDecomp
        CPDecomp object defined in this module.
    options : decomp_options
        decomp_options struct as defined in 'ensign_types.h'
    """
    py_decomp.factors_init = [np.zeros((py_decomp.mode_sizes[i], py_decomp.rank))
                              for i in range(py_decomp.order)]
    for i in range(py_decomp.order):
        for j in range(py_decomp.mode_sizes[i]):
            for k in range(py_decomp.rank):
                py_decomp.factors_init[i][j][k] = \
                    options.factor_matrices_initial_guess[i][j * py_decomp.rank + k]

def _get_ctype_decomp_options_obj(
        seed=0,
        verbose=False,
        memory_limit=2e9,
        factor_matrices_initial_guess=POINTER(POINTER(c_double))(),
        streaming_dataless_fit=POINTER(et.C_StreamingDatalessFit)(),
        output_init_guess=False,
        cp_als_memory_limit=C.CP_ALS_MEM_LIMIT_GB*(1024**3),
        cp_als_max_iter=C.CP_ALS_MAX_ITER,
        cp_als_stop_tol=C.CP_ALS_STOP_TOL,
        cp_als_nn_max_iter=C.CP_ALS_NN_MAX_ITER,
        cp_als_nn_memory_limit=C.CP_ALS_MEM_LIMIT_GB*(1024**3),
        cp_als_nn_stop_tol=C.CP_ALS_NN_STOP_TOL,
        cp_apr_memory_limit=C.CP_APR_MEM_LIMIT_GB*(1024**3),
        cp_apr_max_outer_iter=C.CP_APR_MAX_OUTER_ITER,
        cp_apr_max_inner_iter=C.CP_APR_MAX_INNER_ITER,
        cp_apr_stop_tol=C.CP_APR_STOP_TOL,
        cp_apr_pqnr_memory_limit=C.CP_APR_PQNR_MEM_LIMIT_GB*(1024**3),
        cp_apr_pqnr_max_outer_iter=C.CP_APR_PQNR_MAX_OUTER_ITER,
        cp_apr_pqnr_max_inner_iter=C.CP_APR_PQNR_MAX_INNER_ITER,
        cp_apr_pqnr_stop_tol=C.CP_APR_PQNR_STOP_TOL,
        cp_apr_pqnr_is_inexact=C.CP_APR_PQNR_IS_INEXACT,
        cp_apr_pqnr_lbfgs_m=C.CP_APR_PQNR_LBFGS_M,
        cp_apr_pqnr_skooch_mode=C.CP_APR_PQNR_SKOOCH_MODE,
        cp_apr_pdnr_memory_limit=C.CP_APR_PDNR_MEM_LIMIT_GB*(1024**3),
        cp_apr_pdnr_max_outer_iter=C.CP_APR_PDNR_MAX_OUTER_ITER,
        cp_apr_pdnr_max_inner_iter=C.CP_APR_PDNR_MAX_INNER_ITER,
        cp_apr_pdnr_stop_tol=C.CP_APR_PDNR_STOP_TOL,
        cp_apr_pdnr_is_inexact=C.CP_APR_PDNR_IS_INEXACT):
    return et.C_DecompOptions(
        c_int(seed), c_int(verbose), c_long(int(memory_limit)),
        c_int(output_init_guess), factor_matrices_initial_guess,
        streaming_dataless_fit, c_long(int(cp_als_memory_limit)),
        c_long(cp_als_max_iter), c_double(cp_als_stop_tol),
        c_long(cp_als_nn_max_iter), c_long(int(cp_als_nn_memory_limit)),
        c_double(cp_als_nn_stop_tol), c_long(int(cp_apr_memory_limit)),
        c_long(cp_apr_max_outer_iter), c_long(cp_apr_max_inner_iter),
        c_double(cp_apr_stop_tol), c_long(int(cp_apr_pqnr_memory_limit)),
        c_long(cp_apr_pqnr_max_outer_iter), c_long(cp_apr_pqnr_max_inner_iter),
        c_double(cp_apr_pqnr_stop_tol), c_long(cp_apr_pqnr_is_inexact),
        c_long(cp_apr_pqnr_lbfgs_m), c_long(int(cp_apr_pqnr_skooch_mode)),
        c_long(int(cp_apr_pdnr_memory_limit)), c_long(cp_apr_pdnr_max_outer_iter),
        c_long(cp_apr_pdnr_max_inner_iter), c_double(cp_apr_pdnr_stop_tol),
        c_long(cp_apr_pdnr_is_inexact))

def _get_decomp_metrics():
    return et.C_DecompMetrics(0.0, 0.0, 0.0, 0.0, 0)

def _get_k_tensor(order, rank):
    num_modes = c_long(order)
    num_factors = c_long(rank)
    weights = POINTER(c_double)((c_double * rank)(*([1.0] * rank)))
    # Array of pointers
    matrices = (POINTER(et.C_FactorMatrix) * order)
    # Instantiate array of pointers
    matrices = matrices(*([pointer(et.C_FactorMatrix())] * order))
    # Cast to double pointer
    matrices = POINTER(POINTER(et.C_FactorMatrix))(matrices)
    scratch_matrix = POINTER(et.C_FactorMatrix)()
    return et.C_KTensor(num_modes, matrices, weights, num_factors, scratch_matrix)

def _validate_params(sptensor, rank, max_iter, stop_tol, mem_lim,
                     outer_iter=1, inner_iter=1):
    if not isinstance(sptensor, spt.SPTensor) and not isinstance(sptensor, str):
        msg = "sptensor is not of class SPTensor or an SPTensor filename."
        logger.error(msg)
        raise TypeError(msg)
    if sptensor is None:
        msg = "sptensor is None."
        logger.error(msg)
        raise TypeError(msg)
    if not isinstance(rank, int):
        msg = "Rank must be an int."
        logger.error(msg)
        raise TypeError(msg)
    if rank <= 0:
        msg = "Rank of decomposition cannot be less than 1."
        logger.error(msg)
        raise TypeError(msg)
    if max_iter < 1:
        msg = "Maximum iterations cannot be less than 1."
        logger.error(msg)
        raise TypeError(msg)
    if outer_iter < 1:
        msg = "Maximum outer iterations cannot be less than 1."
        logger.error(msg)
        raise TypeError(msg)
    if inner_iter < 1:
        msg = "Maximum inner iterations cannot be less than 1."
        logger.error(msg)
        raise TypeError(msg)
    if stop_tol <= 0.0:
        msg = "Stop tolerance cannot be less than or equal to 0."
        logger.error(msg)
        raise TypeError(msg)
    if mem_lim <= 0:
        msg = "Memory limit (GB) cannot be less than or equal to 0."
        logger.error(msg)
        raise TypeError(msg)
    return 1
def reconstruct_into(decomp, tensor, comp_ids=None, orig_entries=False):
    """Reconstructs a CP decomposition 'into' a sparse tensor.

    Reconstructs a tensor from a decomposition ``decomp`` by computing and
    summing selected outer products. If a list of component IDs is given in
    ``comp_ids``, then only components in this list contribute to the
    reconstruction. Only nonzero indices of ``tensor`` are included in the
    reconstructed tensor.

    Parameters
    ----------
    decomp : CPDecomp
        Decomposition to reconstruct using sum-of-outer-products.
    tensor : SPTensor
        The sparse tensor that contains the nonzero entries to reconstruct
        into.
    comp_ids : list, optional
        A list of components to reconstruct the SPTensor from. If ``None``
        (default), all components are used.
    orig_entries : bool, optional
        Controls the value of entries in the reconstructed tensor. When
        ``True``, nonzero values in the sum-of-outer-products reconstruction
        are given the same value as ``tensor`` regardless of the
        reconstructed value. When ``False`` (default), nonzero values are
        given the value produced by the sum-of-outer-products reconstruction.

    Returns
    -------
    reconstructed_spt : SPTensor
        The sparse tensor produced by reconstructing selected components of
        ``decomp`` into nonzero values of ``tensor``.
    """
    # Validate decomposition and sparse tensor
    if decomp is None:
        msg = "Parameter 'decomp' must be a CPDecomp object (not 'None')"
        logger.error(msg)
        raise ValueError(msg)
    if tensor is None:
        msg = "Parameter 'tensor' must be an SPTensor object (not 'None')"
        logger.error(msg)
        raise ValueError(msg)
    if not isinstance(decomp, CPDecomp) or not isinstance(tensor, spt.SPTensor):
        msg = "Parameters 'decomp' and 'tensor' must be, respectively, CPDecomp and SPTensor objects"
        logger.error(msg)
        raise ValueError(msg)
    if decomp.order != tensor.order:
        msg = "Parameters 'decomp' and 'tensor' must have the same order"
        logger.error(msg)
        raise ValueError(msg)
    if len(decomp.mode_sizes) != len(tensor.mode_sizes):
        msg = "Parameters 'decomp' and 'tensor' must have the same order"
        logger.error(msg)
        raise ValueError(msg)
    for mode_id in range(decomp.order):
        if decomp.mode_sizes[mode_id] != tensor.mode_sizes[mode_id]:
            msg = "Parameters 'decomp' and 'tensor' must have identical mode sizes"
            logger.error(msg)
            raise ValueError(msg)

    # Validate component list
    if comp_ids is not None:
        for comp_id in comp_ids:
            try:
                int(comp_id)
            except (TypeError, ValueError):
                msg = "All items in parameter 'comp_ids' must be integers"
                logger.error(msg)
                raise ValueError(msg)
            if int(comp_id) >= len(decomp.weights) or int(comp_id) < 0:
                msg = "All items in parameter 'comp_ids' must be on the interval [0, 'd.rank')"
                logger.error(msg)
                raise ValueError(msg)

    # Validate original entries flag
    if orig_entries != True and orig_entries != False:
        msg = "Flag 'orig_entries' must be 'True' or 'False'"
        logger.error(msg)
        raise ValueError(msg)

    # Copy original sparse tensor
    reconstructed_spt = spt.SPTensor(tensor.order, tensor.nnz,
                                     tensor.mode_sizes, tensor.entries.copy())

    # Weight distributes to every dot product calculation for a component. If
    # we want a component to be represented, we set its weight to weight*1.0.
    base_product = np.copy(decomp.weights)
    if comp_ids is not None:
        base_product = np.zeros(decomp.rank)
        for comp_id in comp_ids:
            base_product[comp_id] = 1.0
        base_product *= decomp.weights

    # For every index tuple in the original sparse tensor we compute its outer
    # product in all components simultaneously. We sum outer products for the
    # index tuple to obtain its reconstructed value.
    indices = tensor.entries.values.astype("int64")
    for nz_id, nz in enumerate(indices):
        outer_product = np.copy(base_product)
        for mode_id in range(decomp.order):
            entry_id = nz[mode_id]
            entry_scores = decomp.factors[mode_id][entry_id]
            outer_product *= entry_scores
        if orig_entries:
            reconstructed_value = tensor.entries.values[nz_id, -1]
        else:
            reconstructed_value = outer_product.sum()
        reconstructed_spt.entries.iloc[nz_id, decomp.order] = reconstructed_value

    return reconstructed_spt
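# Hedged usage sketch for reconstruct_into (names hypothetical): restrict the
# reconstruction to the two heaviest components and compare the reconstructed
# nonzero values against the originals.
#
#     heavy = [0, 1]  # weights are sorted descending, so these are heaviest
#     approx = reconstruct_into(decomp, tensor, comp_ids=heavy)
#     diff = tensor.entries.values[:, -1] - approx.entries.values[:, -1]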
def get_fit_per_entry(decomp, tensor, top_k=None):
    """Calculates fit for each entry of a reconstructed tensor present in the
    original tensor.

    This function evaluates how well each entry of the tensor reconstructed
    from ``decomp`` matches the entry at the same index in ``tensor``. This
    evaluation is calculated as fit, a floating point number on the interval
    (-infinity, 1.0]. A fit of 1.0 is an exact match between the
    reconstructed and original entry. Output is sorted by fit (ascending).

    Lower fit values indicate a reconstructed entry is different than the
    original entry. Low fit values may be indicative of anomalous data.

    Parameters
    ----------
    decomp : CPDecomp
        Decomposition used to calculate per-element fit. A sparse tensor is
        reconstructed from ``decomp`` using sum-of-outer-products and the
        results are compared to ``tensor``.
    tensor : SPTensor
        The sparse tensor that is the basis for comparison.
    top_k : int, optional
        The number of elements to return. If ``top_k`` is ``None`` (default),
        fit values for all entries are returned. If ``top_k`` is positive,
        the ``top_k`` lowest fit elements are returned. If ``top_k`` is
        negative, the ``abs(top_k)`` highest fit elements are returned. If
        ``top_k`` is zero, an empty ``ndarray`` is returned.

    Returns
    -------
    fit_vals : numpy.ndarray
        A ``numpy.ndarray`` of the same shape and format as
        ``tensor.entries``. The first ``tensor.order`` columns of
        ``fit_vals`` are indices into each mode of ``tensor``. The final
        column of ``fit_vals`` contains the entry's fit value.
    """
    # Validate decomposition and sparse tensor
    if decomp is None:
        msg = "Parameter 'decomp' must be a CPDecomp object (not 'None')"
        logger.error(msg)
        raise ValueError(msg)
    if tensor is None:
        msg = "Parameter 'tensor' must be an SPTensor object (not 'None')"
        logger.error(msg)
        raise ValueError(msg)
    if not (isinstance(decomp, CPDecomp) and isinstance(tensor, spt.SPTensor)):
        msg = "Parameters 'decomp' and 'tensor' must be, respectively, CPDecomp and SPTensor objects"
        logger.error(msg)
        raise ValueError(msg)
    if decomp.order != tensor.order:
        msg = "Parameters 'decomp' and 'tensor' must have the same order"
        logger.error(msg)
        raise ValueError(msg)
    if len(decomp.mode_sizes) != len(tensor.mode_sizes):
        msg = "Parameters 'decomp' and 'tensor' must have the same order"
        logger.error(msg)
        raise ValueError(msg)
    for mode_id in range(decomp.order):
        if decomp.mode_sizes[mode_id] != tensor.mode_sizes[mode_id]:
            msg = "Parameters 'decomp' and 'tensor' must have identical mode sizes"
            logger.error(msg)
            raise ValueError(msg)

    # Validate top_k
    if top_k is not None:
        try:
            int(top_k)
        except (TypeError, ValueError):
            msg = "Parameter 'top_k' must be integer or None"
            logger.error(msg)
            raise ValueError(msg)
        if int(top_k) == 0:
            return np.empty(shape=(0, 0))

    # Reconstruct decomposition into original tensor
    r = reconstruct_into(decomp, tensor)

    # Compute difference between original and reconstructed nonzero values
    reconstructed_vals = r.entries.values[:, decomp.order]
    orig_vals = tensor.entries.values[:, decomp.order]
    diff_vals = orig_vals - reconstructed_vals

    # Fit of original value in reconstruction is 1 - |diff / orig|
    # NOTE: Negative values indicate bad fit, and larger magnitude is worse
    #       e.g., -1 is bad, -100 is worse
    fit_orig = 1.0 - np.absolute(diff_vals / orig_vals)

    # Copy original entries and replace value of each index tuple with fit
    fit_vals = tensor.entries.values.copy()
    fit_vals[:, decomp.order] = fit_orig

    # Sort entries by fit (ascending). Smallest fit values are anomalies.
    view_spec = ""
    for i in range(decomp.order):
        view_spec += "float64,"
    view_spec += "float64"
    field_spec = "f" + str(decomp.order)
    fit_vals.view(view_spec).sort(order=[field_spec], axis=0)

    # Return the sorted ndarray
    if top_k is None:
        return fit_vals
    elif int(top_k) > 0:
        return fit_vals[:top_k, :]
    elif int(top_k) < 0:
        return fit_vals[top_k:, :]
    else:
        # Guard; we should never reach this point
        msg = "Invalid top_k value"
        logger.error(msg)
        raise ValueError(msg)
def are_close(decomp0, decomp1, rtol=1e-5, atol=1e-8):
    """Checks if weights and factor matrices are close according to tolerance.

    Uses the following formula element-wise on weights and factor matrices:

        abs(d0 - d1) <= (atol + rtol * abs(d1))

    If this inequality is ``True`` for all elements then ``True`` is returned.

    Parameters
    ----------
    decomp0, decomp1 : CPDecomp
        Decompositions to compare for closeness.
    rtol : float, optional
        Relative tolerance.
    atol : float, optional
        Absolute tolerance.

    Returns
    -------
    are_close : bool
        True if the decomposition weights and factor matrices are
        element-wise equal within a tolerance.
    """
    are_close = True

    # First deal with the basics. Are both decomps there? The same size/shape?
    if decomp0 is None and decomp1 is None:
        return True
    elif decomp0 is None or decomp1 is None:
        return False
    are_close = are_close and decomp0.rank == decomp1.rank
    are_close = are_close and decomp0.order == decomp1.order
    are_close = are_close and decomp0.weights.shape == decomp1.weights.shape
    are_close = are_close and len(decomp0.factors) == len(decomp1.factors)
    for i in range(len(decomp0.factors)):
        are_close = are_close and decomp0.factors[i].shape == decomp1.factors[i].shape
    if not are_close:
        return False

    # Check weights
    are_close = are_close and np.allclose(decomp0.weights, decomp1.weights,
                                          rtol=rtol, atol=atol)

    # Check factor matrices
    for i in range(len(decomp0.factors)):
        if not are_close:
            break
        are_close = are_close and np.allclose(decomp0.factors[i],
                                              decomp1.factors[i],
                                              rtol=rtol, atol=atol)

    return are_close
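# Sketch (names hypothetical): decompositions started from different seeds
# rarely match exactly, so compare them with tolerances rather than __eq__.
#
#     d0 = cp_als(tensor, rank=5, seed=0)
#     d1 = cp_als(tensor, rank=5, seed=1)
#     print(are_close(d0, d1, rtol=1e-3, atol=1e-6))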
def read_cp_decomp_dir(decomp_dir):
    """Reads a CP decomposition from the filesystem.

    Parameters
    ----------
    decomp_dir : str
        Path to the directory containing the CP decomposition.

    Returns
    -------
    decomposition : CPDecomp
        The decomposition in ``decomp_dir``.

    Raises
    ------
    Exception
        If the decomposition cannot be read or is not well formed.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    """
    if decomp_dir is None or not isinstance(decomp_dir, str):
        msg = "decomp_dir is not of type str."
        logger.error(msg)
        raise TypeError(msg)
    if len(decomp_dir) < 1:
        msg = "decomp_dir is not a valid directory."
        logger.error(msg)
        raise ValueError(msg)
    if decomp_dir[-1] != '/':
        decomp_dir += '/'

    # Initialize decomposition object and build list of decomp_mode_<i>.txt files
    decomp = CPDecomp()
    decomp_fn_pattern = re.compile('decomp_mode_[0-9]+.txt')
    decomp_mode_fns = sorted(filter(lambda x: decomp_fn_pattern.fullmatch(x) is not None,
                                    os.listdir(decomp_dir)))
    if len(decomp_mode_fns) == 0:
        msg = 'No decomp_mode_<i>.txt (factor matrices) files found in {}.'.format(decomp_dir)
        logger.error(msg)
        raise IOError(msg)

    # Assign decomposition-specific values
    decomp.order = len(decomp_mode_fns)
    decomp.weights, decomp.rank = dio.read_weights(decomp_dir + '/weights.txt')
    decomp.factors = [dio.read_factor_matrix(decomp_dir + '/' + fn)[0]
                      for fn in decomp_mode_fns]
    decomp.mode_sizes = [factor.shape[0] for factor in decomp.factors]
    if 'streaming.txt' in os.listdir(decomp_dir):
        decomp.streaming_dataless_fit = dio.read_streaming(decomp_dir + '/streaming.txt')

    # Assign sptensor-specific values
    mode_map_fn_pattern = re.compile('map_mode_[0-9]+.txt')
    mode_map_fns = sorted(filter(lambda x: mode_map_fn_pattern.fullmatch(x) is not None,
                                 os.listdir(decomp_dir)))
    if 'tensor_data.txt' in os.listdir(decomp_dir):
        decomp.sptensor = spt.read_sptensor(decomp_dir)
        decomp.labels = decomp.sptensor.labels
        decomp.mode_names = decomp.sptensor.mode_names
    elif mode_map_fns:
        decomp.mode_names, decomp.labels = sio.read_many_labels(decomp_dir, mode_map_fns)
    else:
        decomp.mode_names = ['mode_' + str(x) for x in range(len(decomp.mode_sizes))]
        decomp.labels = [['label_{}-{}'.format(str(i), str(x)) for x in range(mode_size)]
                        for i, mode_size in enumerate(decomp.mode_sizes)]

    # Read backtracking data if available
    decomp.cpd_backtrack = dio.read_decomp_backtrack(decomp_dir)

    return decomp
def write_cp_decomp_dir(decomp_dir, decomp, write_tensor=False):
    """Writes a CP decomposition to the filesystem.

    Parameters
    ----------
    decomp_dir : str
        Path to the destination directory of the CP decomposition. This
        directory will be created if nonexistent and **will be erased and
        overwritten** if it exists.
    decomp : CPDecomp
        The decomposition to be written.
    write_tensor : bool
        Write the tensor_data.txt and map_mode_<x>.txt files.

    Raises
    ------
    Exception
        If the decomposition could not be written.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    """
    if not os.path.exists(decomp_dir):
        os.makedirs(decomp_dir)
    else:
        if write_tensor:
            fn_pattern = re.compile('map_mode_[0-9]+.txt|decomp_mode_[0-9]+.txt|'
                                    'initial_guess_[0-9]+.txt|streaming.txt|'
                                    'weights.txt|tensor_data.txt')
        else:
            fn_pattern = re.compile('decomp_mode_[0-9]+.txt|initial_guess_[0-9]+.txt|'
                                    'streaming.txt|weights.txt')
        for fn in list(filter(lambda x: fn_pattern.fullmatch(x) is not None,
                              os.listdir(decomp_dir))):
            os.remove(os.path.join(decomp_dir, fn))

    # Scores
    for mode_id, factors in enumerate(decomp.factors):
        dio.write_factor_matrix(decomp_dir, mode_id, factors)

    # Weights
    dio.write_weights(decomp_dir, decomp.weights)

    # Streaming dataless fit
    if decomp.streaming_dataless_fit is not None:
        dio.write_streaming(decomp_dir, decomp.streaming_dataless_fit)

    # Initial score guesses
    if decomp.factors_init is not None:
        for mode_id, factors in enumerate(decomp.factors_init):
            dio.write_factor_matrix(decomp_dir, mode_id, factors, 'initial_guess')

    # Labels
    if write_tensor:
        for mode_id, labels in enumerate(decomp.labels):
            mode_name = decomp.mode_names[mode_id]
            sio.write_labels(decomp_dir, mode_id, mode_name, labels)

    # Tensor
    if decomp.sptensor is not None and write_tensor:
        sio.write_sptensor_entries(decomp_dir, decomp.sptensor.mode_sizes,
                                   decomp.sptensor.entries.values)

    # Backtracking information
    if decomp.cpd_backtrack is not None:
        dio.write_decomp_backtrack(decomp_dir, decomp.cpd_backtrack)
    else:
        if os.path.isfile(decomp_dir + '/cpd_backtrack.txt'):
            os.remove(decomp_dir + '/cpd_backtrack.txt')
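# Round-trip sketch (paths hypothetical): a decomposition written with
# write_cp_decomp_dir can be restored with read_cp_decomp_dir; passing
# write_tensor=True makes the tensor entries and label maps travel with it.
#
#     write_cp_decomp_dir('decomp_dir/', decomp, write_tensor=True)
#     restored = read_cp_decomp_dir('decomp_dir/')
#     assert are_close(decomp, restored)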
def _filter_cartesian_product(index_lists, valid_indices):
    """Computes the component backtrack info from a component of a
    decomposition to the tensor by finding the intersection of valid_indices
    with the Cartesian product of the lists in index_lists. index_lists can
    be of arbitrary length.

    Parameters
    ----------
    index_lists : list of lists
        A list containing one list for each decomposition mode. Each list
        contains indices that are hot in the component in question.
    valid_indices : iterable of tuples
        All index tuples appearing in the tensor.

    Returns
    -------
    product : list of tuples
        All tuples in the Cartesian product of index_lists that appear in
        valid_indices. These correspond to the tensor indices that are hot
        in the component.
    """
    index_list_lengths = list(map(len, index_lists))
    # If elements in the Cartesian product were to reside in a multi-d array,
    # the ith element of flat_lengths is the number of elements in a slice
    # where the first i indices of that multi-d array are specified.
    flat_lengths = [functools.reduce(lambda x, y: x * y, index_list_lengths[i:])
                    for i in range(len(index_lists))]
    n_entries_in_cart_prod = flat_lengths.pop(0)

    product = []
    for i in range(n_entries_in_cart_prod):
        idx = i
        # Compute which indices should be chosen in constructing the current
        # element of the Cartesian product.
        element_indices = ()
        for length in flat_lengths:
            element_indices += (idx // length,)
            idx %= length
        element_indices += (idx,)
        # Construct the element given the indices that should be chosen.
        element = tuple(index_lists[j][element_indices[j]]
                        for j in range(len(index_lists)))
        # Only store the element if it is specified as valid.
        if element in valid_indices:
            product.append(element)
    return product

def _forwardtrack_decomp_component(decomp, comp_id):
    """Computes the component backtrack from a component of a decomposition
    to the tensor by looking "forward" from the tensor to the decomposition
    to check which indices appear in the component. This method is faster
    than _filter_cartesian_product if the component has more hot indices
    than the original tensor.

    Parameters
    ----------
    decomp : CPDecomp
        Decomposition object in question.
    comp_id : int
        The component whose backtrack info is being calculated.

    Returns
    -------
    forward_track : list of tuples
        All indices in the tensor that are hot in the component.
    """
    tensor = decomp.sptensor
    forward_track = []
    # Iterate over all hot indices in the tensor.
    for i, coordinate_row in enumerate(tensor.entries.values[:, :-1]):
        nonzero_flag = 1
        # Check if the corresponding element of the outer product is 0.
        for label_index, factor_matrix in zip(coordinate_row, decomp.factors):
            # Use a relative chop point below which a score counts as "zero".
            mode_max = factor_matrix[:, comp_id].max()
            if factor_matrix[int(label_index), comp_id] < REL_CHOP_POINT * mode_max:
                nonzero_flag = 0
        if nonzero_flag:
            forward_track.append(tuple(coordinate_row))
    return forward_track

def _compute_cpd_backtrack(decomp):
    """Computes the backtrack info from each component of a decomposition to
    the original tensor. Sets the appropriate field in the CPDecomp object.

    Parameters
    ----------
    decomp : CPDecomp
        Decomposition object in question.
    """
    indices = [tuple(map(int, t)) for t in decomp.sptensor.entries.values[:, :-1]]
    # Save a map from indices to tensor line numbers.
    line_nums = dict(zip(indices, range(len(indices))))

    cpd_backtrack = []
    # For each component keep track of hot indices in each mode.
    for comp_id in range(decomp.rank):
        non_zero_indices = []
        for mode_id in range(decomp.order):
            mode_vec = decomp.factors[mode_id][:, comp_id]
            mode_max = mode_vec.max()
            hot_mode_indices = list(np.argwhere(mode_vec > REL_CHOP_POINT * mode_max).ravel())
            non_zero_indices.append(hot_mode_indices)

        # Check if there are more hot indices in the component than in the tensor.
        num_comp_entries = np.prod([len(l) for l in non_zero_indices])
        if num_comp_entries > len(decomp.sptensor.entries):
            indices_to_track = _forwardtrack_decomp_component(decomp, comp_id)
        else:
            indices_to_track = _filter_cartesian_product(non_zero_indices, line_nums)

        # Convert from indices to line numbers.
        entries_to_track = [line_nums[i] for i in indices_to_track]
        cpd_backtrack.append(entries_to_track)

    decomp.cpd_backtrack = cpd_backtrack
    return cpd_backtrack
def cp_als(sptensor, rank, out_dir=None, seed=CP_ALS_SEED,
           max_iter=CP_ALS_MAX_ITER, stop_tol=CP_ALS_STOP_TOL,
           mem_limit_gb=CP_ALS_MEM_LIMIT_GB,
           output_init_guess=CP_ALS_OUTPUT_INIT_GUESS,
           verbose=CP_ALS_VERBOSE, gen_backtrack=CP_ALS_BACKTRACK):
    """Performs a CP decomposition using the alternating least squares (ALS)
    method. [1]_ [2]_

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor to decompose. If sptensor is of type str, it should contain a
        path to a directory containing an SPTensor or directly to a
        tensor_data.txt file.
    rank : int
        Rank of the decomposition.
    out_dir : str
        Path to directory where decomposition results will be written. If
        not set, the decomposition will not be written to disk.

    Returns
    -------
    decomp : CPDecomp
        Results of the decomposition.

    Other Parameters
    ----------------
    seed : int, optional
        Random seed used to initialize the decomposition. Default value is
        ``0``.
    max_iter : int, optional
        Maximum iterations before stopping the decomposition. Default value
        is ``100``.
    stop_tol : float, optional
        Minimum change in fit required for convergence. Default value is
        ``1.0e-5``.
    mem_limit_gb : int, optional
        Maximum memory used during the decomposition, in GiB. Default value
        is ``2``.
    output_init_guess : bool, optional
        Write the initial guess of the decomposition to
        ``out_dir/input_<mode_id>.txt``. Default value is ``False``.
    verbose : bool, optional
        Output verbose decomposition progress to ``out_dir/output.txt``.
        Default value is ``False``.
    gen_backtrack : bool, optional
        Output component-wise backtracking information to
        ``out_dir/cpd_backtrack.txt``. Default value is ``False``.

    Raises
    ------
    Exception
        If there is a problem performing the decomposition.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    ensign.sptensor.SPTensor : Sparse tensor class

    References
    ----------
    .. [1] Kolda, T., Bader, B., "Tensor Decompositions and Applications,"
           SIAM Review, 51(3), pp. 455-500, 2009.
    .. [2] Baskaran, M., Meister, B., Vasilache, N., Lethin, R., "Efficient
           and Scalable Computations with Sparse Tensors," IEEE HPEC, 2012.
    """
    if not _validate_params(sptensor, rank, max_iter, stop_tol, mem_limit_gb):
        msg = "Invalid Parameters"
        logger.error(msg)
        raise TypeError(msg)

    if isinstance(sptensor, str):
        if os.path.isdir(sptensor):
            sptensor = spt.read_sptensor(sptensor)
        elif os.path.isfile(sptensor):
            sptensor = spt.read_sptensor(os.path.dirname(sptensor) + '/',
                                         os.path.basename(sptensor))
        else:
            msg = '{} is not a valid sptensor file or directory.'.format(sptensor)
            logger.error(msg)
            raise IOError(msg)

    tensor = _get_sparse_co_tensor(sptensor)
    options = _get_ctype_decomp_options_obj(
        seed=seed,
        cp_als_max_iter=max_iter,
        cp_als_stop_tol=stop_tol,
        cp_als_memory_limit=mem_limit_gb*(1024**3),
        verbose=verbose,
        memory_limit=mem_limit_gb*(1024**3),
        output_init_guess=output_init_guess
    )
    ktensor = _get_k_tensor(sptensor.order, rank)
    metrics = _get_decomp_metrics()

    cp_als = API.cp_als
    cp_als.argtypes = [POINTER(et.C_SparseCoTensor), c_int,
                       POINTER(et.C_DecompOptions), POINTER(et.C_KTensor),
                       POINTER(et.C_DecompMetrics)]
    ret = cp_als(pointer(tensor), rank, pointer(options), pointer(ktensor),
                 pointer(metrics))
    if ret == MALLOC_ERROR:
        logger.error('MALLOC_ERROR. Try increasing the memory limit by using '
                     'the mem_limit_gb argument.')
        sys.exit(MALLOC_ERROR)

    py_decomp = _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options)
    if output_init_guess:
        _get_init_guess(py_decomp, options)
    if gen_backtrack:
        _compute_cpd_backtrack(py_decomp)
    else:
        py_decomp.cpd_backtrack = None

    if out_dir:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        write_cp_decomp_dir(out_dir, py_decomp)

    return py_decomp
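# Minimal usage sketch (paths and rank hypothetical; assumes 'tensor_dir/'
# holds a tensor_data.txt produced by ensign.sptensor tooling):
#
#     decomp = cp_als('tensor_dir/', rank=5, seed=42, max_iter=50,
#                     out_dir='cp_als_out/', gen_backtrack=True)
#     print(decomp.metrics['fit'], decomp.metrics['cp_total_iter'])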
def cp_als_nn(sptensor, rank, out_dir=None, seed=CP_ALS_NN_SEED,
              max_iter=CP_ALS_NN_MAX_ITER, stop_tol=CP_ALS_NN_STOP_TOL,
              mem_limit_gb=CP_ALS_NN_MEM_LIMIT_GB,
              output_init_guess=CP_ALS_NN_OUTPUT_INIT_GUESS,
              verbose=CP_ALS_NN_VERBOSE, gen_backtrack=CP_ALS_NN_BACKTRACK):
    """Performs a nonnegative CP decomposition using the alternating least
    squares (ALS) method. Nonnegativity constraints are applied to the output
    factor matrices. Input tensor entries must be nonnegative. [1]_ [2]_

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor to decompose. If sptensor is of type str, it should contain a
        path to a directory containing an SPTensor or directly to a
        tensor_data.txt file.
    rank : int
        Rank of the decomposition.
    out_dir : str
        Path to directory where decomposition results will be written. If
        not set, the decomposition will not be written to disk.

    Returns
    -------
    decomp : CPDecomp
        Results of the decomposition.

    Other Parameters
    ----------------
    seed : int, optional
        Random seed used to initialize the decomposition. Default value is
        ``0``.
    max_iter : int, optional
        Maximum iterations before stopping the decomposition. Default value
        is ``100``.
    stop_tol : float, optional
        Minimum change in fit required for convergence. Default value is
        ``1.0e-5``.
    mem_limit_gb : int, optional
        Maximum memory used during the decomposition, in GiB. Default value
        is ``2``.
    output_init_guess : bool, optional
        Write the initial guess of the decomposition to
        ``out_dir/input_<mode_id>.txt``. Default value is ``False``.
    verbose : bool, optional
        Output verbose decomposition progress to ``out_dir/output.txt``.
        Default value is ``False``.
    gen_backtrack : bool, optional
        Output component-wise backtracking information to
        ``out_dir/cpd_backtrack.txt``. Default value is ``False``.

    Raises
    ------
    Exception
        If there is a problem performing the decomposition.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    ensign.sptensor.SPTensor : Sparse tensor class

    References
    ----------
    .. [1] Kolda, T., Bader, B., "Tensor Decompositions and Applications,"
           SIAM Review, 51(3), pp. 455-500, 2009.
    .. [2] Baskaran, M., Meister, B., Vasilache, N., Lethin, R., "Efficient
           and Scalable Computations with Sparse Tensors," IEEE HPEC, 2012.
    """
    if not _validate_params(sptensor, rank, max_iter, stop_tol, mem_limit_gb):
        msg = "Invalid Parameters"
        logger.error(msg)
        raise TypeError(msg)

    if isinstance(sptensor, str):
        if os.path.isdir(sptensor):
            sptensor = spt.read_sptensor(sptensor)
        elif os.path.isfile(sptensor):
            sptensor = spt.read_sptensor(os.path.dirname(sptensor) + '/',
                                         os.path.basename(sptensor))
        else:
            msg = '{} is not a valid sptensor file or directory.'.format(sptensor)
            logger.error(msg)
            raise IOError(msg)

    tensor = _get_sparse_co_tensor(sptensor)
    options = _get_ctype_decomp_options_obj(
        seed=seed,
        cp_als_max_iter=max_iter,
        cp_als_stop_tol=stop_tol,
        cp_als_memory_limit=mem_limit_gb*(1024**3),
        verbose=verbose,
        cp_als_nn_max_iter=max_iter,
        cp_als_nn_stop_tol=stop_tol,
        memory_limit=mem_limit_gb*(1024**3),
        output_init_guess=output_init_guess
    )
    ktensor = _get_k_tensor(sptensor.order, rank)
    metrics = _get_decomp_metrics()

    cp_als_nn = API.cp_als_nn
    cp_als_nn.argtypes = [POINTER(et.C_SparseCoTensor), c_int,
                          POINTER(et.C_DecompOptions), POINTER(et.C_KTensor),
                          POINTER(et.C_DecompMetrics)]
    ret = cp_als_nn(pointer(tensor), rank, pointer(options), pointer(ktensor),
                    pointer(metrics))
    if ret == MALLOC_ERROR:
        logger.error('MALLOC_ERROR. Try increasing the memory limit by using '
                     'the mem_limit_gb argument.')
        sys.exit(MALLOC_ERROR)

    py_decomp = _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options)
    if output_init_guess:
        _get_init_guess(py_decomp, options)
    if gen_backtrack:
        _compute_cpd_backtrack(py_decomp)
    else:
        py_decomp.cpd_backtrack = None

    if out_dir:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        write_cp_decomp_dir(out_dir, py_decomp)

    return py_decomp
def cp_apr(sptensor, rank, out_dir=None, seed=CP_APR_SEED,
           max_outer_iter=CP_APR_MAX_OUTER_ITER,
           max_inner_iter=CP_APR_MAX_INNER_ITER, stop_tol=CP_APR_STOP_TOL,
           mem_limit_gb=CP_APR_MEM_LIMIT_GB,
           output_init_guess=CP_APR_OUTPUT_INIT_GUESS,
           verbose=CP_APR_VERBOSE, gen_backtrack=CP_APR_BACKTRACK):
    """Performs a CP decomposition using the alternating Poisson regression
    (APR) method. [1]_ [2]_

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor to decompose. If sptensor is of type str, it should contain a
        path to a directory containing an SPTensor or directly to a
        tensor_data.txt file.
    rank : int
        Rank of the decomposition.
    out_dir : str
        Path to directory where decomposition results will be written. If
        not set, the decomposition will not be written to disk.

    Returns
    -------
    decomp : CPDecomp
        Results of the decomposition.

    Other Parameters
    ----------------
    seed : int, optional
        Random seed used to initialize the decomposition. Default value is
        ``0``.
    max_outer_iter : int, optional
        Maximum outer iterations before stopping the decomposition. Default
        value is ``200``.
    max_inner_iter : int, optional
        Maximum inner iterations per outer iteration. Default value is
        ``10``.
    stop_tol : float, optional
        Minimum change in KKT violation required for convergence. Default
        value is ``1.0e-4``.
    mem_limit_gb : int, optional
        Maximum memory used during the decomposition, in GiB. Default value
        is ``2``.
    output_init_guess : bool, optional
        Write the initial guess of the decomposition to
        ``out_dir/input_<mode_id>.txt``. Default value is ``False``.
    verbose : bool, optional
        Output verbose decomposition progress to ``out_dir/output.txt``.
        Default value is ``False``.
    gen_backtrack : bool, optional
        Output component-wise backtracking information to
        ``out_dir/cpd_backtrack.txt``. Default value is ``False``.

    Raises
    ------
    Exception
        If there is a problem performing the decomposition.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    ensign.sptensor.SPTensor : Sparse tensor class

    References
    ----------
    .. [1] Chi, E., Kolda, T., "On Tensors, Sparsity, and Nonnegative
           Factorizations," SIAM Journal on Matrix Analysis and Applications
           33.4, pp. 1272-1299, 2012.
    .. [2] Baskaran, M., Meister, B., Vasilache, N., Lethin, R., "Efficient
           and Scalable Computations with Sparse Tensors," IEEE HPEC, 2012.
    """
    if not _validate_params(sptensor, rank, 1, stop_tol, mem_limit_gb,
                            max_outer_iter, max_inner_iter):
        msg = "Invalid Parameters"
        logger.error(msg)
        raise TypeError(msg)

    if isinstance(sptensor, str):
        if os.path.isdir(sptensor):
            sptensor = spt.read_sptensor(sptensor)
        elif os.path.isfile(sptensor):
            sptensor = spt.read_sptensor(os.path.dirname(sptensor) + '/',
                                         os.path.basename(sptensor))
        else:
            msg = '{} is not a valid sptensor file or directory.'.format(sptensor)
            logger.error(msg)
            raise IOError(msg)

    tensor = _get_sparse_co_tensor(sptensor)
    options = _get_ctype_decomp_options_obj(
        seed=seed,
        cp_apr_max_outer_iter=max_outer_iter,
        cp_apr_stop_tol=stop_tol,
        cp_apr_max_inner_iter=max_inner_iter,
        cp_apr_memory_limit=mem_limit_gb*(1024**3),
        verbose=verbose,
        memory_limit=mem_limit_gb*(1024**3),
        output_init_guess=output_init_guess
    )
    ktensor = _get_k_tensor(sptensor.order, rank)
    metrics = _get_decomp_metrics()

    cp_apr = API.cp_apr
    cp_apr.argtypes = [POINTER(et.C_SparseCoTensor), c_int,
                       POINTER(et.C_DecompOptions), POINTER(et.C_KTensor),
                       POINTER(et.C_DecompMetrics)]
    ret = cp_apr(pointer(tensor), rank, pointer(options), pointer(ktensor),
                 pointer(metrics))
    if ret == MALLOC_ERROR:
        logger.error('MALLOC_ERROR. Try increasing the memory limit by using '
                     'the mem_limit_gb argument.')
        sys.exit(MALLOC_ERROR)

    py_decomp = _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options)
    if output_init_guess:
        _get_init_guess(py_decomp, options)
    if gen_backtrack:
        _compute_cpd_backtrack(py_decomp)
    else:
        py_decomp.cpd_backtrack = None

    if out_dir:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        write_cp_decomp_dir(out_dir, py_decomp)

    return py_decomp
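# Usage sketch (names hypothetical): unlike cp_als's single max_iter, CP-APR
# takes outer/inner iteration caps; it targets count-valued sparse tensors.
#
#     decomp = cp_apr(tensor, rank=8, max_outer_iter=100,
#                     max_inner_iter=10, stop_tol=1e-4)
#     print(decomp.metrics['fit'])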
def cp_apr_pdnr(sptensor, rank, out_dir=None, seed=CP_APR_SEED,
                max_outer_iter=CP_APR_PDNR_MAX_OUTER_ITER,
                max_inner_iter=CP_APR_PDNR_MAX_INNER_ITER,
                stop_tol=CP_APR_PDNR_STOP_TOL,
                mem_limit_gb=CP_APR_PDNR_MEM_LIMIT_GB,
                output_init_guess=CP_APR_PDNR_OUTPUT_INIT_GUESS,
                load_balance=CP_APR_PDNR_LOAD_BALANCE,
                verbose=CP_APR_PDNR_VERBOSE, gen_backtrack=CP_APR_PDNR_BACKTRACK):
    """Performs a CP decomposition using alternating Poisson regression (APR)
    with the projected damped Newton row (PDNR) subproblem. [1]_

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor to decompose. If sptensor is of type str, it should contain a
        path to a directory containing an SPTensor or directly to a
        tensor_data.txt file.
    rank : int
        Rank of the decomposition.
    out_dir : str
        Path to directory where decomposition results will be written. If
        not set, the decomposition will not be written to disk.

    Returns
    -------
    decomp : CPDecomp
        Results of the decomposition.

    Other Parameters
    ----------------
    seed : int, optional
        Random seed used to initialize the decomposition. Default value is
        ``0``.
    max_outer_iter : int, optional
        Maximum outer iterations before stopping the decomposition. Default
        value is ``200``.
    max_inner_iter : int, optional
        Maximum inner iterations per outer iteration. Default value is
        ``10``.
    stop_tol : float, optional
        Minimum change in KKT violation required for convergence. Default
        value is ``1.0e-4``.
    mem_limit_gb : int, optional
        Maximum memory used during the decomposition, in GiB. Default value
        is ``2``.
    output_init_guess : bool, optional
        Write the initial guess of the decomposition to
        ``out_dir/input_<mode_id>.txt``. Default value is ``False``.
    load_balance : int, optional
        The load balancing technique to use for parallel execution.
        ``ensign.constants.NO_LOAD_BALANCE`` (default) is no load balancing,
        ``ensign.constants.LOAD_BALANCE`` is standard load balancing, and
        ``ensign.constants.LOAD_BALANCE_UNSORTED`` is unsorted load
        balancing.
    verbose : bool, optional
        Output verbose decomposition progress to ``out_dir/output.txt``.
        Default value is ``False``.
    gen_backtrack : bool, optional
        Output component-wise backtracking information to
        ``out_dir/cpd_backtrack.txt``. Default value is ``False``.

    Raises
    ------
    Exception
        If there is a problem performing the decomposition.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    ensign.sptensor.SPTensor : Sparse tensor class

    References
    ----------
    .. [1] Hansen, S., Plantenga, T., Kolda, T., "Newton-based Optimization
           for Kullback-Leibler Nonnegative Tensor Factorizations,"
           Optimization Methods and Software 30(5), pp. 1002-1029, 2015.
    """
    if not _validate_params(sptensor, rank, 1, stop_tol, mem_limit_gb,
                            max_outer_iter, max_inner_iter):
        msg = "Invalid Parameters"
        logger.error(msg)
        raise TypeError(msg)

    if isinstance(sptensor, str):
        if os.path.isdir(sptensor):
            sptensor = spt.read_sptensor(sptensor)
        elif os.path.isfile(sptensor):
            sptensor = spt.read_sptensor(os.path.dirname(sptensor) + '/',
                                         os.path.basename(sptensor))
        else:
            msg = '{} is not a valid sptensor file or directory.'.format(sptensor)
            logger.error(msg)
            raise IOError(msg)

    tensor = _get_sparse_co_tensor(sptensor)
    options = _get_ctype_decomp_options_obj(
        seed=seed,
        cp_apr_pdnr_max_outer_iter=max_outer_iter,
        cp_apr_pdnr_stop_tol=stop_tol,
        cp_apr_pdnr_max_inner_iter=max_inner_iter,
        cp_apr_pdnr_memory_limit=mem_limit_gb*(1024**3),
        verbose=verbose,
        memory_limit=mem_limit_gb*(1024**3),
        output_init_guess=output_init_guess
    )
    ktensor = _get_k_tensor(sptensor.order, rank)
    metrics = _get_decomp_metrics()

    cp_apr_pdnr = API.cp_apr_pdnr
    cp_apr_pdnr.argtypes = [POINTER(et.C_SparseCoTensor), c_int,
                            POINTER(et.C_DecompOptions), POINTER(et.C_KTensor),
                            POINTER(et.C_DecompMetrics)]
    ret = cp_apr_pdnr(pointer(tensor), rank, pointer(options),
                      pointer(ktensor), pointer(metrics))
    if ret == MALLOC_ERROR:
        logger.error('MALLOC_ERROR. Try increasing the memory limit by using '
                     'the mem_limit_gb argument.')
        sys.exit(MALLOC_ERROR)

    py_decomp = _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options)
    if output_init_guess:
        _get_init_guess(py_decomp, options)
    if gen_backtrack:
        _compute_cpd_backtrack(py_decomp)
    else:
        py_decomp.cpd_backtrack = None

    if out_dir:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        write_cp_decomp_dir(out_dir, py_decomp)

    return py_decomp
def cp_apr_pqnr(sptensor, rank, out_dir=None, seed=CP_APR_PQNR_SEED,
                max_outer_iter=CP_APR_PQNR_MAX_OUTER_ITER,
                max_inner_iter=CP_APR_PQNR_MAX_INNER_ITER,
                stop_tol=CP_APR_PQNR_STOP_TOL,
                mem_limit_gb=CP_APR_PQNR_MEM_LIMIT_GB,
                output_init_guess=CP_APR_PQNR_OUTPUT_INIT_GUESS,
                load_balance=CP_APR_PQNR_LOAD_BALANCE,
                verbose=CP_APR_PQNR_VERBOSE, gen_backtrack=CP_APR_PQNR_BACKTRACK):
    """Performs a CP decomposition using alternating Poisson regression (APR)
    with the projected quasi-Newton row (PQNR) subproblem. [1]_

    Parameters
    ----------
    sptensor : SPTensor or str
        Tensor to decompose. If sptensor is of type str, it should contain a
        path to a directory containing an SPTensor or directly to a
        tensor_data.txt file.
    rank : int
        Rank of the decomposition.
    out_dir : str
        Path to directory where decomposition results will be written. If
        not set, the decomposition will not be written to disk.

    Returns
    -------
    decomp : CPDecomp
        Results of the decomposition.

    Other Parameters
    ----------------
    seed : int, optional
        Random seed used to initialize the decomposition. Default value is
        ``0``.
    max_outer_iter : int, optional
        Maximum outer iterations before stopping the decomposition. Default
        value is ``200``.
    max_inner_iter : int, optional
        Maximum inner iterations per outer iteration. Default value is
        ``10``.
    stop_tol : float, optional
        Minimum change in KKT violation required for convergence. Default
        value is ``1.0e-4``.
    mem_limit_gb : int, optional
        Maximum memory used during the decomposition, in GiB. Default value
        is ``2``.
    output_init_guess : bool, optional
        Write the initial guess of the decomposition to
        ``out_dir/input_<mode_id>.txt``. Default value is ``False``.
    load_balance : int, optional
        The load balancing technique to use for parallel execution.
        ``ensign.constants.NO_LOAD_BALANCE`` (default) is no load balancing,
        ``ensign.constants.LOAD_BALANCE`` is standard load balancing, and
        ``ensign.constants.LOAD_BALANCE_UNSORTED`` is unsorted load
        balancing.
    verbose : bool, optional
        Output verbose decomposition progress to ``out_dir/output.txt``.
        Default value is ``False``.
    gen_backtrack : bool, optional
        Output component-wise backtracking information to
        ``out_dir/cpd_backtrack.txt``. Default value is ``False``.

    Raises
    ------
    Exception
        If there is a problem performing the decomposition.

    See also
    --------
    ensign.cp_decomp.CPDecomp : CP decomposition class
    ensign.sptensor.SPTensor : Sparse tensor class

    References
    ----------
    .. [1] Hansen, S., Plantenga, T., Kolda, T., "Newton-based Optimization
           for Kullback-Leibler Nonnegative Tensor Factorizations,"
           Optimization Methods and Software 30(5), pp. 1002-1029, 2015.
    """
    if not _validate_params(sptensor, rank, 1, stop_tol, mem_limit_gb,
                            max_outer_iter, max_inner_iter):
        msg = "Invalid Parameters"
        logger.error(msg)
        raise TypeError(msg)

    if isinstance(sptensor, str):
        if os.path.isdir(sptensor):
            sptensor = spt.read_sptensor(sptensor)
        elif os.path.isfile(sptensor):
            sptensor = spt.read_sptensor(os.path.dirname(sptensor) + '/',
                                         os.path.basename(sptensor))
        else:
            msg = '{} is not a valid sptensor file or directory.'.format(sptensor)
            logger.error(msg)
            raise IOError(msg)

    tensor = _get_sparse_co_tensor(sptensor)
    options = _get_ctype_decomp_options_obj(
        seed=seed,
        cp_apr_pqnr_max_outer_iter=max_outer_iter,
        cp_apr_pqnr_stop_tol=stop_tol,
        cp_apr_pqnr_max_inner_iter=max_inner_iter,
        cp_apr_pqnr_memory_limit=mem_limit_gb*(1024**3),
        verbose=verbose,
        memory_limit=mem_limit_gb*(1024**3),
        output_init_guess=output_init_guess
    )
    ktensor = _get_k_tensor(sptensor.order, rank)
    metrics = _get_decomp_metrics()

    cp_apr_pqnr = API.cp_apr_pqnr
    cp_apr_pqnr.argtypes = [POINTER(et.C_SparseCoTensor), c_int,
                            POINTER(et.C_DecompOptions), POINTER(et.C_KTensor),
                            POINTER(et.C_DecompMetrics)]
    ret = cp_apr_pqnr(pointer(tensor), rank, pointer(options),
                      pointer(ktensor), pointer(metrics))
    if ret == MALLOC_ERROR:
        logger.error('MALLOC_ERROR. Try increasing the memory limit by using '
                     'the mem_limit_gb argument.')
        sys.exit(MALLOC_ERROR)

    py_decomp = _get_py_decomp_obj(ktensor, sptensor, rank, metrics, options)
    if output_init_guess:
        _get_init_guess(py_decomp, options)
    if gen_backtrack:
        _compute_cpd_backtrack(py_decomp)
    else:
        py_decomp.cpd_backtrack = None

    if out_dir:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        write_cp_decomp_dir(out_dir, py_decomp)

    return py_decomp