Source code for factor_analyzer.utils

"""
Utility functions, used primarily by the confirmatory factor analysis module.

:author: Jeremy Biggs (jeremy.m.biggs@gmail.com)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: Educational Testing Service
:date: 2022-09-05
"""
import warnings

import numpy as np
from scipy.linalg import cholesky



[docs]
def inv_chol(x, logdet=False):
    """
    Calculate matrix inverse using Cholesky decomposition.

    Optionally, calculate the log determinant of the Cholesky.

    Parameters
    ----------
    x : array-like
        The matrix to invert.
    logdet : bool, optional
        Whether to calculate the log determinant, instead of the inverse.
        Defaults to ``False``.

    Returns
    -------
    chol_inv : array-like
        The inverted matrix.
    chol_logdet : array-like or None
        The log determinant, if ``logdet`` was ``True``, otherwise, ``None``.

    """
    chol = cholesky(x, lower=True)

    chol_inv = np.linalg.inv(chol)
    chol_inv = np.dot(chol_inv.T, chol_inv)
    chol_logdet = None

    if logdet:
        chol_diag = np.diag(chol)
        chol_logdet = np.sum(np.log(chol_diag * chol_diag))

    return chol_inv, chol_logdet




[docs]
def cov(x, ddof=0):
    """
    Calculate the covariance matrix.

    Parameters
    ----------
    x : array-like
        A 1-D or 2-D array containing multiple variables
        and observations. Each column of x represents a variable,
        and each row a single observation of all those variables.
    ddof : int, optional
        Means Delta Degrees of Freedom. The divisor used in calculations
        is N - ddof, where N represents the number of elements.
        Defaults to 0.

    Returns
    -------
    r : numpy array
        The covariance matrix of the variables.
    """
    r = np.cov(x, rowvar=False, ddof=ddof)
    return r




[docs]
def corr(x):
    """
    Calculate the correlation matrix.

    Parameters
    ----------
    x : array-like
        A 1-D or 2-D array containing multiple variables
        and observations. Each column of x represents a variable,
        and each row a single observation of all those variables.

    Returns
    -------
    r : numpy array
        The correlation matrix of the variables.
    """
    x = (x - np.mean(x, axis=0)) / np.std(x, axis=0, ddof=0)
    r = cov(x)
    return r




[docs]
def apply_impute_nan(x, how="mean"):
    """
    Apply a function to impute ``np.nan`` values with the mean or the median.

    Parameters
    ----------
    x : array-like
        The 1-D array to impute.
    how : str, optional
        Whether to impute the 'mean' or 'median'.
        Defaults to 'mean'.

    Returns
    -------
    x : :obj:`numpy.ndarray`
        The array, with the missing values imputed.
    """
    if how == "mean":
        x[np.isnan(x)] = np.nanmean(x)
    elif how == "median":
        x[np.isnan(x)] = np.nanmedian(x)
    return x




[docs]
def impute_values(x, how="mean"):
    """
    Impute ``np.nan`` values with the mean or median, or drop the containing rows.

    Parameters
    ----------
    x : array-like
        An array to impute.
    how : str, optional
        Whether to impute the 'mean' or 'median'.
        Defaults to 'mean'.

    Returns
    -------
    x : :obj:`numpy.ndarray`
        The array, with the missing values imputed or with rows dropped.
    """
    # impute mean or median, if `how` is set to 'mean' or 'median'
    if how in ["mean", "median"]:
        x = np.apply_along_axis(apply_impute_nan, 0, x, how=how)
    # drop missing if `how` is set to 'drop'
    elif how == "drop":
        x = x[~np.isnan(x).any(1), :].copy()
    return x




[docs]
def smc(corr_mtx, sort=False):
    """
    Calculate the squared multiple correlations.

    This is equivalent to regressing each variable on all others and
    calculating the r-squared values.

    Parameters
    ----------
    corr_mtx : array-like
        The correlation matrix used to calculate SMC.
    sort : bool, optional
        Whether to sort the values for SMC before returning.
        Defaults to ``False``.

    Returns
    -------
    smc : :obj:`numpy.ndarray`
        The squared multiple correlations matrix.
    """
    corr_inv = np.linalg.inv(corr_mtx)
    smc = 1 - 1 / np.diag(corr_inv)

    if sort:
        smc = np.sort(smc)
    return smc




[docs]
def covariance_to_correlation(m):
    """
    Compute cross-correlations from the given covariance matrix.

    This is a port of R ``cov2cor()`` function.

    Parameters
    ----------
    m : array-like
        The covariance matrix.

    Returns
    -------
    retval : :obj:`numpy.ndarray`
        The cross-correlation matrix.

    Raises
    ------
    ValueError
        If the input matrix is not square.
    """
    # make sure the matrix is square
    numrows, numcols = m.shape
    if not numrows == numcols:
        raise ValueError("Input matrix must be square")

    Is = np.sqrt(1 / np.diag(m))
    retval = Is * m * np.repeat(Is, numrows).reshape(numrows, numrows)
    np.fill_diagonal(retval, 1.0)
    return retval




[docs]
def partial_correlations(x):
    """
    Compute partial correlations between variable pairs.

    This is a python port of the ``pcor()`` function implemented in
    the ``ppcor`` R package, which computes partial correlations
    for each pair of variables in the given array, excluding all
    other variables.

    Parameters
    ----------
    x : array-like
        An array containing the feature values.

    Returns
    -------
    pcor : :obj:`numpy.ndarray`
        An array containing the partial correlations of of each
        pair of variables in the given array, excluding all other
        variables.
    """
    numrows, numcols = x.shape
    x_cov = cov(x, ddof=1)
    # create empty array for when we cannot compute the
    # matrix inversion
    empty_array = np.empty((numcols, numcols))
    empty_array[:] = np.nan
    if numcols > numrows:
        icvx = empty_array
    else:
        # if the determinant is less than the lowest representable
        # 32 bit integer, then we use the pseudo-inverse;
        # otherwise, use the inverse; if a linear algebra error
        # occurs, then we just set the matrix to empty
        try:
            assert np.linalg.det(x_cov) > np.finfo(np.float32).eps
            icvx = np.linalg.inv(x_cov)
        except AssertionError:
            icvx = np.linalg.pinv(x_cov)
            warnings.warn(
                "The inverse of the variance-covariance matrix "
                "was calculated using the Moore-Penrose generalized "
                "matrix inversion, due to its determinant being at "
                "or very close to zero."
            )
        except np.linalg.LinAlgError:
            icvx = empty_array

    pcor = -1 * covariance_to_correlation(icvx)
    np.fill_diagonal(pcor, 1.0)
    return pcor




[docs]
def unique_elements(seq):
    """
    Get first unique instance of every list element, while maintaining order.

    Parameters
    ----------
    seq : list-like
        The list of elements.

    Returns
    -------
    seq : list
        The updated list of elements.
    """
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]




[docs]
def fill_lower_diag(x):
    """
    Fill the lower diagonal of a square matrix, given a 1-D input array.

    Parameters
    ----------
    x : array-like
        The flattened input matrix that will be used to fill the lower
        diagonal of the square matrix.

    Returns
    -------
    out : :obj:`numpy.ndarray`
        The output square matrix, with the lower diagonal filled by x.

    References
    ----------
    [1] https://stackoverflow.com/questions/51439271/
        convert-1d-array-to-lower-triangular-matrix
    """
    x = np.array(x)
    x = x if len(x.shape) == 1 else np.squeeze(x, axis=1)
    n = int(np.sqrt(len(x) * 2)) + 1
    out = np.zeros((n, n), dtype=float)
    out[np.tri(n, dtype=bool, k=-1)] = x
    return out




[docs]
def merge_variance_covariance(variances, covariances=None):
    """
    Merge variances and covariances into a single variance-covariance matrix.

    Parameters
    ----------
    variances : array-like
        The variances that will be used to fill the diagonal of the
        square matrix.
    covariances : array-like or None, optional
        The flattened input matrix that will be used to fill the lower and
        upper diagonal of the square matrix. If None, then only the variances
        will be used.
        Defaults to ``None``.

    Returns
    -------
    variance_covariance : :obj:`numpy.ndarray`
        The variance-covariance matrix.
    """
    variances = (
        variances if len(variances.shape) == 1 else np.squeeze(variances, axis=1)
    )
    if covariances is None:
        variance_covariance = np.zeros((variances.shape[0], variances.shape[0]))
    else:
        variance_covariance = fill_lower_diag(covariances)
        variance_covariance += variance_covariance.T
    np.fill_diagonal(variance_covariance, variances)
    return variance_covariance




[docs]
def get_first_idxs_from_values(x, eq=1, use_columns=True):
    """
    Get the indexes  for a given value.

    Parameters
    ----------
    x : array-like
        The input matrix.
    eq : str or int, optional
        The given value to find.
        Defaults to 1.
    use_columns : bool, optional
        Whether to get the first indexes using the columns.
        If ``False``, then use the rows instead.
        Defaults to ``True``.

    Returns
    -------
    row_idx : list
        A list of row indexes.
    col_idx : list
        A list of column indexes.
    """
    x = np.array(x)
    if use_columns:
        n = x.shape[1]
        row_idx = [np.where(x[:, i] == eq)[0][0] for i in range(n)]
        col_idx = list(range(n))
    else:
        n = x.shape[0]
        col_idx = [np.where(x[i, :] == eq)[0][0] for i in range(n)]
        row_idx = list(range(n))
    return row_idx, col_idx




[docs]
def get_free_parameter_idxs(x, eq=1):
    """
    Get the free parameter indices from the flattened matrix.

    Parameters
    ----------
    x : array-like
        The input matrix.
    eq : str or int, optional
        The value that free parameters should be equal to. ``np.nan`` fields
        will be populated with this value.
        Defaults to 1.

    Returns
    -------
    idx : :obj:`numpy.ndarray`
        The free parameter indexes.
    """
    x[np.isnan(x)] = eq
    x = x.flatten(order="F")
    return np.where(x == eq)[0]




[docs]
def duplication_matrix(n=1):
    """
    Calculate the duplication matrix.

    A function to create the duplication matrix (Dn), which is
    the unique n2 × n(n+1)/2 matrix which, for any n × n symmetric
    matrix A, transforms vech(A) into vec(A), as in Dn vech(A) = vec(A).

    Parameters
    ----------
    n : int, optional
        The dimension of the n x n symmetric matrix.
        Defaults to 1.

    Returns
    -------
    duplication_matrix : :obj:`numpy.ndarray`
        The duplication matrix.

    Raises`
    ------
    ValueError
        If ``n`` is not a positive integer greater than 1.

    References
    ----------
    https://en.wikipedia.org/wiki/Duplication_and_elimination_matrices
    """
    if n < 1:
        raise ValueError(
            "The argument `n` must be a " "positive integer greater than 1."
        )

    dup = np.zeros((int(n * n), int(n * (n + 1) / 2)))
    count = 0
    for j in range(n):
        dup[j * n + j, count + j] = 1
        if j < n - 1:
            for i in range(j + 1, n):
                dup[j * n + i, count + i] = 1
                dup[i * n + j, count + i] = 1
        count += n - j - 1
    return dup




[docs]
def duplication_matrix_pre_post(x):
    """
    Transform given input symmetric matrix using pre-post duplication.

    Parameters
    ----------
    x : array-like
        The input matrix.

    Returns
    -------
    out : :obj:`numpy.ndarray`
        The transformed matrix.

    Raises
    ------
    AssertionError
        If ``x`` is not symmetric.
    """
    assert x.shape[0] == x.shape[1]

    n2 = x.shape[1]
    n = int(np.sqrt(n2))

    idx1 = get_symmetric_lower_idxs(n)
    idx2 = get_symmetric_upper_idxs(n)

    out = x[idx1, :] + x[idx2, :]
    u = np.where([i in idx2 for i in idx1])[0]
    out[u, :] = out[u, :] / 2.0
    out = out[:, idx1] + out[:, idx2]
    out[:, u] = out[:, u] / 2.0
    return out




[docs]
def commutation_matrix(p, q):
    """
    Calculate the commutation matrix.

    This matrix transforms the vectorized form of the matrix into the
    vectorized form of its transpose.

    Parameters
    ----------
    p : int
        The number of rows.
    q : int
        The number of columns.

    Returns
    -------
    commutation_matrix : :obj:`numpy.ndarray`
        The commutation matrix

    References
    ----------
    https://en.wikipedia.org/wiki/Commutation_matrix
    """
    identity = np.eye(p * q)
    indices = np.arange(p * q).reshape((p, q), order="F")
    return identity.take(indices.ravel(), axis=0)




[docs]
def get_symmetric_lower_idxs(n=1, diag=True):
    """
    Get the indices for the lower triangle of a symmetric matrix.

    Parameters
    ----------
    n : int, optional
        The dimension of the n x n symmetric matrix.
        Defaults to 1.
    diag : bool, optional
        Whether to include the diagonal.

    Returns
    -------
    indices : :obj:`numpy.ndarray`
        The indices for the lower triangle.
    """
    rows = np.repeat(np.arange(n), n).reshape(n, n)
    cols = rows.T
    if diag:
        return np.where((rows >= cols).T.flatten())[0]
    return np.where((cols > rows).T.flatten())[0]




[docs]
def get_symmetric_upper_idxs(n=1, diag=True):
    """
    Get the indices for the upper triangle of a symmetric matrix.

    Parameters
    ----------
    n : int, optional
        The dimension of the n x n symmetric matrix.
        Defaults to 1.
    diag : bool, optional
        Whether to include the diagonal.

    Returns
    -------
    indices : :obj:`numpy.ndarray`
        The indices for the upper triangle.
    """
    rows = np.repeat(np.arange(n), n).reshape(n, n)
    cols = rows.T
    temp = np.arange(n * n).reshape(n, n)
    if diag:
        return temp.T[(rows >= cols).T]
    return temp.T[(cols > rows).T]