"""
Utility functions, used primarily by the confirmatory factor analysis module.
:author: Jeremy Biggs (jeremy.m.biggs@gmail.com)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: Educational Testing Service
:date: 2022-09-05
"""
import warnings
import numpy as np
from scipy.linalg import cholesky
[docs]
def inv_chol(x, logdet=False):
"""
Calculate matrix inverse using Cholesky decomposition.
Optionally, calculate the log determinant of the Cholesky.
Parameters
----------
x : array-like
The matrix to invert.
logdet : bool, optional
Whether to calculate the log determinant, instead of the inverse.
Defaults to ``False``.
Returns
-------
chol_inv : array-like
The inverted matrix.
chol_logdet : array-like or None
The log determinant, if ``logdet`` was ``True``, otherwise, ``None``.
"""
chol = cholesky(x, lower=True)
chol_inv = np.linalg.inv(chol)
chol_inv = np.dot(chol_inv.T, chol_inv)
chol_logdet = None
if logdet:
chol_diag = np.diag(chol)
chol_logdet = np.sum(np.log(chol_diag * chol_diag))
return chol_inv, chol_logdet
[docs]
def cov(x, ddof=0):
"""
Calculate the covariance matrix.
Parameters
----------
x : array-like
A 1-D or 2-D array containing multiple variables
and observations. Each column of x represents a variable,
and each row a single observation of all those variables.
ddof : int, optional
Means Delta Degrees of Freedom. The divisor used in calculations
is N - ddof, where N represents the number of elements.
Defaults to 0.
Returns
-------
r : numpy array
The covariance matrix of the variables.
"""
r = np.cov(x, rowvar=False, ddof=ddof)
return r
[docs]
def corr(x):
"""
Calculate the correlation matrix.
Parameters
----------
x : array-like
A 1-D or 2-D array containing multiple variables
and observations. Each column of x represents a variable,
and each row a single observation of all those variables.
Returns
-------
r : numpy array
The correlation matrix of the variables.
"""
x = (x - np.mean(x, axis=0)) / np.std(x, axis=0, ddof=0)
r = cov(x)
return r
[docs]
def apply_impute_nan(x, how="mean"):
"""
Apply a function to impute ``np.nan`` values with the mean or the median.
Parameters
----------
x : array-like
The 1-D array to impute.
how : str, optional
Whether to impute the 'mean' or 'median'.
Defaults to 'mean'.
Returns
-------
x : :obj:`numpy.ndarray`
The array, with the missing values imputed.
"""
if how == "mean":
x[np.isnan(x)] = np.nanmean(x)
elif how == "median":
x[np.isnan(x)] = np.nanmedian(x)
return x
[docs]
def impute_values(x, how="mean"):
"""
Impute ``np.nan`` values with the mean or median, or drop the containing rows.
Parameters
----------
x : array-like
An array to impute.
how : str, optional
Whether to impute the 'mean' or 'median'.
Defaults to 'mean'.
Returns
-------
x : :obj:`numpy.ndarray`
The array, with the missing values imputed or with rows dropped.
"""
# impute mean or median, if `how` is set to 'mean' or 'median'
if how in ["mean", "median"]:
x = np.apply_along_axis(apply_impute_nan, 0, x, how=how)
# drop missing if `how` is set to 'drop'
elif how == "drop":
x = x[~np.isnan(x).any(1), :].copy()
return x
[docs]
def smc(corr_mtx, sort=False):
"""
Calculate the squared multiple correlations.
This is equivalent to regressing each variable on all others and
calculating the r-squared values.
Parameters
----------
corr_mtx : array-like
The correlation matrix used to calculate SMC.
sort : bool, optional
Whether to sort the values for SMC before returning.
Defaults to ``False``.
Returns
-------
smc : :obj:`numpy.ndarray`
The squared multiple correlations matrix.
"""
corr_inv = np.linalg.inv(corr_mtx)
smc = 1 - 1 / np.diag(corr_inv)
if sort:
smc = np.sort(smc)
return smc
[docs]
def covariance_to_correlation(m):
"""
Compute cross-correlations from the given covariance matrix.
This is a port of R ``cov2cor()`` function.
Parameters
----------
m : array-like
The covariance matrix.
Returns
-------
retval : :obj:`numpy.ndarray`
The cross-correlation matrix.
Raises
------
ValueError
If the input matrix is not square.
"""
# make sure the matrix is square
numrows, numcols = m.shape
if not numrows == numcols:
raise ValueError("Input matrix must be square")
Is = np.sqrt(1 / np.diag(m))
retval = Is * m * np.repeat(Is, numrows).reshape(numrows, numrows)
np.fill_diagonal(retval, 1.0)
return retval
[docs]
def partial_correlations(x):
"""
Compute partial correlations between variable pairs.
This is a python port of the ``pcor()`` function implemented in
the ``ppcor`` R package, which computes partial correlations
for each pair of variables in the given array, excluding all
other variables.
Parameters
----------
x : array-like
An array containing the feature values.
Returns
-------
pcor : :obj:`numpy.ndarray`
An array containing the partial correlations of of each
pair of variables in the given array, excluding all other
variables.
"""
numrows, numcols = x.shape
x_cov = cov(x, ddof=1)
# create empty array for when we cannot compute the
# matrix inversion
empty_array = np.empty((numcols, numcols))
empty_array[:] = np.nan
if numcols > numrows:
icvx = empty_array
else:
# if the determinant is less than the lowest representable
# 32 bit integer, then we use the pseudo-inverse;
# otherwise, use the inverse; if a linear algebra error
# occurs, then we just set the matrix to empty
try:
assert np.linalg.det(x_cov) > np.finfo(np.float32).eps
icvx = np.linalg.inv(x_cov)
except AssertionError:
icvx = np.linalg.pinv(x_cov)
warnings.warn(
"The inverse of the variance-covariance matrix "
"was calculated using the Moore-Penrose generalized "
"matrix inversion, due to its determinant being at "
"or very close to zero."
)
except np.linalg.LinAlgError:
icvx = empty_array
pcor = -1 * covariance_to_correlation(icvx)
np.fill_diagonal(pcor, 1.0)
return pcor
[docs]
def unique_elements(seq):
"""
Get first unique instance of every list element, while maintaining order.
Parameters
----------
seq : list-like
The list of elements.
Returns
-------
seq : list
The updated list of elements.
"""
seen = set()
return [x for x in seq if not (x in seen or seen.add(x))]
[docs]
def fill_lower_diag(x):
"""
Fill the lower diagonal of a square matrix, given a 1-D input array.
Parameters
----------
x : array-like
The flattened input matrix that will be used to fill the lower
diagonal of the square matrix.
Returns
-------
out : :obj:`numpy.ndarray`
The output square matrix, with the lower diagonal filled by x.
References
----------
[1] https://stackoverflow.com/questions/51439271/
convert-1d-array-to-lower-triangular-matrix
"""
x = np.array(x)
x = x if len(x.shape) == 1 else np.squeeze(x, axis=1)
n = int(np.sqrt(len(x) * 2)) + 1
out = np.zeros((n, n), dtype=float)
out[np.tri(n, dtype=bool, k=-1)] = x
return out
[docs]
def merge_variance_covariance(variances, covariances=None):
"""
Merge variances and covariances into a single variance-covariance matrix.
Parameters
----------
variances : array-like
The variances that will be used to fill the diagonal of the
square matrix.
covariances : array-like or None, optional
The flattened input matrix that will be used to fill the lower and
upper diagonal of the square matrix. If None, then only the variances
will be used.
Defaults to ``None``.
Returns
-------
variance_covariance : :obj:`numpy.ndarray`
The variance-covariance matrix.
"""
variances = (
variances if len(variances.shape) == 1 else np.squeeze(variances, axis=1)
)
if covariances is None:
variance_covariance = np.zeros((variances.shape[0], variances.shape[0]))
else:
variance_covariance = fill_lower_diag(covariances)
variance_covariance += variance_covariance.T
np.fill_diagonal(variance_covariance, variances)
return variance_covariance
[docs]
def get_first_idxs_from_values(x, eq=1, use_columns=True):
"""
Get the indexes for a given value.
Parameters
----------
x : array-like
The input matrix.
eq : str or int, optional
The given value to find.
Defaults to 1.
use_columns : bool, optional
Whether to get the first indexes using the columns.
If ``False``, then use the rows instead.
Defaults to ``True``.
Returns
-------
row_idx : list
A list of row indexes.
col_idx : list
A list of column indexes.
"""
x = np.array(x)
if use_columns:
n = x.shape[1]
row_idx = [np.where(x[:, i] == eq)[0][0] for i in range(n)]
col_idx = list(range(n))
else:
n = x.shape[0]
col_idx = [np.where(x[i, :] == eq)[0][0] for i in range(n)]
row_idx = list(range(n))
return row_idx, col_idx
[docs]
def get_free_parameter_idxs(x, eq=1):
"""
Get the free parameter indices from the flattened matrix.
Parameters
----------
x : array-like
The input matrix.
eq : str or int, optional
The value that free parameters should be equal to. ``np.nan`` fields
will be populated with this value.
Defaults to 1.
Returns
-------
idx : :obj:`numpy.ndarray`
The free parameter indexes.
"""
x[np.isnan(x)] = eq
x = x.flatten(order="F")
return np.where(x == eq)[0]
[docs]
def duplication_matrix(n=1):
"""
Calculate the duplication matrix.
A function to create the duplication matrix (Dn), which is
the unique n2 × n(n+1)/2 matrix which, for any n × n symmetric
matrix A, transforms vech(A) into vec(A), as in Dn vech(A) = vec(A).
Parameters
----------
n : int, optional
The dimension of the n x n symmetric matrix.
Defaults to 1.
Returns
-------
duplication_matrix : :obj:`numpy.ndarray`
The duplication matrix.
Raises`
------
ValueError
If ``n`` is not a positive integer greater than 1.
References
----------
https://en.wikipedia.org/wiki/Duplication_and_elimination_matrices
"""
if n < 1:
raise ValueError(
"The argument `n` must be a " "positive integer greater than 1."
)
dup = np.zeros((int(n * n), int(n * (n + 1) / 2)))
count = 0
for j in range(n):
dup[j * n + j, count + j] = 1
if j < n - 1:
for i in range(j + 1, n):
dup[j * n + i, count + i] = 1
dup[i * n + j, count + i] = 1
count += n - j - 1
return dup
[docs]
def duplication_matrix_pre_post(x):
"""
Transform given input symmetric matrix using pre-post duplication.
Parameters
----------
x : array-like
The input matrix.
Returns
-------
out : :obj:`numpy.ndarray`
The transformed matrix.
Raises
------
AssertionError
If ``x`` is not symmetric.
"""
assert x.shape[0] == x.shape[1]
n2 = x.shape[1]
n = int(np.sqrt(n2))
idx1 = get_symmetric_lower_idxs(n)
idx2 = get_symmetric_upper_idxs(n)
out = x[idx1, :] + x[idx2, :]
u = np.where([i in idx2 for i in idx1])[0]
out[u, :] = out[u, :] / 2.0
out = out[:, idx1] + out[:, idx2]
out[:, u] = out[:, u] / 2.0
return out
[docs]
def commutation_matrix(p, q):
"""
Calculate the commutation matrix.
This matrix transforms the vectorized form of the matrix into the
vectorized form of its transpose.
Parameters
----------
p : int
The number of rows.
q : int
The number of columns.
Returns
-------
commutation_matrix : :obj:`numpy.ndarray`
The commutation matrix
References
----------
https://en.wikipedia.org/wiki/Commutation_matrix
"""
identity = np.eye(p * q)
indices = np.arange(p * q).reshape((p, q), order="F")
return identity.take(indices.ravel(), axis=0)
[docs]
def get_symmetric_lower_idxs(n=1, diag=True):
"""
Get the indices for the lower triangle of a symmetric matrix.
Parameters
----------
n : int, optional
The dimension of the n x n symmetric matrix.
Defaults to 1.
diag : bool, optional
Whether to include the diagonal.
Returns
-------
indices : :obj:`numpy.ndarray`
The indices for the lower triangle.
"""
rows = np.repeat(np.arange(n), n).reshape(n, n)
cols = rows.T
if diag:
return np.where((rows >= cols).T.flatten())[0]
return np.where((cols > rows).T.flatten())[0]
[docs]
def get_symmetric_upper_idxs(n=1, diag=True):
"""
Get the indices for the upper triangle of a symmetric matrix.
Parameters
----------
n : int, optional
The dimension of the n x n symmetric matrix.
Defaults to 1.
diag : bool, optional
Whether to include the diagonal.
Returns
-------
indices : :obj:`numpy.ndarray`
The indices for the upper triangle.
"""
rows = np.repeat(np.arange(n), n).reshape(n, n)
cols = rows.T
temp = np.arange(n * n).reshape(n, n)
if diag:
return temp.T[(rows >= cols).T]
return temp.T[(cols > rows).T]