"""
Confirmatory factor analysis using machine learning methods.
:author: Jeremy Biggs (jeremy.m.biggs@gmail.com)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: Educational Testing Service
:date: 2022-09-05
"""
import warnings
from copy import deepcopy
import numpy as np
import pandas as pd
from scipy.linalg import block_diag
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from .utils import (
commutation_matrix,
cov,
covariance_to_correlation,
duplication_matrix_pre_post,
get_free_parameter_idxs,
get_symmetric_lower_idxs,
get_symmetric_upper_idxs,
impute_values,
merge_variance_covariance,
unique_elements,
)
[docs]
class ModelSpecification:
"""
Encapsulate the model specification for CFA.
This class contains a number of specification properties
that are used in the CFA procedure.
Parameters
----------
loadings : array-like
The factor loadings specification.
n_factors : int
The number of factors.
n_variables : int
The number of variables.
factor_names : list of str or None
A list of factor names, if available.
Defaults to ``None``.
variable_names : list of str or None
A list of variable names, if available.
Defaults to ``None``.
"""
def __init__(
self, loadings, n_factors, n_variables, factor_names=None, variable_names=None
):
"""Initialize the specification."""
assert isinstance(loadings, np.ndarray)
assert loadings.shape[0] == n_variables
assert loadings.shape[1] == n_factors
self._loadings = loadings
self._n_factors = n_factors
self._n_variables = n_variables
self._factor_names = factor_names
self._variable_names = variable_names
self._n_lower_diag = get_symmetric_lower_idxs(n_factors, False).shape[0]
self._error_vars = np.full((n_variables, 1), np.nan)
self._factor_covs = np.full((n_factors, n_factors), np.nan)
self._loadings_free = get_free_parameter_idxs(loadings, eq=1)
self._error_vars_free = merge_variance_covariance(self._error_vars)
self._error_vars_free = get_free_parameter_idxs(self._error_vars_free, eq=-1)
self._factor_covs_free = get_symmetric_lower_idxs(n_factors, False)
def __str__(self):
"""Represent model specification as a string."""
return f"<ModelSpecification object at {hex(id(self))}>"
[docs]
def copy(self):
"""Return a copy of the model specification."""
return deepcopy(self)
@property
def loadings(self):
"""Get the factor loadings specification."""
return self._loadings.copy()
@property
def error_vars(self):
"""Get the error variance specification."""
return self._error_vars.copy()
@property
def factor_covs(self):
"""Get the factor covariance specification."""
return self._factor_covs.copy()
@property
def loadings_free(self):
"""Get the indices of "free" factor loading parameters."""
return self._loadings_free.copy()
@property
def error_vars_free(self):
"""Get the indices of "free" error variance parameters."""
return self._error_vars_free.copy()
@property
def factor_covs_free(self):
"""Get the indices of "free" factor covariance parameters."""
return self._factor_covs_free.copy()
@property
def n_variables(self):
"""Get the number of variables."""
return self._n_variables
@property
def n_factors(self):
"""Get the number of factors."""
return self._n_factors
@property
def n_lower_diag(self):
"""Get the lower diagonal of the factor covariance matrix."""
return self._n_lower_diag
@property
def factor_names(self):
"""Get list of factor names, if available."""
return self._factor_names
@property
def variable_names(self):
"""Get list of variable names, if available."""
return self._variable_names
[docs]
def get_model_specification_as_dict(self):
"""
Get the model specification as a dictionary.
Returns
-------
model_specification : dict
The model specification keys and values,
as a dictionary.
"""
return {
"loadings": self._loadings.copy(),
"error_vars": self._error_vars.copy(),
"factor_covs": self._factor_covs.copy(),
"loadings_free": self._loadings_free.copy(),
"error_vars_free": self._error_vars_free.copy(),
"factor_covs_free": self._factor_covs_free.copy(),
"n_variables": self._n_variables,
"n_factors": self._n_factors,
"n_lower_diag": self._n_lower_diag,
"variable_names": self._variable_names,
"factor_names": self._factor_names,
}
[docs]
class ModelSpecificationParser:
"""
Generate the model specification for CFA.
This class includes two static methods to generate a
:class:`ModelSpecification` object from either a dictionary
or a numpy array.
"""
[docs]
@staticmethod
def parse_model_specification_from_dict(X, specification=None):
"""
Generate the model specification from a dictionary.
The keys in the dictionary should be the factor names, and the
values should be the feature names. If this method is used to
create the :class:`ModelSpecification` object, then factor names
and variable names will be added as properties to that object.
Parameters
----------
X : array-like
The data set that will be used for CFA.
specification : dict or None
A dictionary with the loading details. If ``None``, the matrix will
be created assuming all variables load on all factors.
Defaults to ``None``.
Returns
-------
ModelSpecification
A model specification object.
Raises
------
ValueError
If ``specification`` is not in the expected format.
Examples
--------
>>> import pandas as pd
>>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
... ModelSpecificationParser)
>>> X = pd.read_csv('tests/data/test11.csv')
>>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
... "F2": ["V5", "V6", "V7", "V8"]}
>>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
"""
if specification is None:
factor_names, variable_names = None, None
n_variables, n_factors = X.shape[1], X.shape[1]
loadings = np.ones((n_factors, n_factors), dtype=int)
elif isinstance(specification, dict):
factor_names = list(specification)
variable_names = unique_elements(
[v for f in specification.values() for v in f]
)
loadings_new = {}
for factor in factor_names:
loadings_for_factor = pd.Series(variable_names).isin(
specification[factor]
)
loadings_for_factor = loadings_for_factor.astype(int)
loadings_new[factor] = loadings_for_factor
loadings = pd.DataFrame(loadings_new).values
n_variables, n_factors = loadings.shape
else:
raise ValueError(
"The model `specification` must be either a dict "
"or None, not {}".format(type(specification))
)
return ModelSpecification(
**{
"loadings": loadings,
"n_variables": n_variables,
"n_factors": n_factors,
"factor_names": factor_names,
"variable_names": variable_names,
}
)
[docs]
@staticmethod
def parse_model_specification_from_array(X, specification=None):
"""
Generate the model specification from a numpy array.
The columns should correspond to the factors, and the rows
should correspond to the variables. If this method is used
to create the :class:`ModelSpecification` object, then *no* factor
names and variable names will be added as properties to that
object.
Parameters
----------
X : array-like
The data set that will be used for CFA.
specification : array-like or None
An array with the loading details. If ``None``, the matrix will
be created assuming all variables load on all factors.
Defaults to ``None``.
Returns
-------
ModelSpecification
A model specification object.
Raises
------
ValueError
If ``specification`` is not in the expected format.
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
... ModelSpecificationParser)
>>> X = pd.read_csv('tests/data/test11.csv')
>>> model_array = np.array([[1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 1, 1]])
>>> model_spec = ModelSpecificationParser.parse_model_specification_from_array(X,
... model_array)
"""
if specification is None:
n_variables, n_factors = X.shape[1], X.shape[1]
loadings = np.ones((n_factors, n_factors), dtype=int)
elif isinstance(specification, (np.ndarray, pd.DataFrame)):
n_variables, n_factors = specification.shape
if isinstance(specification, pd.DataFrame):
loadings = specification.values.copy()
else:
loadings = specification.copy()
else:
raise ValueError(
"The model `specification` must be either a numpy array "
"or None, not {}".format(type(specification))
)
return ModelSpecification(
**{"loadings": loadings, "n_variables": n_variables, "n_factors": n_factors}
)
[docs]
class ConfirmatoryFactorAnalyzer(BaseEstimator, TransformerMixin):
"""
Fit a confirmatory factor analysis model using maximum likelihood.
Parameters
----------
specification : :class:`ModelSpecification` or None, optional
A model specification. This must be a :class:`ModelSpecification` object
or ``None``. If ``None``, a :class:`ModelSpecification` object will be
generated assuming that ``n_factors`` == ``n_variables``, and that
all variables load on all factors. Note that this could mean the
factor model is not identified, and the optimization could fail.
Defaults to `None`.
n_obs : int or None, optional
The number of observations in the original data set.
If this is not passed and ``is_cov_matrix`` is ``True``,
then an error will be raised.
Defaults to ``None``.
is_cov_matrix : bool, optional
Whether the input ``X`` is a covariance matrix. If ``False``,
assume it is the full data set.
Defaults to ``False``.
bounds : list of tuples or None, optional
A list of minimum and maximum boundaries for each element
of the input array. This must equal ``x0``, which is the
input array from your parsed and combined model specification.
The length is:
((n_factors * n_variables) + n_variables + n_factors +
(((n_factors * n_factors) - n_factors) // 2)
If `None`, nothing will be bounded.
Defaults to ``None``.
max_iter : int, optional
The maximum number of iterations for the optimization routine.
Defaults to 200.
tol : float or None, optional
The tolerance for convergence.
Defaults to ``None``.
disp : bool, optional
Whether to print the scipy optimization ``fmin`` message to
standard output.
Defaults to ``True``.
Raises
------
ValueError
If `is_cov_matrix` is `True`, and `n_obs` is not provided.
Attributes
----------
model : ModelSpecification
The model specification object.
loadings_ : :obj:`numpy.ndarray`
The factor loadings matrix.
``None``, if ``fit()``` has not been called.
error_vars_ : :obj:`numpy.ndarray`
The error variance matrix
factor_varcovs_ : :obj:`numpy.ndarray`
The factor covariance matrix.
log_likelihood_ : float
The log likelihood from the optimization routine.
aic_ : float
The Akaike information criterion.
bic_ : float
The Bayesian information criterion.
Examples
--------
>>> import pandas as pd
>>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
... ModelSpecificationParser)
>>> X = pd.read_csv('tests/data/test11.csv')
>>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
... "F2": ["V5", "V6", "V7", "V8"]}
>>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
>>> cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
>>> cfa.fit(X.values)
>>> cfa.loadings_
array([[0.99131285, 0. ],
[0.46074919, 0. ],
[0.3502267 , 0. ],
[0.58331488, 0. ],
[0. , 0.98621042],
[0. , 0.73389239],
[0. , 0.37602988],
[0. , 0.50049507]])
>>> cfa.factor_varcovs_
array([[1. , 0.17385704],
[0.17385704, 1. ]])
>>> cfa.get_standard_errors()
(array([[0.06779949, 0. ],
[0.04369956, 0. ],
[0.04153113, 0. ],
[0.04766645, 0. ],
[0. , 0.06025341],
[0. , 0.04913149],
[0. , 0.0406604 ],
[0. , 0.04351208]]),
array([0.11929873, 0.05043616, 0.04645803, 0.05803088,
0.10176889, 0.06607524, 0.04742321, 0.05373646]))
>>> cfa.transform(X.values)
array([[-0.46852166, -1.08708035],
[ 2.59025301, 1.20227783],
[-0.47215977, 2.65697245],
...,
[-1.5930886 , -0.91804114],
[ 0.19430887, 0.88174818],
[-0.27863554, -0.7695101 ]])
"""
def __init__(
self,
specification=None,
n_obs=None,
is_cov_matrix=False,
bounds=None,
max_iter=200,
tol=None,
impute="median",
disp=True,
):
"""Initialize the analyzer object."""
# if the input is going to be a covariance matrix, rather than
# the full data set, then users must pass the number of observations
if is_cov_matrix and n_obs is None:
raise ValueError(
"If `is_cov_matrix=True`, you must provide "
"the number of observations, `n_obs`."
)
self.specification = specification
self.n_obs = n_obs
self.is_cov_matrix = is_cov_matrix
self.bounds = bounds
self.max_iter = max_iter
self.tol = tol
self.impute = impute
self.disp = disp
self.cov_ = None
self.mean_ = None
self.loadings_ = None
self.error_vars_ = None
self.factor_varcovs_ = None
self.log_likelihood_ = None
self.aic_ = None
self.bic_ = None
self._n_factors = None
self._n_variables = None
self._n_lower_diag = None
@staticmethod
def _combine(
loadings,
error_vars,
factor_vars,
factor_covs,
n_factors,
n_variables,
n_lower_diag,
):
"""
Combine given multi-dimensional matrics into a one-dimensional matrix.
Combine a set of multi-dimensional loading, error variance, factor
variance, and factor covariance matrices into a one-dimensional
(X, 1) matrix, where the length of X is the length of all elements
across all of the matrices.
Parameters
----------
loadings : array-like
The loadings matrix (``n_factors`` * ``n_variables``)
error_vars : array-like (``n_variables`` * 1)
The error variance array.
factor_vars : array-like (``n_factors`` * 1)
The factor variance array.
factor_covs : array-like (``n_lower_diag`` * 1)
The factor covariance array.
n_factors : int
The number of factors.
n_variables : int
The number of variables.
n_lower_diag :int
The number of elements in the ``factor_covs`` array, which
is equal to the lower diagonal of the factor covariance
matrix.
Returns
-------
array : :obj:`numpy.ndarray`
The combined (X, 1) array.
"""
loadings = loadings.reshape(n_factors * n_variables, 1, order="F")
error_vars = error_vars.reshape(n_variables, 1, order="F")
factor_vars = factor_vars.reshape(n_factors, 1, order="F")
factor_covs = factor_covs.reshape(n_lower_diag, 1, order="F")
return np.concatenate([loadings, error_vars, factor_vars, factor_covs])
@staticmethod
def _split(x, n_factors, n_variables, n_lower_diag):
"""
Split given one-dimensional array into multi-dimensional arrays.
Given a one-dimensional array, split it into a set of
multi-dimensional loading, error variance, factor variance,
and factor covariance matrices.
Parameters
----------
x : array-like
The combined (X, 1) array to split.
n_factors : int
The number of factors.
n_variables : int
The number of variables.
n_lower_diag : int
The number of elements in the ``factor_covs`` array, which is
equal to the lower diagonal of the factor covariance matrix.
Returns
-------
loadings : array-like
The loadings matrix (``n_factors`` * ``n_variables``)
error_vars : array-like (``n_variables`` * 1)
The error variance array.
factor_vars : array-like (``n_factors`` * 1)
The factor variance array.
factor_covs : array-like (``n_lower_diag`` * 1)
The factor covariance array.
"""
loadings_ix = int(n_factors * n_variables)
error_vars_ix = n_variables + loadings_ix
factor_vars_ix = n_factors + error_vars_ix
factor_covs_ix = n_lower_diag + factor_vars_ix
return (
x[:loadings_ix].reshape((n_variables, n_factors), order="F"),
x[loadings_ix:error_vars_ix].reshape((n_variables, 1), order="F"),
x[error_vars_ix:factor_vars_ix].reshape((n_factors, 1), order="F"),
x[factor_vars_ix:factor_covs_ix].reshape((n_lower_diag, 1), order="F"),
)
def _objective(self, x0, cov_mtx, loadings):
"""
Get the objective function for the analyzer.
Parameters
----------
x0: array-like
The combined (X, 1) array. These are the initial values for
the ``minimize`()` function.
cov_mtx : array-like
The covariance matrix from the original data set.
loadings : array-like
The loadings matrix (``n_factors`` * ``n_variables``)
from the model parser. This tells the objective function what
elements should be fixed.
Returns
-------
error : float
The error from the objective function.
"""
(
loadings_init,
error_vars_init,
factor_vars_init,
factor_covs_init,
) = self._split(
x0, self.model.n_factors, self.model.n_variables, self.model.n_lower_diag
)
# set the loadings to zero where applicable
loadings_init[np.where(loadings == 0)] = 0
# combine factor variances and covariances into a single matrix
factor_varcov_init = merge_variance_covariance(
factor_vars_init, factor_covs_init
)
# make the error variance into a variance-covariance matrix
error_varcov_init = merge_variance_covariance(error_vars_init)
# make the factor variance-covariance matrix into a correlation matrix
with np.errstate(all="ignore"):
factor_varcov_init = covariance_to_correlation(factor_varcov_init)
# calculate sigma-theta, needed for the objective function
sigma_theta = (
loadings_init.dot(factor_varcov_init).dot(loadings_init.T)
+ error_varcov_init
)
with np.errstate(all="ignore"):
error = -(
((-self.n_obs * self.model.n_variables / 2) * np.log(2 * np.pi))
- (self.n_obs / 2)
* (
np.log(np.linalg.det(sigma_theta))
+ np.trace(cov_mtx.dot(np.linalg.inv(sigma_theta)))
)
)
# make sure the error is greater than or
# equal to zero before we return it; we
# do not do this for the Bollen approach
error = 0.0 if error < 0.0 else error
return error
[docs]
def fit(self, X, y=None):
"""
Perform confirmatory factor analysis.
Parameters
----------
X : array-like
The data to use for confirmatory factor analysis. If this is just a
covariance matrix, make sure ``is_cov_matrix`` was set to ``True``.
y : ignored
Raises
------
ValueError
If the specification is not None or a :class:`ModelSpecification` object.
AssertionError
If ``is_cov_matrix`` was ``True`` and the matrix is not square.
AssertionError
If ``len(bounds)`` != ``len(x0)``
Examples
--------
>>> import pandas as pd
>>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
... ModelSpecificationParser)
>>> X = pd.read_csv('tests/data/test11.csv')
>>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
... "F2": ["V5", "V6", "V7", "V8"]}
>>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
>>> cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
>>> cfa.fit(X.values)
>>> cfa.loadings_
array([[0.99131285, 0. ],
[0.46074919, 0. ],
[0.3502267 , 0. ],
[0.58331488, 0. ],
[0. , 0.98621042],
[0. , 0.73389239],
[0. , 0.37602988],
[0. , 0.50049507]])
"""
if self.specification is None:
self.model = ModelSpecificationParser.parse_model_specification_from_array(
X
)
elif isinstance(self.specification, ModelSpecification):
self.model = self.specification.copy()
else:
raise ValueError(
"The `specification` must be None or `ModelSpecification` "
"instance, not {}".format(type(self.specification))
)
if isinstance(X, pd.DataFrame):
X = X.values
# now check the array, and make sure it
# meets all of our expected criteria
X = check_array(X, force_all_finite="allow-nan", estimator=self, copy=True)
# check to see if there are any null values, and if
# so impute using the desired imputation approach
if np.isnan(X).any() and not self.is_cov_matrix:
X = impute_values(X, how=self.impute)
if not self.is_cov_matrix:
# make sure that the columns are in the proper order
# data = data[variable_names].copy()
# get the number of observations from the data, if `n_obs` not passed;
# then, calculate the covariance matrix from the data set
self.n_obs = X.shape[0] if self.n_obs is None else self.n_obs
self.mean_ = np.mean(X, axis=0)
cov_mtx = cov(X)
else:
error_msg = (
"If `is_cov_matrix=True`, then the rows and column in the data "
"set must be equal, and must equal the number of variables "
"in your model."
)
assert X.shape[0] == X.shape[1] == self.model.n_variables, error_msg
cov_mtx = X.copy()
self.cov_ = cov_mtx.copy()
# we initialize all of the arrays, setting the covariances
# lower than the expected variances, and the loadings to 1 or 0
loading_init = self.model.loadings
error_vars_init = np.full((self.model.n_variables, 1), 0.5)
factor_vars_init = np.full((self.model.n_factors, 1), 1.0)
factor_covs_init = np.full((self.model.n_lower_diag, 1), 0.05)
# we merge all of the arrays into a single 1d vector
x0 = self._combine(
loading_init,
error_vars_init,
factor_vars_init,
factor_covs_init,
self.model.n_factors,
self.model.n_variables,
self.model.n_lower_diag,
)
# if the bounds argument is None, then we initialized the
# boundaries to (None, None) for everything except factor covariances;
# at some point in the future, we may update this to place limits
# on the loading matrix boundaries, too, but the case in R and SAS
if self.bounds is not None:
error_msg = (
"The length of `bounds` must equal the length of your "
"input array `x0`: {} != {}.".format(len(self.bounds), len(x0))
)
assert len(self.bounds) == len(x0), error_msg
# fit the actual model using L-BFGS algorithm;
# the constraints are set inside the objective function,
# so that we can avoid using linear programming methods (e.g. SLSQP)
res = minimize(
self._objective,
x0.flatten(),
method="L-BFGS-B",
options={"maxiter": self.max_iter, "disp": self.disp},
bounds=self.bounds,
args=(cov_mtx, self.model.loadings),
)
# if the optimizer failed to converge, print the message
if not res.success:
warnings.warn(
f"The optimization routine failed to converge: {str(res.message)}"
)
# we split all the 1d array back into the set of original arrays
(loadings_res, error_vars_res, factor_vars_res, factor_covs_res) = self._split(
res.x, self.model.n_factors, self.model.n_variables, self.model.n_lower_diag
)
# we combine the factor covariances and variances into
# a single variance-covariance matrix to make things easier,
# but also check to make see if anything was fixed
factor_varcovs_res = merge_variance_covariance(factor_vars_res, factor_covs_res)
with np.errstate(all="ignore"):
factor_varcovs_res = covariance_to_correlation(factor_varcovs_res)
self.loadings_ = loadings_res
self.error_vars_ = error_vars_res
self.factor_varcovs_ = factor_varcovs_res
# we also calculate the log-likelihood, AIC, and BIC
self.log_likelihood_ = -res.fun
self.aic_ = 2 * res.fun + 2 * (x0.shape[0] + self.model.n_variables)
if self.n_obs is not None:
self.bic_ = 2 * res.fun + np.log(self.n_obs) * (
x0.shape[0] + self.model.n_variables
)
return self
[docs]
def get_model_implied_cov(self):
"""
Get the model-implied covariance matrix (sigma) for an estimated model.
Returns
-------
model_implied_cov : :obj:`numpy.ndarray`
The model-implied covariance matrix.
Examples
--------
>>> import pandas as pd
>>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
... ModelSpecificationParser)
>>> X = pd.read_csv('tests/data/test11.csv')
>>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
... "F2": ["V5", "V6", "V7", "V8"]}
>>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
>>> cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
>>> cfa.fit(X.values)
>>> cfa.get_model_implied_cov()
array([[2.07938612, 0.45674659, 0.34718423, 0.57824753, 0.16997013,
0.12648394, 0.06480751, 0.08625868],
[0.45674659, 1.16703337, 0.16136667, 0.26876186, 0.07899988,
0.05878807, 0.03012168, 0.0400919 ],
[0.34718423, 0.16136667, 1.07364855, 0.20429245, 0.06004974,
0.04468625, 0.02289622, 0.03047483],
[0.57824753, 0.26876186, 0.20429245, 1.28809317, 0.10001495,
0.07442652, 0.03813447, 0.05075691],
[0.16997013, 0.07899988, 0.06004974, 0.10001495, 2.0364391 ,
0.72377232, 0.37084458, 0.49359346],
[0.12648394, 0.05878807, 0.04468625, 0.07442652, 0.72377232,
1.48080077, 0.27596546, 0.36730952],
[0.06480751, 0.03012168, 0.02289622, 0.03813447, 0.37084458,
0.27596546, 1.11761918, 0.1882011 ],
[0.08625868, 0.0400919 , 0.03047483, 0.05075691, 0.49359346,
0.36730952, 0.1882011 , 1.28888233]])
"""
# meets all of our expected criteria
check_is_fitted(self, ["loadings_", "factor_varcovs_"])
error = np.diag(self.error_vars_.flatten())
return self.loadings_.dot(self.factor_varcovs_).dot(self.loadings_.T) + error
def _get_derivatives_implied_cov(self):
"""
Compute the derivatives for the implied covariance matrix (sigma).
Returns
-------
loadings_dx : :obj:`numpy.ndarray`
The derivative of the loadings matrix.
factor_covs_dx : :obj:`numpy.ndarray`
The derivative of the factor covariance matrix.
error_covs_dx : :obj:`numpy.ndarray`
The derivative of the error covariance matrix.
"""
# meets all of our expected criteria
check_is_fitted(self, "loadings_")
loadings = self.loadings_.copy()
factor_covs = self.factor_varcovs_.copy()
sym_lower_var_idx = get_symmetric_lower_idxs(self.model.n_variables)
sym_upper_fac_idx = get_symmetric_upper_idxs(self.model.n_factors, diag=False)
sym_lower_fac_idx = get_symmetric_lower_idxs(self.model.n_factors, diag=False)
factors_diag = np.eye(self.model.n_factors)
factors_diag_mult = (
factors_diag.dot(factor_covs).dot(factors_diag.T).dot(loadings.T)
)
# calculate the derivative of the loadings matrix, using the commutation matrix
loadings_dx = np.eye(self.model.n_variables**2) + commutation_matrix(
self.model.n_variables, self.model.n_variables
)
loadings_dx = loadings_dx.dot(
np.kron(factors_diag_mult, np.eye(self.model.n_variables)).T
)
# calculate the derivative of the factor_covs matrix
factor_covs_dx = loadings.dot(factors_diag)
factor_covs_dx = np.kron(factor_covs_dx, factor_covs_dx)
off_diag = (
factor_covs_dx[:, sym_lower_fac_idx] + factor_covs_dx[:, sym_upper_fac_idx]
)
combine_indices = np.concatenate([sym_upper_fac_idx, sym_lower_fac_idx])
combine_diag = np.concatenate([off_diag, off_diag], axis=1)
factor_covs_dx[:, combine_indices] = combine_diag
factor_covs_dx = factor_covs_dx[:, : factor_covs.size]
# calculate the derivative of the error_cov matrix,
# which we assume will always be a diagonal matrix
error_covs_dx = np.eye(self.model.n_variables**2)
# make sure these matrices are symmetric
loadings_dx = loadings_dx[sym_lower_var_idx, :]
factor_covs_dx = factor_covs_dx[sym_lower_var_idx, :]
error_covs_dx = error_covs_dx[sym_lower_var_idx, :]
# we also want to calculate the derivative for the intercepts
intercept_dx = np.zeros(
(loadings_dx.shape[0], self.model.n_variables), dtype=float
)
return (
loadings_dx[:, self.model.loadings_free].copy(),
factor_covs_dx[:, self.model.factor_covs_free].copy(),
error_covs_dx[:, self.model.error_vars_free].copy(),
intercept_dx,
)
def _get_derivatives_implied_mu(self):
"""
Compute the "derivatives" for the implied means.
Note that the derivatives of the implied means will not correspond
to the actual mean values of the original data set, because that
data could be a covariance matrix, rather than the full data set.
Thus, we assume the mean values are zero and the data are normally
distributed.
Returns
-------
loadings_dx : :obj:`numpy.ndarray`
The derivative of the loadings means.
factor_covs_dx : :obj:`numpy.ndarray`
The derivative of the factor covariance means.
error_covs_dx : :obj:`numpy.ndarray`
The derivative of the error covariance means.
"""
# meets all of our expected criteria
check_is_fitted(self, "loadings_")
# initialize some matrices that we'll use below to
# correct the shape of the mean loadings derivatives
factors_zero = np.zeros((self.model.n_factors, 1))
factors_diag = np.eye(self.model.n_factors)
# the mean derivatives will just be zeros for both
# the error covariance and factor covariance matrices,
# since we don't have actual mean values
error_covs_dx = np.zeros(
(self.model.n_variables, len(self.model.error_vars_free))
)
factor_covs_dx = np.zeros(
(self.model.n_variables, len(self.model.factor_covs_free))
)
# again, the implied means are just going to be diagonal matrices
# corresponding to the number of variables and factors
loadings_dx = np.kron(
factors_diag.dot(factors_zero).T, np.eye(self.model.n_variables)
)
loadings_dx = loadings_dx[:, self.model.loadings_free].copy()
# we also calculate the derivative for the intercept, which will be zeros again
intercept_dx = np.zeros((loadings_dx.shape[0], self.model.n_variables))
intercept_dx[: self.model.n_variables, : self.model.n_variables] = np.eye(
self.model.n_variables
)
return (loadings_dx, factor_covs_dx, error_covs_dx, intercept_dx)
[docs]
def get_standard_errors(self):
"""
Get standard errors from the implied covariance matrix and implied means.
Returns
-------
loadings_se : :obj:`numpy.ndarray`
The standard errors for the factor loadings.
error_vars_se : :obj:`numpy.ndarray`
The standard errors for the error variances.
Examples
--------
>>> import pandas as pd
>>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
... ModelSpecificationParser)
>>> X = pd.read_csv('tests/data/test11.csv')
>>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
... "F2": ["V5", "V6", "V7", "V8"]}
>>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
>>> cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
>>> cfa.fit(X.values)
>>> cfa.get_standard_errors()
(array([[0.06779949, 0. ],
[0.04369956, 0. ],
[0.04153113, 0. ],
[0.04766645, 0. ],
[0. , 0.06025341],
[0. , 0.04913149],
[0. , 0.0406604 ],
[0. , 0.04351208]]),
array([0.11929873, 0.05043616, 0.04645803, 0.05803088,
0.10176889, 0.06607524, 0.04742321, 0.05373646]))
"""
# meets all of our expected criteria
check_is_fitted(self, ["loadings_", "n_obs"])
(
loadings_dx,
factor_covs_dx,
error_covs_dx,
intercept_dx,
) = self._get_derivatives_implied_cov()
(
loadings_dx_mu,
factor_covs_dx_mu,
error_covs_dx_mu,
intercept_dx_mu,
) = self._get_derivatives_implied_mu()
# combine all of our derivatives; below we will merge all of these
# together in a single matrix, delta, to use the delta rule
# (basically, using the gradients to calculate the information)
loadings_dx = np.append(loadings_dx_mu, loadings_dx, axis=0)
factor_covs_dx = np.append(factor_covs_dx_mu, factor_covs_dx, axis=0)
error_cov_dx = np.append(error_covs_dx_mu, error_covs_dx, axis=0)
intercept_dx = np.append(intercept_dx_mu, intercept_dx, axis=0)
# get get the implied covariance, invert it, and take the Kronecker product
sigma = self.get_model_implied_cov()
sigma_inv = np.linalg.inv(sigma)
sigma_inv_kron = np.kron(sigma_inv, sigma_inv)
# we get the fisher information matrix for H1, which is the unrestricted
# model information; we'll use this with the deltas to calculate the full
# (inverted) information matrix below, and then invert the whole thing
h1_information = 0.5 * duplication_matrix_pre_post(sigma_inv_kron)
h1_information = block_diag(sigma_inv, h1_information)
# we concatenate all derivatives into a delta matrix
delta = np.concatenate(
(loadings_dx, error_cov_dx, factor_covs_dx, intercept_dx), axis=1
)
# calculate the fisher information matrix
information = delta.T.dot(h1_information).dot(delta)
information = (1 / self.n_obs) * np.linalg.inv(information)
# calculate the standard errors from the diagonal of the
# information / cov matrix; also take the absolute value,
# just in case anything is negative
se = np.sqrt(np.abs(np.diag(information)))
# get the indexes for the standard errors
# for the loadings and the errors variances;
# in the future, we may add the factor and intercept
# covariances, but these sometimes require transformations
# that are more complicated, so for now we won't return them
loadings_idx = len(self.model.loadings_free)
error_vars_idx = self.model.n_variables + loadings_idx
# get the loading standard errors and reshape them into the
# format of the original loadings matrix
loadings_se = np.zeros((self.model.n_factors * self.model.n_variables,))
loadings_se[self.model.loadings_free] = se[:loadings_idx]
loadings_se = loadings_se.reshape(
(self.model.n_variables, self.model.n_factors), order="F"
)
# get the error variance standard errors
error_vars_se = se[loadings_idx:error_vars_idx]
return loadings_se, error_vars_se