Source code for factor_analyzer.confirmatory_factor_analyzer

"""
Confirmatory factor analysis using machine learning methods.

:author: Jeremy Biggs (jeremy.m.biggs@gmail.com)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: Educational Testing Service
:date: 2022-09-05
"""

import warnings
from copy import deepcopy

import numpy as np
import pandas as pd
from scipy.linalg import block_diag
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from .utils import (
    commutation_matrix,
    cov,
    covariance_to_correlation,
    duplication_matrix_pre_post,
    get_free_parameter_idxs,
    get_symmetric_lower_idxs,
    get_symmetric_upper_idxs,
    impute_values,
    merge_variance_covariance,
    unique_elements,
)



[docs]
class ModelSpecification:
    """
    Encapsulate the model specification for CFA.

    This class contains a number of specification properties
    that are used in the CFA procedure.

    Parameters
    ----------
    loadings : array-like
        The factor loadings specification.
    n_factors : int
        The number of factors.
    n_variables : int
        The number of variables.
    factor_names : list of str or None
        A list of factor names, if available.
        Defaults to ``None``.
    variable_names : list of str or None
        A list of variable names, if available.
        Defaults to ``None``.
    """

    def __init__(
        self, loadings, n_factors, n_variables, factor_names=None, variable_names=None
    ):
        """Initialize the specification."""
        assert isinstance(loadings, np.ndarray)
        assert loadings.shape[0] == n_variables
        assert loadings.shape[1] == n_factors

        self._loadings = loadings
        self._n_factors = n_factors
        self._n_variables = n_variables
        self._factor_names = factor_names
        self._variable_names = variable_names

        self._n_lower_diag = get_symmetric_lower_idxs(n_factors, False).shape[0]

        self._error_vars = np.full((n_variables, 1), np.nan)
        self._factor_covs = np.full((n_factors, n_factors), np.nan)

        self._loadings_free = get_free_parameter_idxs(loadings, eq=1)
        self._error_vars_free = merge_variance_covariance(self._error_vars)
        self._error_vars_free = get_free_parameter_idxs(self._error_vars_free, eq=-1)
        self._factor_covs_free = get_symmetric_lower_idxs(n_factors, False)

    def __str__(self):
        """Represent model specification as a string."""
        return f"<ModelSpecification object at {hex(id(self))}>"


[docs]
    def copy(self):
        """Return a copy of the model specification."""
        return deepcopy(self)


    @property
    def loadings(self):
        """Get the factor loadings specification."""
        return self._loadings.copy()

    @property
    def error_vars(self):
        """Get the error variance specification."""
        return self._error_vars.copy()

    @property
    def factor_covs(self):
        """Get the factor covariance specification."""
        return self._factor_covs.copy()

    @property
    def loadings_free(self):
        """Get the indices of "free" factor loading parameters."""
        return self._loadings_free.copy()

    @property
    def error_vars_free(self):
        """Get the indices of "free" error variance parameters."""
        return self._error_vars_free.copy()

    @property
    def factor_covs_free(self):
        """Get the indices of "free" factor covariance parameters."""
        return self._factor_covs_free.copy()

    @property
    def n_variables(self):
        """Get the number of variables."""
        return self._n_variables

    @property
    def n_factors(self):
        """Get the number of factors."""
        return self._n_factors

    @property
    def n_lower_diag(self):
        """Get the lower diagonal of the factor covariance matrix."""
        return self._n_lower_diag

    @property
    def factor_names(self):
        """Get list of factor names, if available."""
        return self._factor_names

    @property
    def variable_names(self):
        """Get list of variable names, if available."""
        return self._variable_names


[docs]
    def get_model_specification_as_dict(self):
        """
        Get the model specification as a dictionary.

        Returns
        -------
        model_specification : dict
            The model specification keys and values,
            as a dictionary.
        """
        return {
            "loadings": self._loadings.copy(),
            "error_vars": self._error_vars.copy(),
            "factor_covs": self._factor_covs.copy(),
            "loadings_free": self._loadings_free.copy(),
            "error_vars_free": self._error_vars_free.copy(),
            "factor_covs_free": self._factor_covs_free.copy(),
            "n_variables": self._n_variables,
            "n_factors": self._n_factors,
            "n_lower_diag": self._n_lower_diag,
            "variable_names": self._variable_names,
            "factor_names": self._factor_names,
        }





[docs]
class ModelSpecificationParser:
    """
    Generate the model specification for CFA.

    This class includes two static methods to generate a
    :class:`ModelSpecification` object from either a dictionary
    or a numpy array.
    """


[docs]
    @staticmethod
    def parse_model_specification_from_dict(X, specification=None):
        """
        Generate the model specification from a dictionary.

        The keys in the dictionary should be the factor names, and the
        values should be the feature names. If this method is used to
        create the :class:`ModelSpecification` object, then factor names
        and variable names will be added as properties to that object.

        Parameters
        ----------
        X : array-like
            The data set that will be used for CFA.
        specification : dict or None
            A dictionary with the loading details. If ``None``, the matrix will
            be created assuming all variables load on all factors.
            Defaults to ``None``.

        Returns
        -------
        ModelSpecification
            A model specification object.

        Raises
        ------
        ValueError
            If ``specification`` is not in the expected format.

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
        ...                              ModelSpecificationParser)
        >>> X = pd.read_csv('tests/data/test11.csv')
        >>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
        ...               "F2": ["V5", "V6", "V7", "V8"]}
        >>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
        """
        if specification is None:
            factor_names, variable_names = None, None
            n_variables, n_factors = X.shape[1], X.shape[1]
            loadings = np.ones((n_factors, n_factors), dtype=int)
        elif isinstance(specification, dict):
            factor_names = list(specification)
            variable_names = unique_elements(
                [v for f in specification.values() for v in f]
            )
            loadings_new = {}
            for factor in factor_names:
                loadings_for_factor = pd.Series(variable_names).isin(
                    specification[factor]
                )
                loadings_for_factor = loadings_for_factor.astype(int)
                loadings_new[factor] = loadings_for_factor
            loadings = pd.DataFrame(loadings_new).values
            n_variables, n_factors = loadings.shape
        else:
            raise ValueError(
                "The model `specification` must be either a dict "
                "or None, not {}".format(type(specification))
            )

        return ModelSpecification(
            **{
                "loadings": loadings,
                "n_variables": n_variables,
                "n_factors": n_factors,
                "factor_names": factor_names,
                "variable_names": variable_names,
            }
        )



[docs]
    @staticmethod
    def parse_model_specification_from_array(X, specification=None):
        """
        Generate the model specification from a numpy array.

        The columns should correspond to the factors, and the rows
        should correspond to the variables. If this method is used
        to create the :class:`ModelSpecification` object, then *no* factor
        names and variable names will be added as properties to that
        object.

        Parameters
        ----------
        X : array-like
            The data set that will be used for CFA.
        specification : array-like or None
            An array with the loading details. If ``None``, the matrix will
            be created assuming all variables load on all factors.
            Defaults to ``None``.

        Returns
        -------
        ModelSpecification
            A model specification object.

        Raises
        ------
        ValueError
            If ``specification`` is not in the expected format.

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
        ...                              ModelSpecificationParser)
        >>> X = pd.read_csv('tests/data/test11.csv')
        >>> model_array = np.array([[1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 1, 1]])
        >>> model_spec = ModelSpecificationParser.parse_model_specification_from_array(X,
        ...                                                                            model_array)
        """
        if specification is None:
            n_variables, n_factors = X.shape[1], X.shape[1]
            loadings = np.ones((n_factors, n_factors), dtype=int)
        elif isinstance(specification, (np.ndarray, pd.DataFrame)):
            n_variables, n_factors = specification.shape
            if isinstance(specification, pd.DataFrame):
                loadings = specification.values.copy()
            else:
                loadings = specification.copy()
        else:
            raise ValueError(
                "The model `specification` must be either a numpy array "
                "or None, not {}".format(type(specification))
            )

        return ModelSpecification(
            **{"loadings": loadings, "n_variables": n_variables, "n_factors": n_factors}
        )





[docs]
class ConfirmatoryFactorAnalyzer(BaseEstimator, TransformerMixin):
    """
    Fit a confirmatory factor analysis model using maximum likelihood.

    Parameters
    ----------
    specification : :class:`ModelSpecification` or None, optional
        A model specification. This must be a :class:`ModelSpecification` object
        or ``None``. If ``None``, a :class:`ModelSpecification` object will be
        generated assuming that ``n_factors`` == ``n_variables``, and that
        all variables load on all factors. Note that this could mean the
        factor model is not identified, and the optimization could fail.
        Defaults to `None`.
    n_obs : int or None, optional
        The number of observations in the original data set.
        If this is not passed and ``is_cov_matrix`` is ``True``,
        then an error will be raised.
        Defaults to ``None``.
    is_cov_matrix : bool, optional
        Whether the input ``X`` is a covariance matrix. If ``False``,
        assume it is the full data set.
        Defaults to ``False``.
    bounds : list of tuples or None, optional
        A list of minimum and maximum boundaries for each element
        of the input array. This must equal ``x0``, which is the
        input array from your parsed and combined model specification.

        The length is:
        ((n_factors * n_variables) + n_variables + n_factors +
        (((n_factors * n_factors) - n_factors) // 2)

        If `None`, nothing will be bounded.
        Defaults to ``None``.
    max_iter : int, optional
        The maximum number of iterations for the optimization routine.
        Defaults to 200.
    tol : float or None, optional
        The tolerance for convergence.
        Defaults to ``None``.
    disp : bool, optional
        Whether to print the scipy optimization ``fmin`` message to
        standard output.
        Defaults to ``True``.

    Raises
    ------
    ValueError
        If `is_cov_matrix` is `True`, and `n_obs` is not provided.

    Attributes
    ----------
    model : ModelSpecification
        The model specification object.
    loadings_ : :obj:`numpy.ndarray`
        The factor loadings matrix.
        ``None``, if ``fit()``` has not been called.
    error_vars_ : :obj:`numpy.ndarray`
        The error variance matrix
    factor_varcovs_ : :obj:`numpy.ndarray`
        The factor covariance matrix.
    log_likelihood_ : float
        The log likelihood from the optimization routine.
    aic_ : float
        The Akaike information criterion.
    bic_ : float
        The Bayesian information criterion.

    Examples
    --------
    >>> import pandas as pd
    >>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
    ...                              ModelSpecificationParser)
    >>> X = pd.read_csv('tests/data/test11.csv')
    >>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
    ...               "F2": ["V5", "V6", "V7", "V8"]}
    >>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
    >>> cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
    >>> cfa.fit(X.values)
    >>> cfa.loadings_
    array([[0.99131285, 0.        ],
           [0.46074919, 0.        ],
           [0.3502267 , 0.        ],
           [0.58331488, 0.        ],
           [0.        , 0.98621042],
           [0.        , 0.73389239],
           [0.        , 0.37602988],
           [0.        , 0.50049507]])
    >>> cfa.factor_varcovs_
    array([[1.        , 0.17385704],
           [0.17385704, 1.        ]])
    >>> cfa.get_standard_errors()
    (array([[0.06779949, 0.        ],
           [0.04369956, 0.        ],
           [0.04153113, 0.        ],
           [0.04766645, 0.        ],
           [0.        , 0.06025341],
           [0.        , 0.04913149],
           [0.        , 0.0406604 ],
           [0.        , 0.04351208]]),
     array([0.11929873, 0.05043616, 0.04645803, 0.05803088,
            0.10176889, 0.06607524, 0.04742321, 0.05373646]))
    >>> cfa.transform(X.values)
    array([[-0.46852166, -1.08708035],
           [ 2.59025301,  1.20227783],
           [-0.47215977,  2.65697245],
           ...,
           [-1.5930886 , -0.91804114],
           [ 0.19430887,  0.88174818],
           [-0.27863554, -0.7695101 ]])
    """

    def __init__(
        self,
        specification=None,
        n_obs=None,
        is_cov_matrix=False,
        bounds=None,
        max_iter=200,
        tol=None,
        impute="median",
        disp=True,
    ):
        """Initialize the analyzer object."""
        # if the input is going to be a covariance matrix, rather than
        # the full data set, then users must pass the number of observations
        if is_cov_matrix and n_obs is None:
            raise ValueError(
                "If `is_cov_matrix=True`, you must provide "
                "the number of observations, `n_obs`."
            )

        self.specification = specification
        self.n_obs = n_obs
        self.is_cov_matrix = is_cov_matrix
        self.bounds = bounds
        self.max_iter = max_iter
        self.tol = tol
        self.impute = impute
        self.disp = disp

        self.cov_ = None
        self.mean_ = None
        self.loadings_ = None
        self.error_vars_ = None
        self.factor_varcovs_ = None

        self.log_likelihood_ = None
        self.aic_ = None
        self.bic_ = None

        self._n_factors = None
        self._n_variables = None
        self._n_lower_diag = None

    @staticmethod
    def _combine(
        loadings,
        error_vars,
        factor_vars,
        factor_covs,
        n_factors,
        n_variables,
        n_lower_diag,
    ):
        """
        Combine given multi-dimensional matrics into a one-dimensional matrix.

        Combine a set of multi-dimensional loading, error variance, factor
        variance, and factor covariance matrices into a one-dimensional
        (X, 1) matrix, where the length of X is the length of all elements
        across all of the matrices.

        Parameters
        ----------
        loadings : array-like
            The loadings matrix (``n_factors`` * ``n_variables``)
        error_vars : array-like (``n_variables`` * 1)
            The error variance array.
        factor_vars : array-like (``n_factors`` * 1)
            The factor variance array.
        factor_covs : array-like (``n_lower_diag`` * 1)
            The factor covariance array.
        n_factors : int
            The number of factors.
        n_variables : int
            The number of variables.
        n_lower_diag :int
            The number of elements in the ``factor_covs`` array, which
            is equal to the lower diagonal of the factor covariance
            matrix.

        Returns
        -------
        array : :obj:`numpy.ndarray`
            The combined (X, 1) array.
        """
        loadings = loadings.reshape(n_factors * n_variables, 1, order="F")
        error_vars = error_vars.reshape(n_variables, 1, order="F")
        factor_vars = factor_vars.reshape(n_factors, 1, order="F")
        factor_covs = factor_covs.reshape(n_lower_diag, 1, order="F")
        return np.concatenate([loadings, error_vars, factor_vars, factor_covs])

    @staticmethod
    def _split(x, n_factors, n_variables, n_lower_diag):
        """
        Split given one-dimensional array into multi-dimensional arrays.

        Given a one-dimensional array, split it into a set of
        multi-dimensional loading, error variance, factor variance,
        and factor covariance matrices.

        Parameters
        ----------
        x : array-like
            The combined (X, 1) array to split.
        n_factors : int
            The number of factors.
        n_variables : int
            The number of variables.
        n_lower_diag : int
            The number of elements in the ``factor_covs`` array, which is
            equal to the lower diagonal of the factor covariance matrix.

        Returns
        -------
        loadings : array-like
            The loadings matrix (``n_factors`` * ``n_variables``)
        error_vars : array-like (``n_variables`` * 1)
            The error variance array.
        factor_vars : array-like (``n_factors`` * 1)
            The factor variance array.
        factor_covs : array-like (``n_lower_diag`` * 1)
            The factor covariance array.
        """
        loadings_ix = int(n_factors * n_variables)
        error_vars_ix = n_variables + loadings_ix
        factor_vars_ix = n_factors + error_vars_ix
        factor_covs_ix = n_lower_diag + factor_vars_ix
        return (
            x[:loadings_ix].reshape((n_variables, n_factors), order="F"),
            x[loadings_ix:error_vars_ix].reshape((n_variables, 1), order="F"),
            x[error_vars_ix:factor_vars_ix].reshape((n_factors, 1), order="F"),
            x[factor_vars_ix:factor_covs_ix].reshape((n_lower_diag, 1), order="F"),
        )

    def _objective(self, x0, cov_mtx, loadings):
        """
        Get the objective function for the analyzer.

        Parameters
        ----------
        x0: array-like
            The combined (X, 1) array. These are the initial values for
            the ``minimize`()` function.
        cov_mtx : array-like
            The covariance matrix from the original data set.
        loadings : array-like
            The loadings matrix (``n_factors`` * ``n_variables``)
            from the model parser. This tells the objective function what
            elements should be fixed.

        Returns
        -------
        error : float
            The error from the objective function.
        """
        (
            loadings_init,
            error_vars_init,
            factor_vars_init,
            factor_covs_init,
        ) = self._split(
            x0, self.model.n_factors, self.model.n_variables, self.model.n_lower_diag
        )

        # set the loadings to zero where applicable
        loadings_init[np.where(loadings == 0)] = 0

        # combine factor variances and covariances into a single matrix
        factor_varcov_init = merge_variance_covariance(
            factor_vars_init, factor_covs_init
        )

        # make the error variance into a variance-covariance matrix
        error_varcov_init = merge_variance_covariance(error_vars_init)

        # make the factor variance-covariance matrix into a correlation matrix
        with np.errstate(all="ignore"):
            factor_varcov_init = covariance_to_correlation(factor_varcov_init)

        # calculate sigma-theta, needed for the objective function
        sigma_theta = (
            loadings_init.dot(factor_varcov_init).dot(loadings_init.T)
            + error_varcov_init
        )

        with np.errstate(all="ignore"):
            error = -(
                ((-self.n_obs * self.model.n_variables / 2) * np.log(2 * np.pi))
                - (self.n_obs / 2)
                * (
                    np.log(np.linalg.det(sigma_theta))
                    + np.trace(cov_mtx.dot(np.linalg.inv(sigma_theta)))
                )
            )

            # make sure the error is greater than or
            # equal to zero before we return it; we
            # do not do this for the Bollen approach
            error = 0.0 if error < 0.0 else error

        return error


[docs]
    def fit(self, X, y=None):
        """
        Perform confirmatory factor analysis.

        Parameters
        ----------
        X : array-like
            The data to use for confirmatory factor analysis. If this is just a
            covariance matrix, make sure ``is_cov_matrix`` was set to ``True``.
        y : ignored

        Raises
        ------
        ValueError
            If the specification is not None or a :class:`ModelSpecification` object.
        AssertionError
            If ``is_cov_matrix`` was ``True`` and the matrix is not square.
        AssertionError
            If ``len(bounds)`` != ``len(x0)``

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
        ...                              ModelSpecificationParser)
        >>> X = pd.read_csv('tests/data/test11.csv')
        >>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
        ...               "F2": ["V5", "V6", "V7", "V8"]}
        >>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
        >>> cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
        >>> cfa.fit(X.values)
        >>> cfa.loadings_
        array([[0.99131285, 0.        ],
               [0.46074919, 0.        ],
               [0.3502267 , 0.        ],
               [0.58331488, 0.        ],
               [0.        , 0.98621042],
               [0.        , 0.73389239],
               [0.        , 0.37602988],
               [0.        , 0.50049507]])
        """
        if self.specification is None:
            self.model = ModelSpecificationParser.parse_model_specification_from_array(
                X
            )
        elif isinstance(self.specification, ModelSpecification):
            self.model = self.specification.copy()
        else:
            raise ValueError(
                "The `specification` must be None or `ModelSpecification` "
                "instance, not {}".format(type(self.specification))
            )

        if isinstance(X, pd.DataFrame):
            X = X.values

        # now check the array, and make sure it
        # meets all of our expected criteria
        X = check_array(X, ensure_all_finite="allow-nan", estimator=self, copy=True)

        # check to see if there are any null values, and if
        # so impute using the desired imputation approach
        if np.isnan(X).any() and not self.is_cov_matrix:
            X = impute_values(X, how=self.impute)

        if not self.is_cov_matrix:
            # make sure that the columns are in the proper order
            # data = data[variable_names].copy()
            # get the number of observations from the data, if `n_obs` not passed;
            # then, calculate the covariance matrix from the data set
            self.n_obs = X.shape[0] if self.n_obs is None else self.n_obs
            self.mean_ = np.mean(X, axis=0)
            cov_mtx = cov(X)
        else:
            error_msg = (
                "If `is_cov_matrix=True`, then the rows and column in the data "
                "set must be equal, and must equal the number of variables "
                "in your model."
            )
            assert X.shape[0] == X.shape[1] == self.model.n_variables, error_msg
            cov_mtx = X.copy()

        self.cov_ = cov_mtx.copy()

        # we initialize all of the arrays, setting the covariances
        # lower than the expected variances, and the loadings to 1 or 0
        loading_init = self.model.loadings
        error_vars_init = np.full((self.model.n_variables, 1), 0.5)
        factor_vars_init = np.full((self.model.n_factors, 1), 1.0)
        factor_covs_init = np.full((self.model.n_lower_diag, 1), 0.05)

        # we merge all of the arrays into a single 1d vector
        x0 = self._combine(
            loading_init,
            error_vars_init,
            factor_vars_init,
            factor_covs_init,
            self.model.n_factors,
            self.model.n_variables,
            self.model.n_lower_diag,
        )

        # if the bounds argument is None, then we initialized the
        # boundaries to (None, None) for everything except factor covariances;
        # at some point in the future, we may update this to place limits
        # on the loading matrix boundaries, too, but the case in R and SAS
        if self.bounds is not None:
            error_msg = (
                "The length of `bounds` must equal the length of your "
                "input array `x0`: {} != {}.".format(len(self.bounds), len(x0))
            )
            assert len(self.bounds) == len(x0), error_msg

        # fit the actual model using L-BFGS algorithm;
        # the constraints are set inside the objective function,
        # so that we can avoid using linear programming methods (e.g. SLSQP)
        res = minimize(
            self._objective,
            x0.flatten(),
            method="L-BFGS-B",
            options={"maxiter": self.max_iter, "disp": self.disp},
            bounds=self.bounds,
            args=(cov_mtx, self.model.loadings),
        )

        # if the optimizer failed to converge, print the message
        if not res.success:
            warnings.warn(
                f"The optimization routine failed to converge: {str(res.message)}"
            )

        # we split all the 1d array back into the set of original arrays
        (loadings_res, error_vars_res, factor_vars_res, factor_covs_res) = self._split(
            res.x, self.model.n_factors, self.model.n_variables, self.model.n_lower_diag
        )

        # we combine the factor covariances and variances into
        # a single variance-covariance matrix to make things easier,
        # but also check to make see if anything was fixed
        factor_varcovs_res = merge_variance_covariance(factor_vars_res, factor_covs_res)
        with np.errstate(all="ignore"):
            factor_varcovs_res = covariance_to_correlation(factor_varcovs_res)

        self.loadings_ = loadings_res
        self.error_vars_ = error_vars_res
        self.factor_varcovs_ = factor_varcovs_res

        # we also calculate the log-likelihood, AIC, and BIC
        self.log_likelihood_ = -res.fun
        self.aic_ = 2 * res.fun + 2 * (x0.shape[0] + self.model.n_variables)
        if self.n_obs is not None:
            self.bic_ = 2 * res.fun + np.log(self.n_obs) * (
                x0.shape[0] + self.model.n_variables
            )
        return self



[docs]
    def transform(self, X):
        """
        Get the factor scores for a new data set.

        Parameters
        ----------
        X : array-like, shape (``n_samples``, ``n_features``)
            The data to score using the fitted factor model.

        Returns
        -------
        scores : numpy array, shape (``n_samples``, ``n_components``)
            The latent variables of X.

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
        ...                              ModelSpecificationParser)
        >>> X = pd.read_csv('tests/data/test11.csv')
        >>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
        ...               "F2": ["V5", "V6", "V7", "V8"]}
        >>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
        >>> cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
        >>> cfa.fit(X.values)
        >>> cfa.transform(X.values)
        array([[-0.46852166, -1.08708035],
               [ 2.59025301,  1.20227783],
               [-0.47215977,  2.65697245],
               ...,
               [-1.5930886 , -0.91804114],
               [ 0.19430887,  0.88174818],
           [-0.27863554, -0.7695101 ]])

        References
        ----------
        https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6157408/
        """
        # check if the data is a data frame,
        # so we can convert it to an array
        if isinstance(X, pd.DataFrame):
            X = X.values

        # now check the array, and make sure it
        # meets all of our expected criteria
        X = check_array(X, ensure_all_finite=True, estimator=self, copy=True)

        # meets all of our expected criteria
        check_is_fitted(self, ["loadings_", "error_vars_"])

        # see if we saved the original mean and std
        if self.mean_ is None:
            warnings.warn(
                "Could not find original mean; using the mean "
                "from the current data set."
            )
            mean = np.mean(X, axis=0)
        else:
            mean = self.mean_

        # get the scaled data
        X_scale = X - mean

        # get the loadings and error variances
        loadings = self.loadings_.copy()
        error_vars = self.error_vars_.copy()

        # make the error variance an identity matrix,
        # and take the inverse of this matrix
        error_covs = np.eye(error_vars.shape[0])
        np.fill_diagonal(error_covs, error_vars)
        error_covs_inv = np.linalg.inv(error_covs)

        # calculate the weights, using Bartlett's formula
        # (lambda' error_covsˆ−1 lambda)ˆ−1 lambda' error_covsˆ−1 (X - muX)
        weights = (
            np.linalg.pinv(loadings.T.dot(error_covs_inv).dot(loadings))
            .dot(loadings.T)
            .dot(error_covs_inv)
        )

        scores = weights.dot(X_scale.T).T
        return scores



[docs]
    def get_model_implied_cov(self):
        """
        Get the model-implied covariance matrix (sigma) for an estimated model.

        Returns
        -------
        model_implied_cov : :obj:`numpy.ndarray`
            The model-implied covariance matrix.

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
        ...                              ModelSpecificationParser)
        >>> X = pd.read_csv('tests/data/test11.csv')
        >>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
        ...               "F2": ["V5", "V6", "V7", "V8"]}
        >>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
        >>> cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
        >>> cfa.fit(X.values)
        >>> cfa.get_model_implied_cov()
        array([[2.07938612, 0.45674659, 0.34718423, 0.57824753, 0.16997013,
                0.12648394, 0.06480751, 0.08625868],
               [0.45674659, 1.16703337, 0.16136667, 0.26876186, 0.07899988,
                0.05878807, 0.03012168, 0.0400919 ],
               [0.34718423, 0.16136667, 1.07364855, 0.20429245, 0.06004974,
                0.04468625, 0.02289622, 0.03047483],
               [0.57824753, 0.26876186, 0.20429245, 1.28809317, 0.10001495,
                0.07442652, 0.03813447, 0.05075691],
               [0.16997013, 0.07899988, 0.06004974, 0.10001495, 2.0364391 ,
                0.72377232, 0.37084458, 0.49359346],
               [0.12648394, 0.05878807, 0.04468625, 0.07442652, 0.72377232,
                1.48080077, 0.27596546, 0.36730952],
               [0.06480751, 0.03012168, 0.02289622, 0.03813447, 0.37084458,
                0.27596546, 1.11761918, 0.1882011 ],
               [0.08625868, 0.0400919 , 0.03047483, 0.05075691, 0.49359346,
                0.36730952, 0.1882011 , 1.28888233]])
        """
        # meets all of our expected criteria
        check_is_fitted(self, ["loadings_", "factor_varcovs_"])
        error = np.diag(self.error_vars_.flatten())
        return self.loadings_.dot(self.factor_varcovs_).dot(self.loadings_.T) + error


    def _get_derivatives_implied_cov(self):
        """
        Compute the derivatives for the implied covariance matrix (sigma).

        Returns
        -------
        loadings_dx : :obj:`numpy.ndarray`
            The derivative of the loadings matrix.
        factor_covs_dx : :obj:`numpy.ndarray`
            The derivative of the factor covariance matrix.
        error_covs_dx : :obj:`numpy.ndarray`
            The derivative of the error covariance matrix.
        """
        # meets all of our expected criteria
        check_is_fitted(self, "loadings_")

        loadings = self.loadings_.copy()
        factor_covs = self.factor_varcovs_.copy()

        sym_lower_var_idx = get_symmetric_lower_idxs(self.model.n_variables)
        sym_upper_fac_idx = get_symmetric_upper_idxs(self.model.n_factors, diag=False)
        sym_lower_fac_idx = get_symmetric_lower_idxs(self.model.n_factors, diag=False)

        factors_diag = np.eye(self.model.n_factors)
        factors_diag_mult = (
            factors_diag.dot(factor_covs).dot(factors_diag.T).dot(loadings.T)
        )

        # calculate the derivative of the loadings matrix, using the commutation matrix
        loadings_dx = np.eye(self.model.n_variables**2) + commutation_matrix(
            self.model.n_variables, self.model.n_variables
        )
        loadings_dx = loadings_dx.dot(
            np.kron(factors_diag_mult, np.eye(self.model.n_variables)).T
        )

        # calculate the derivative of the factor_covs matrix
        factor_covs_dx = loadings.dot(factors_diag)
        factor_covs_dx = np.kron(factor_covs_dx, factor_covs_dx)

        off_diag = (
            factor_covs_dx[:, sym_lower_fac_idx] + factor_covs_dx[:, sym_upper_fac_idx]
        )

        combine_indices = np.concatenate([sym_upper_fac_idx, sym_lower_fac_idx])
        combine_diag = np.concatenate([off_diag, off_diag], axis=1)

        factor_covs_dx[:, combine_indices] = combine_diag
        factor_covs_dx = factor_covs_dx[:, : factor_covs.size]

        # calculate the derivative of the error_cov matrix,
        # which we assume will always be a diagonal matrix
        error_covs_dx = np.eye(self.model.n_variables**2)

        # make sure these matrices are symmetric
        loadings_dx = loadings_dx[sym_lower_var_idx, :]
        factor_covs_dx = factor_covs_dx[sym_lower_var_idx, :]
        error_covs_dx = error_covs_dx[sym_lower_var_idx, :]

        # we also want to calculate the derivative for the intercepts
        intercept_dx = np.zeros(
            (loadings_dx.shape[0], self.model.n_variables), dtype=float
        )
        return (
            loadings_dx[:, self.model.loadings_free].copy(),
            factor_covs_dx[:, self.model.factor_covs_free].copy(),
            error_covs_dx[:, self.model.error_vars_free].copy(),
            intercept_dx,
        )

    def _get_derivatives_implied_mu(self):
        """
        Compute the "derivatives" for the implied means.

        Note that the derivatives of the implied means will not correspond
        to the actual mean values of the original data set, because that
        data could be a covariance matrix, rather than the full data set.
        Thus, we assume the mean values are zero and the data are normally
        distributed.

        Returns
        -------
        loadings_dx : :obj:`numpy.ndarray`
            The derivative of the loadings means.
        factor_covs_dx : :obj:`numpy.ndarray`
            The derivative of the factor covariance means.
        error_covs_dx : :obj:`numpy.ndarray`
            The derivative of the error covariance means.
        """
        # meets all of our expected criteria
        check_is_fitted(self, "loadings_")

        # initialize some matrices that we'll use below to
        # correct the shape of the mean loadings derivatives
        factors_zero = np.zeros((self.model.n_factors, 1))
        factors_diag = np.eye(self.model.n_factors)

        # the mean derivatives will just be zeros for both
        # the error covariance and factor covariance matrices,
        # since we don't have actual mean values
        error_covs_dx = np.zeros(
            (self.model.n_variables, len(self.model.error_vars_free))
        )
        factor_covs_dx = np.zeros(
            (self.model.n_variables, len(self.model.factor_covs_free))
        )

        # again, the implied means are just going to be diagonal matrices
        # corresponding to the number of variables and factors
        loadings_dx = np.kron(
            factors_diag.dot(factors_zero).T, np.eye(self.model.n_variables)
        )
        loadings_dx = loadings_dx[:, self.model.loadings_free].copy()

        # we also calculate the derivative for the intercept, which will be zeros again
        intercept_dx = np.zeros((loadings_dx.shape[0], self.model.n_variables))
        intercept_dx[: self.model.n_variables, : self.model.n_variables] = np.eye(
            self.model.n_variables
        )

        return (loadings_dx, factor_covs_dx, error_covs_dx, intercept_dx)


[docs]
    def get_standard_errors(self):
        """
        Get standard errors from the implied covariance matrix and implied means.

        Returns
        -------
        loadings_se : :obj:`numpy.ndarray`
            The standard errors for the factor loadings.
        error_vars_se : :obj:`numpy.ndarray`
            The standard errors for the error variances.

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import (ConfirmatoryFactorAnalyzer,
        ...                              ModelSpecificationParser)
        >>> X = pd.read_csv('tests/data/test11.csv')
        >>> model_dict = {"F1": ["V1", "V2", "V3", "V4"],
        ...               "F2": ["V5", "V6", "V7", "V8"]}
        >>> model_spec = ModelSpecificationParser.parse_model_specification_from_dict(X, model_dict)
        >>> cfa = ConfirmatoryFactorAnalyzer(model_spec, disp=False)
        >>> cfa.fit(X.values)
        >>> cfa.get_standard_errors()
        (array([[0.06779949, 0.        ],
               [0.04369956, 0.        ],
               [0.04153113, 0.        ],
               [0.04766645, 0.        ],
               [0.        , 0.06025341],
               [0.        , 0.04913149],
               [0.        , 0.0406604 ],
               [0.        , 0.04351208]]),
         array([0.11929873, 0.05043616, 0.04645803, 0.05803088,
                0.10176889, 0.06607524, 0.04742321, 0.05373646]))
        """
        # meets all of our expected criteria
        check_is_fitted(self, ["loadings_", "n_obs"])

        (
            loadings_dx,
            factor_covs_dx,
            error_covs_dx,
            intercept_dx,
        ) = self._get_derivatives_implied_cov()

        (
            loadings_dx_mu,
            factor_covs_dx_mu,
            error_covs_dx_mu,
            intercept_dx_mu,
        ) = self._get_derivatives_implied_mu()

        # combine all of our derivatives; below we will  merge all of these
        # together in a single matrix, delta, to use the delta rule
        # (basically, using the gradients to calculate the information)
        loadings_dx = np.append(loadings_dx_mu, loadings_dx, axis=0)
        factor_covs_dx = np.append(factor_covs_dx_mu, factor_covs_dx, axis=0)
        error_cov_dx = np.append(error_covs_dx_mu, error_covs_dx, axis=0)
        intercept_dx = np.append(intercept_dx_mu, intercept_dx, axis=0)

        # get get the implied covariance, invert it, and take the Kronecker product
        sigma = self.get_model_implied_cov()
        sigma_inv = np.linalg.inv(sigma)
        sigma_inv_kron = np.kron(sigma_inv, sigma_inv)

        # we get the fisher information matrix for H1, which is the unrestricted
        # model information; we'll use this with the deltas to calculate the full
        # (inverted) information matrix below, and then invert the whole thing
        h1_information = 0.5 * duplication_matrix_pre_post(sigma_inv_kron)
        h1_information = block_diag(sigma_inv, h1_information)

        # we concatenate all derivatives into a delta matrix
        delta = np.concatenate(
            (loadings_dx, error_cov_dx, factor_covs_dx, intercept_dx), axis=1
        )

        # calculate the fisher information matrix
        information = delta.T.dot(h1_information).dot(delta)
        information = (1 / self.n_obs) * np.linalg.inv(information)

        # calculate the standard errors from the diagonal of the
        # information / cov matrix; also take the absolute value,
        # just in case anything is negative
        se = np.sqrt(np.abs(np.diag(information)))

        # get the indexes for the standard errors
        # for the loadings and the errors variances;
        # in the future, we may add the factor and intercept
        # covariances, but these sometimes require transformations
        # that are more complicated, so for now we won't return them
        loadings_idx = len(self.model.loadings_free)
        error_vars_idx = self.model.n_variables + loadings_idx

        # get the loading standard errors and reshape them into the
        # format of the original loadings matrix
        loadings_se = np.zeros((self.model.n_factors * self.model.n_variables,))
        loadings_se[self.model.loadings_free] = se[:loadings_idx]
        loadings_se = loadings_se.reshape(
            (self.model.n_variables, self.model.n_factors), order="F"
        )

        # get the error variance standard errors
        error_vars_se = se[loadings_idx:error_vars_idx]
        return loadings_se, error_vars_se