
"""
Factor analysis using MINRES or ML, with optional rotation using Varimax or Promax.

:author: Jeremy Biggs (jeremy.m.biggs@gmail.com)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: Educational Testing Service
:date: 2022-09-05
"""

import warnings
from typing import Tuple

import numpy as np
import pandas as pd
import scipy as sp
from scipy.optimize import minimize
from scipy.stats import chi2, pearsonr
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.extmath import randomized_svd
from sklearn.utils.validation import check_is_fitted

from .rotator import OBLIQUE_ROTATIONS, POSSIBLE_ROTATIONS, Rotator
from .utils import corr, impute_values, partial_correlations, smc

POSSIBLE_SVDS = ["randomized", "lapack"]

POSSIBLE_IMPUTATIONS = ["mean", "median", "drop"]

POSSIBLE_METHODS = ["ml", "mle", "uls", "minres", "principal"]


def calculate_kmo(x):
    """
    Calculate the Kaiser-Meyer-Olkin criterion for items and overall.

    This statistic represents the degree to which each observed variable is
    predicted, without error, by the other variables in the dataset. In
    general, a KMO < 0.6 is considered inadequate.

    Parameters
    ----------
    x : array-like
        The array from which to calculate KMOs.

    Returns
    -------
    kmo_per_variable : :obj:`numpy.ndarray`
        The KMO score per item.
    kmo_total : float
        The overall KMO score.
    """
    # calculate the partial correlations
    partial_corr = partial_correlations(x)

    # calculate the pair-wise correlations
    x_corr = corr(x)

    # fill matrix diagonals with zeros
    # and square all elements
    np.fill_diagonal(x_corr, 0)
    np.fill_diagonal(partial_corr, 0)
    partial_corr = partial_corr**2
    x_corr = x_corr**2

    # calculate KMO per item
    partial_corr_sum = np.sum(partial_corr, axis=0)
    corr_sum = np.sum(x_corr, axis=0)
    kmo_per_item = corr_sum / (corr_sum + partial_corr_sum)

    # calculate KMO overall
    corr_sum_total = np.sum(x_corr)
    partial_corr_sum_total = np.sum(partial_corr)
    kmo_total = corr_sum_total / (corr_sum_total + partial_corr_sum_total)
    return kmo_per_item, kmo_total
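
# A minimal usage sketch for `calculate_kmo`, following the doctest
# conventions used elsewhere in this module; `tests/data/test02.csv` is the
# same dataset used in the class-level examples below:
#
#     >>> import pandas as pd
#     >>> from factor_analyzer.factor_analyzer import calculate_kmo
#     >>> df_features = pd.read_csv('tests/data/test02.csv')
#     >>> kmo_per_item, kmo_total = calculate_kmo(df_features)
#     >>> kmo_total < 0.6  # values below 0.6 suggest the data are inadequate
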
def calculate_bartlett_sphericity(x):
    """
    Compute the Bartlett sphericity test.

    H0: The matrix of population correlations is equal to I.
    H1: The matrix of population correlations is not equal to I.

    The formula for Bartlett's Sphericity test is:

    .. math:: -1 * (n - 1 - ((2p + 5) / 6)) * ln(det(R))

    where det(R) is the determinant of the correlation matrix R, n is the
    number of observations, and p is the number of variables.

    Parameters
    ----------
    x : array-like
        The array for which to calculate sphericity.

    Returns
    -------
    statistic : float
        The chi-square value.
    p_value : float
        The associated p-value for the test.
    """
    n, p = x.shape
    x_corr = corr(x)

    corr_det = np.linalg.det(x_corr)
    statistic = -np.log(corr_det) * (n - 1 - (2 * p + 5) / 6)
    degrees_of_freedom = p * (p - 1) / 2
    p_value = chi2.sf(statistic, degrees_of_freedom)
    return statistic, p_value
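
# A minimal usage sketch for `calculate_bartlett_sphericity`, under the same
# assumptions as the sketch above; a small p-value rejects the hypothesis
# that the population correlation matrix is the identity:
#
#     >>> import pandas as pd
#     >>> from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
#     >>> df_features = pd.read_csv('tests/data/test02.csv')
#     >>> statistic, p_value = calculate_bartlett_sphericity(df_features)
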
class FactorAnalyzer(BaseEstimator, TransformerMixin):
    """
    The main exploratory factor analysis class.

    This class:

    (1) Fits a factor analysis model using minres, maximum likelihood,
        or principal factor extraction and returns the loading matrix
    (2) Optionally performs a rotation, with method including:

        (a) varimax (orthogonal rotation)
        (b) promax (oblique rotation)
        (c) oblimin (oblique rotation)
        (d) oblimax (orthogonal rotation)
        (e) quartimin (oblique rotation)
        (f) quartimax (orthogonal rotation)
        (g) equamax (orthogonal rotation)

    Parameters
    ----------
    n_factors : int, optional
        The number of factors to select.
        Defaults to 3.
    rotation : str, optional
        The type of rotation to perform after fitting the factor analysis
        model. If set to ``None``, no rotation will be performed, nor will
        any associated Kaiser normalization.

        Possible values include:

        (a) varimax (orthogonal rotation)
        (b) promax (oblique rotation)
        (c) oblimin (oblique rotation)
        (d) oblimax (orthogonal rotation)
        (e) quartimin (oblique rotation)
        (f) quartimax (orthogonal rotation)
        (g) equamax (orthogonal rotation)

        Defaults to 'promax'.
    method : {'minres', 'ml', 'principal'}, optional
        The fitting method to use: MINRES, maximum likelihood, or
        principal factor extraction.
        Defaults to 'minres'.
    use_smc : bool, optional
        Whether to use squared multiple correlations as starting guesses
        for factor analysis.
        Defaults to ``True``.
    bounds : tuple, optional
        The lower and upper bounds on the variables for "L-BFGS-B"
        optimization.
        Defaults to (0.005, 1).
    impute : {'drop', 'mean', 'median'}, optional
        How to handle missing values, if any, in the data: (a) use
        list-wise deletion ('drop'), (b) impute the column median
        ('median'), or (c) impute the column mean ('mean').
        Defaults to 'median'.
    is_corr_matrix : bool, optional
        Set to ``True`` if ``data`` is the correlation matrix.
        Defaults to ``False``.
    svd_method : {'lapack', 'randomized'}, optional
        The SVD method to use when ``method`` is 'principal'. If 'lapack',
        use the standard SVD from ``scipy.linalg``. If 'randomized', use
        the faster ``randomized_svd`` function from scikit-learn. The
        latter should only be used if the number of columns is greater
        than or equal to the number of rows in the dataset.
        Defaults to 'randomized'.
    rotation_kwargs : dict, optional
        Dictionary containing keyword arguments for the rotation method.

    Attributes
    ----------
    loadings_ : :obj:`numpy.ndarray`
        The factor loadings matrix.
        ``None``, if ``fit()`` has not been called.
    corr_ : :obj:`numpy.ndarray`
        The original correlation matrix.
        ``None``, if ``fit()`` has not been called.
    rotation_matrix_ : :obj:`numpy.ndarray`
        The rotation matrix, if a rotation has been performed. ``None``
        otherwise.
    structure_ : :obj:`numpy.ndarray` or None
        The structure loading matrix. This only exists if ``rotation``
        is 'promax' and is ``None`` otherwise.
    phi_ : :obj:`numpy.ndarray` or None
        The factor correlations matrix. This only exists if the rotation
        is oblique and is ``None`` otherwise.

    Notes
    -----
    This code was partly derived from the excellent R package `psych`.
    References
    ----------
    [1] https://github.com/cran/psych/blob/master/R/fa.R

    Examples
    --------
    >>> import pandas as pd
    >>> from factor_analyzer import FactorAnalyzer
    >>> df_features = pd.read_csv('tests/data/test02.csv')
    >>> fa = FactorAnalyzer(rotation=None)
    >>> fa.fit(df_features)
    FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                   method='minres', n_factors=3, rotation=None,
                   rotation_kwargs={}, use_smc=True)
    >>> fa.loadings_
    array([[-0.12991218,  0.16398154,  0.73823498],
           [ 0.03899558,  0.04658425,  0.01150343],
           [ 0.34874135,  0.61452341, -0.07255667],
           [ 0.45318006,  0.71926681, -0.07546472],
           [ 0.36688794,  0.44377343, -0.01737067],
           [ 0.74141382, -0.15008235,  0.29977512],
           [ 0.741675  , -0.16123009, -0.20744495],
           [ 0.82910167, -0.20519428,  0.04930817],
           [ 0.76041819, -0.23768727, -0.1206858 ],
           [ 0.81533404, -0.12494695,  0.17639683]])
    >>> fa.get_communalities()
    array([0.588758  , 0.00382308, 0.50452402, 0.72841183, 0.33184336,
           0.66208428, 0.61911036, 0.73194557, 0.64929612, 0.71149718])
    """

    def __init__(
        self,
        n_factors=3,
        rotation="promax",
        method="minres",
        use_smc=True,
        is_corr_matrix=False,
        bounds=(0.005, 1),
        impute="median",
        svd_method="randomized",
        rotation_kwargs=None,
    ):
        """Initialize the factor analyzer."""
        self.n_factors = n_factors
        self.rotation = rotation
        self.method = method
        self.use_smc = use_smc
        self.bounds = bounds
        self.impute = impute
        self.is_corr_matrix = is_corr_matrix
        self.svd_method = svd_method
        self.rotation_kwargs = rotation_kwargs

        # default matrices to None
        self.mean_ = None
        self.std_ = None
        self.phi_ = None
        self.structure_ = None
        self.corr_ = None
        self.loadings_ = None
        self.rotation_matrix_ = None
        self.weights_ = None

    def _arg_checker(self):
        """
        Check the input parameters to make sure they're properly formatted.

        We need to do this to ensure that the FactorAnalyzer class can be
        properly cloned when used with grid search CV, for example.
        """
        self.rotation = (
            self.rotation.lower() if isinstance(self.rotation, str) else self.rotation
        )
        if self.rotation not in POSSIBLE_ROTATIONS + [None]:
            raise ValueError(
                f"The rotation must be one of the following: {POSSIBLE_ROTATIONS + [None]}"
            )

        self.method = (
            self.method.lower() if isinstance(self.method, str) else self.method
        )
        if self.method not in POSSIBLE_METHODS:
            raise ValueError(
                f"The method must be one of the following: {POSSIBLE_METHODS}"
            )

        self.impute = (
            self.impute.lower() if isinstance(self.impute, str) else self.impute
        )
        if self.impute not in POSSIBLE_IMPUTATIONS:
            raise ValueError(
                f"The imputation must be one of the following: {POSSIBLE_IMPUTATIONS}"
            )

        self.svd_method = (
            self.svd_method.lower()
            if isinstance(self.svd_method, str)
            else self.svd_method
        )
        if self.svd_method not in POSSIBLE_SVDS:
            raise ValueError(
                f"The SVD method must be one of the following: {POSSIBLE_SVDS}"
            )

        if self.method == "principal" and self.is_corr_matrix:
            raise ValueError(
                "The principal method is only implemented using "
                "the full data set, not the correlation matrix."
            )

        self.rotation_kwargs = (
            {} if self.rotation_kwargs is None else self.rotation_kwargs
        )

    @staticmethod
    def _fit_uls_objective(psi, corr_mtx, n_factors):  # noqa: D401
        """
        The objective function passed for unweighted least-squares (ULS).

        Parameters
        ----------
        psi : array-like
            Value passed to minimize the objective function.
        corr_mtx : array-like
            The correlation matrix.
        n_factors : int
            The number of factors to select.

        Returns
        -------
        error : float
            The scalar error calculated from the residuals of the loading
            matrix.
""" np.fill_diagonal(corr_mtx, 1 - psi) # get the eigen values and vectors for n_factors values, vectors = sp.linalg.eigh(corr_mtx) values = values[::-1] # this is a bit of a hack, borrowed from R's `fac()` function; # if values are smaller than the smallest representable positive # number * 100, set them to that number instead. values = np.maximum(values, np.finfo(float).eps * 100) # sort the values and vectors in ascending order values = values[:n_factors] vectors = vectors[:, ::-1][:, :n_factors] # calculate the loadings if n_factors > 1: loadings = np.dot(vectors, np.diag(np.sqrt(values))) else: loadings = vectors * np.sqrt(values[0]) # calculate the error from the loadings model model = np.dot(loadings, loadings.T) # note that in a more recent version of the `fa()` source # code on GitHub, the minres objective function only sums the # lower triangle of the residual matrix; this could be # implemented here using `np.tril()` when this change is # merged into the stable version of `psych`. residual = (corr_mtx - model) ** 2 error = np.sum(residual) return error @staticmethod def _normalize_uls(solution, corr_mtx, n_factors): """ Weighted least squares normalization for loadings using MINRES. Parameters ---------- solution : array-like The solution from the L-BFGS-B optimization. corr_mtx : array-like The correlation matrix. n_factors : int The number of factors to select. Returns ------- loadings : :obj:`numpy.ndarray` The factor loading matrix """ np.fill_diagonal(corr_mtx, 1 - solution) # get the eigenvalues and vectors for n_factors values, vectors = np.linalg.eigh(corr_mtx) # sort the values and vectors in ascending order values = values[::-1][:n_factors] vectors = vectors[:, ::-1][:, :n_factors] # calculate loadings # if values are smaller than 0, set them to zero loadings = np.dot(vectors, np.diag(np.sqrt(np.maximum(values, 0)))) return loadings @staticmethod def _fit_ml_objective(psi, corr_mtx, n_factors): # noqa: D401 """ The objective function for maximum likelihood. Parameters ---------- psi : array-like Value passed to minimize the objective function. corr_mtx : array-like The correlation matrix. n_factors : int The number of factors to select. Returns ------- error : float The scalar error calculated from the residuals of the loading matrix. Note ---- The ML objective is based on the `factanal()` function from ``stats`` package in R. It may generate different results from the ``fa()`` function in ``psych``. References ---------- [1] https://github.com/SurajGupta/r-source/blob/master/src/library/stats/R/factanal.R """ sc = np.diag(1 / np.sqrt(psi)) sstar = np.dot(np.dot(sc, corr_mtx), sc) # get the eigenvalues and eigenvectors for n_factors values, _ = np.linalg.eigh(sstar) values = values[::-1][n_factors:] # calculate the error error = -(np.sum(np.log(values) - values) - n_factors + corr_mtx.shape[0]) return error @staticmethod def _normalize_ml(solution, corr_mtx, n_factors): """ Normalize loadings estimated using maximum likelihood. Parameters ---------- solution : array-like The solution from the L-BFGS-B optimization. corr_mtx : array-like The correlation matrix. n_factors : int The number of factors to select. 
        Returns
        -------
        loadings : :obj:`numpy.ndarray`
            The factor loading matrix.
        """
        sc = np.diag(1 / np.sqrt(solution))
        sstar = np.dot(np.dot(sc, corr_mtx), sc)

        # get the eigenvalues and eigenvectors for n_factors
        values, vectors = np.linalg.eigh(sstar)

        # take the top n_factors values and vectors, in descending order
        values = values[::-1][:n_factors]
        vectors = vectors[:, ::-1][:, :n_factors]
        values = np.maximum(values - 1, 0)

        # get the loadings
        loadings = np.dot(vectors, np.diag(np.sqrt(values)))
        return np.dot(np.diag(np.sqrt(solution)), loadings)

    def _fit_principal(self, X):
        """
        Fit factor analysis model using principal factor analysis.

        Parameters
        ----------
        X : array-like
            The full data set.

        Returns
        -------
        loadings : :obj:`numpy.ndarray`
            The factor loadings matrix.
        """
        # standardize the data
        X = X.copy()
        X = (X - X.mean(0)) / X.std(0)

        # if the number of rows is less than the number of columns,
        # warn the user that the number of factors will be constrained
        nrows, ncols = X.shape
        if nrows < ncols and self.n_factors >= nrows:
            warnings.warn(
                "The number of factors will be "
                "constrained to min(n_samples, n_features)"
                "={}.".format(min(nrows, ncols))
            )

        # perform the randomized singular value decomposition
        if self.svd_method == "randomized":
            _, _, V = randomized_svd(X, self.n_factors, random_state=1234567890)
        # otherwise, perform the full SVD
        else:
            _, _, V = np.linalg.svd(X, full_matrices=False)

        corr_mtx = np.dot(X, V.T)
        loadings = np.array([[pearsonr(x, c)[0] for c in corr_mtx.T] for x in X.T])
        return loadings

    def _fit_factor_analysis(self, corr_mtx):
        """
        Fit factor analysis model using either MINRES or maximum likelihood.

        Parameters
        ----------
        corr_mtx : array-like
            The correlation matrix.

        Returns
        -------
        loadings : :obj:`numpy.ndarray`
            The factor loadings matrix.

        Raises
        ------
        ValueError
            If any of the correlations are null, most likely due to having
            zero standard deviation.
        """
        # if `use_smc` is True, get squared multiple correlations
        # and use these as initial guesses for the optimizer
        if self.use_smc:
            smc_mtx = smc(corr_mtx)
            start = (np.diag(corr_mtx) - smc_mtx.T).squeeze()
        # otherwise, just start with a guess of 0.5 for everything
        else:
            start = [0.5 for _ in range(corr_mtx.shape[0])]

        # if `bounds` is set, apply it to all variables;
        # this must be a list passed to `minimize()`
        if self.bounds is not None:
            bounds = [self.bounds for _ in range(corr_mtx.shape[0])]
        else:
            bounds = self.bounds

        # minimize the appropriate objective function
        # using the L-BFGS-B algorithm
        if self.method == "ml" or self.method == "mle":
            objective = self._fit_ml_objective
        else:
            objective = self._fit_uls_objective

        # use scipy to perform the actual minimization
        res = minimize(
            objective,
            start,
            method="L-BFGS-B",
            bounds=bounds,
            options={"maxiter": 1000},
            args=(corr_mtx, self.n_factors),
        )

        if not res.success:
            warnings.warn(f"Failed to converge: {res.message}")

        # transform the final loading matrix, using ULS normalization
        # for MINRES and ML normalization for ML
        if self.method == "ml" or self.method == "mle":
            loadings = self._normalize_ml(res.x, corr_mtx, self.n_factors)
        else:
            loadings = self._normalize_uls(res.x, corr_mtx, self.n_factors)
        return loadings
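
    # A hedged sketch of the objective that `_fit_factor_analysis` minimizes,
    # evaluated once on a toy 3x3 correlation matrix (the matrix and the
    # starting uniquenesses below are illustrative, not from the package's
    # test data); copies are passed because the objective fills the diagonal
    # in place:
    #
    #     >>> import numpy as np
    #     >>> corr_mtx = np.array([[1.0, 0.6, 0.5],
    #     ...                      [0.6, 1.0, 0.4],
    #     ...                      [0.5, 0.4, 1.0]])
    #     >>> psi = np.full(3, 0.5)  # starting guesses for the uniquenesses
    #     >>> FactorAnalyzer._fit_uls_objective(psi.copy(), corr_mtx.copy(), 1)
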
    def fit(self, X, y=None):
        """
        Fit factor analysis model using either MINRES, ML, or principal
        factor analysis.

        By default, use SMC as starting guesses.

        Parameters
        ----------
        X : array-like
            The data to analyze.
        y : ignored

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import FactorAnalyzer
        >>> df_features = pd.read_csv('tests/data/test02.csv')
        >>> fa = FactorAnalyzer(rotation=None)
        >>> fa.fit(df_features)
        FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                       method='minres', n_factors=3, rotation=None,
                       rotation_kwargs={}, use_smc=True)
        >>> fa.loadings_
        array([[-0.12991218,  0.16398154,  0.73823498],
               [ 0.03899558,  0.04658425,  0.01150343],
               [ 0.34874135,  0.61452341, -0.07255667],
               [ 0.45318006,  0.71926681, -0.07546472],
               [ 0.36688794,  0.44377343, -0.01737067],
               [ 0.74141382, -0.15008235,  0.29977512],
               [ 0.741675  , -0.16123009, -0.20744495],
               [ 0.82910167, -0.20519428,  0.04930817],
               [ 0.76041819, -0.23768727, -0.1206858 ],
               [ 0.81533404, -0.12494695,  0.17639683]])
        """
        # check the input arguments
        self._arg_checker()

        # check if the data is a data frame,
        # so we can convert it to an array
        if isinstance(X, pd.DataFrame):
            X = X.copy().values
        else:
            X = X.copy()

        # now check the array, and make sure it
        # meets all of our expected criteria
        X = check_array(X, force_all_finite="allow-nan", estimator=self, copy=True)

        # check to see if there are any null values, and if
        # so, impute using the desired imputation approach
        if np.isnan(X).any() and not self.is_corr_matrix:
            X = impute_values(X, how=self.impute)

        # get the correlation matrix
        if self.is_corr_matrix:
            corr_mtx = X
        else:
            corr_mtx = corr(X)
            self.std_ = np.std(X, axis=0)
            self.mean_ = np.mean(X, axis=0)

        # save the original correlation matrix
        self.corr_ = corr_mtx.copy()

        # fit factor analysis model
        if self.method == "principal":
            loadings = self._fit_principal(X)
        else:
            loadings = self._fit_factor_analysis(corr_mtx)

        # only used if we do an oblique rotation;
        # default rotation matrix to None
        phi = None
        structure = None
        rotation_mtx = None

        # whether to rotate the loadings matrix
        if self.rotation is not None:
            if loadings.shape[1] <= 1:
                warnings.warn(
                    "No rotation will be performed when "
                    "the number of factors equals 1."
                )
            else:
                if "method" in self.rotation_kwargs:
                    warnings.warn(
                        "You cannot pass a rotation method to "
                        "`rotation_kwargs`. This will be ignored."
                    )
                    self.rotation_kwargs.pop("method")

                rotator = Rotator(method=self.rotation, **self.rotation_kwargs)
                loadings = rotator.fit_transform(loadings)
                rotation_mtx = rotator.rotation_
                phi = rotator.phi_

                # update the rotation matrix for everything, except promax
                if self.rotation != "promax":
                    rotation_mtx = np.linalg.inv(rotation_mtx).T

        if self.n_factors > 1:
            # update loading signs to match column sums;
            # this is to ensure that signs align with R
            signs = np.sign(loadings.sum(0))
            signs[(signs == 0)] = 1
            loadings = np.dot(loadings, np.diag(signs))

            if phi is not None:
                # update phi, if it exists -- that is, if the rotation
                # is oblique; create the structure matrix for any
                # oblique rotation
                phi = np.dot(np.dot(np.diag(signs), phi), np.diag(signs))
                structure = (
                    np.dot(loadings, phi)
                    if self.rotation in OBLIQUE_ROTATIONS
                    else None
                )

        # re-sort the factors according to their variance,
        # unless the method is principal
        if self.method != "principal":
            variance = self._get_factor_variance(loadings)[0]
            new_order = list(reversed(np.argsort(variance)))
            loadings = loadings[:, new_order].copy()

            # if the structure matrix exists, reorder it too
            if structure is not None:
                structure = structure[:, new_order].copy()

        self.phi_ = phi
        self.structure_ = structure

        self.loadings_ = loadings
        self.rotation_matrix_ = rotation_mtx
        return self
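
    # A short sketch of fitting with an oblique rotation, assuming the same
    # test file as the docstring examples; with 'promax' (the default), both
    # `phi_` and `structure_` are populated after fitting:
    #
    #     >>> import pandas as pd
    #     >>> df_features = pd.read_csv('tests/data/test02.csv')
    #     >>> fa = FactorAnalyzer(n_factors=3, rotation='promax')
    #     >>> fa = fa.fit(df_features)
    #     >>> fa.phi_.shape, fa.structure_.shape
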
    def transform(self, X):
        """
        Get factor scores for a new data set.

        Parameters
        ----------
        X : array-like, shape (``n_samples``, ``n_features``)
            The data to score using the fitted factor model.

        Returns
        -------
        X_new : :obj:`numpy.ndarray`, shape (``n_samples``, ``n_components``)
            The latent variables of X.

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import FactorAnalyzer
        >>> df_features = pd.read_csv('tests/data/test02.csv')
        >>> fa = FactorAnalyzer(rotation=None)
        >>> fa.fit(df_features)
        FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                       method='minres', n_factors=3, rotation=None,
                       rotation_kwargs={}, use_smc=True)
        >>> fa.transform(df_features)
        array([[-1.05141425,  0.57687826,  0.1658788 ],
               [-1.59940101,  0.89632125,  0.03824552],
               [-1.21768164, -1.16319406,  0.57135189],
               ...,
               [ 0.13601554,  0.03601086,  0.28813877],
               [ 1.86904519, -0.3532394 , -0.68170573],
               [ 0.86133386,  0.18280695, -0.79170903]])
        """
        # check if the data is a data frame,
        # so we can convert it to an array
        if isinstance(X, pd.DataFrame):
            X = X.copy().values
        else:
            X = X.copy()

        # now check the array, and make sure it
        # meets all of our expected criteria
        X = check_array(X, force_all_finite=True, estimator=self, copy=True)

        # make sure the model has been fit before scoring
        check_is_fitted(self, "loadings_")

        # see if we saved the original mean and std
        if self.mean_ is None or self.std_ is None:
            warnings.warn(
                "Could not find original mean and standard deviation; using "
                "the mean and standard deviation from the current data set."
            )
            mean = np.mean(X, axis=0)
            std = np.std(X, axis=0)
        else:
            mean = self.mean_
            std = self.std_

        # get the scaled data
        X_scale = (X - mean) / std

        # use the structure matrix, if it exists;
        # otherwise, just use the loadings matrix
        if self.structure_ is not None:
            structure = self.structure_
        else:
            structure = self.loadings_

        try:
            self.weights_ = np.linalg.solve(self.corr_, structure)
        except Exception as error:
            warnings.warn(
                "Unable to calculate the factor score weights; "
                "factor loadings used instead: {}".format(error)
            )
            self.weights_ = self.loadings_

        scores = np.dot(X_scale, self.weights_)
        return scores
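
    # Because the class mixes in scikit-learn's TransformerMixin, fitting and
    # scoring can also be combined in one call; a minimal sketch under the
    # same assumptions as the docstring examples:
    #
    #     >>> scores = FactorAnalyzer(rotation=None).fit_transform(df_features)
    #     >>> scores.shape[1]  # one column of scores per factor
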
    def get_eigenvalues(self):
        """
        Calculate the eigenvalues, given the factor correlation matrix.

        Returns
        -------
        original_eigen_values : :obj:`numpy.ndarray`
            The original eigenvalues.
        common_factor_eigen_values : :obj:`numpy.ndarray`
            The common factor eigenvalues.

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import FactorAnalyzer
        >>> df_features = pd.read_csv('tests/data/test02.csv')
        >>> fa = FactorAnalyzer(rotation=None)
        >>> fa.fit(df_features)
        FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                       method='minres', n_factors=3, rotation=None,
                       rotation_kwargs={}, use_smc=True)
        >>> fa.get_eigenvalues()
        (array([ 3.51018854,  1.28371018,  0.73739507,  0.1334704 ,  0.03445558,
                 0.0102918 , -0.00740013, -0.03694786, -0.05959139, -0.07428112]),
         array([ 3.51018905,  1.2837105 ,  0.73739508,  0.13347082,  0.03445601,
                 0.01029184, -0.0074    , -0.03694834, -0.05959057, -0.07428059]))
        """
        # make sure the model has been fit before computing eigenvalues
        check_is_fitted(self, ["loadings_", "corr_"])

        corr_mtx = self.corr_.copy()

        e_values, _ = np.linalg.eigh(corr_mtx)
        e_values = e_values[::-1]

        communalities = self.get_communalities()
        communalities = communalities.copy()
        np.fill_diagonal(corr_mtx, communalities)

        values, _ = np.linalg.eigh(corr_mtx)
        values = values[::-1]
        return e_values, values
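
    # A common use of `get_eigenvalues` is the Kaiser criterion for choosing
    # the number of factors; a minimal sketch, assuming `fa` has been fit as
    # in the docstring example above:
    #
    #     >>> original_ev, common_ev = fa.get_eigenvalues()
    #     >>> int((original_ev > 1).sum())  # factors with eigenvalue > 1
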
    def get_communalities(self):
        """
        Calculate the communalities, given the factor loading matrix.

        Returns
        -------
        communalities : :obj:`numpy.ndarray`
            The communalities from the factor loading matrix.

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import FactorAnalyzer
        >>> df_features = pd.read_csv('tests/data/test02.csv')
        >>> fa = FactorAnalyzer(rotation=None)
        >>> fa.fit(df_features)
        FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                       method='minres', n_factors=3, rotation=None,
                       rotation_kwargs={}, use_smc=True)
        >>> fa.get_communalities()
        array([0.588758  , 0.00382308, 0.50452402, 0.72841183, 0.33184336,
               0.66208428, 0.61911036, 0.73194557, 0.64929612, 0.71149718])
        """
        # make sure the model has been fit before computing communalities
        check_is_fitted(self, "loadings_")

        loadings = self.loadings_.copy()
        communalities = (loadings**2).sum(axis=1)
        return communalities
    def get_uniquenesses(self):
        """
        Calculate the uniquenesses, given the factor loading matrix.

        Returns
        -------
        uniquenesses : :obj:`numpy.ndarray`
            The uniquenesses from the factor loading matrix.

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import FactorAnalyzer
        >>> df_features = pd.read_csv('tests/data/test02.csv')
        >>> fa = FactorAnalyzer(rotation=None)
        >>> fa.fit(df_features)
        FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                       method='minres', n_factors=3, rotation=None,
                       rotation_kwargs={}, use_smc=True)
        >>> fa.get_uniquenesses()
        array([0.411242  , 0.99617692, 0.49547598, 0.27158817, 0.66815664,
               0.33791572, 0.38088964, 0.26805443, 0.35070388, 0.28850282])
        """
        # make sure the model has been fit before computing uniquenesses
        check_is_fitted(self, "loadings_")

        communalities = self.get_communalities()
        communalities = communalities.copy()
        uniqueness = 1 - communalities
        return uniqueness
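
    # Communalities and uniquenesses are complements by construction
    # (uniqueness = 1 - communality); a quick check, assuming `fa` has been
    # fit as in the docstring examples:
    #
    #     >>> import numpy as np
    #     >>> np.allclose(fa.get_communalities() + fa.get_uniquenesses(), 1.0)
    #     True
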
    @staticmethod
    def _get_factor_variance(loadings):
        """
        Get the factor variances.

        This is a private helper method to get the factor variances,
        because sometimes we need them even before the model is fitted.

        Parameters
        ----------
        loadings : array-like
            The factor loading matrix, in whatever state.

        Returns
        -------
        variance : :obj:`numpy.ndarray`
            The factor variances.
        proportional_variance : :obj:`numpy.ndarray`
            The proportional factor variances.
        cumulative_variances : :obj:`numpy.ndarray`
            The cumulative factor variances.
        """
        n_rows = loadings.shape[0]

        # calculate variance
        loadings = loadings**2
        variance = np.sum(loadings, axis=0)

        # calculate proportional variance
        proportional_variance = variance / n_rows

        # calculate cumulative variance
        cumulative_variance = np.cumsum(proportional_variance, axis=0)

        return (variance, proportional_variance, cumulative_variance)
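
    # The variance per factor is the column sum of squared loadings, and the
    # proportional variance divides that by the number of variables; a small
    # numeric sketch with a hypothetical 3x2 loading matrix:
    #
    #     >>> import numpy as np
    #     >>> L = np.array([[0.8, 0.1], [0.7, 0.2], [0.1, 0.9]])
    #     >>> var, prop, cum = FactorAnalyzer._get_factor_variance(L)
    #     >>> var  # 0.8**2 + 0.7**2 + 0.1**2 = 1.14, and so on
    #     array([1.14, 0.86])
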
    def get_factor_variance(self):
        """
        Calculate factor variance information.

        The factor variance information includes the variance,
        proportional variance, and cumulative variance for each factor.

        Returns
        -------
        variance : :obj:`numpy.ndarray`
            The factor variances.
        proportional_variance : :obj:`numpy.ndarray`
            The proportional factor variances.
        cumulative_variances : :obj:`numpy.ndarray`
            The cumulative factor variances.

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import FactorAnalyzer
        >>> df_features = pd.read_csv('tests/data/test02.csv')
        >>> fa = FactorAnalyzer(rotation=None)
        >>> fa.fit(df_features)
        FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                       method='minres', n_factors=3, rotation=None,
                       rotation_kwargs={}, use_smc=True)
        >>> # 1. Sum of squared loadings (variance)
        ... # 2. Proportional variance
        ... # 3. Cumulative variance
        >>> fa.get_factor_variance()
        (array([3.51018854, 1.28371018, 0.73739507]),
         array([0.35101885, 0.12837102, 0.07373951]),
         array([0.35101885, 0.47938987, 0.55312938]))
        """
        # make sure the model has been fit before computing variances
        check_is_fitted(self, "loadings_")

        loadings = self.loadings_.copy()
        return self._get_factor_variance(loadings)
    def sufficiency(self, num_observations: int) -> Tuple[float, int, float]:
        """
        Perform the sufficiency test.

        The test calculates statistics under the null hypothesis that the
        selected number of factors is sufficient.

        Parameters
        ----------
        num_observations : int
            The number of observations in the input data that this factor
            analyzer was fit using.

        Returns
        -------
        statistic : float
            The test statistic.
        degrees : int
            The degrees of freedom.
        pvalue : float
            The p-value of the test.

        References
        ----------
        [1] Lawley, D. N. and Maxwell, A. E. (1971). Factor Analysis as a
        Statistical Method. Second edition. Butterworths. P. 36.

        Examples
        --------
        >>> import pandas as pd
        >>> from factor_analyzer import FactorAnalyzer
        >>> df_features = pd.read_csv('tests/data/test01.csv')
        >>> fa = FactorAnalyzer(n_factors=3, rotation=None, method="ml")
        >>> fa.fit(df_features)
        >>> fa.sufficiency(df_features.shape[0])
        (1475.8755629859675, 663, 8.804286459822274e-64)
        """
        nvar = self.corr_.shape[0]
        degrees = ((nvar - self.n_factors) ** 2 - nvar - self.n_factors) // 2
        obj = self._fit_ml_objective(
            self.get_uniquenesses(), self.corr_, self.n_factors
        )
        statistic = (
            num_observations - 1 - (2 * nvar + 5) / 6 - (2 * self.n_factors) / 3
        ) * obj
        pvalue = chi2.sf(statistic, df=degrees)
        return statistic, degrees, pvalue
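
    # Interpreting the sufficiency test: a large p-value fails to reject the
    # hypothesis that `n_factors` factors suffice, while a small one suggests
    # more factors are needed. A minimal sketch, assuming an ML model fit as
    # in the docstring example above:
    #
    #     >>> statistic, degrees, pvalue = fa.sufficiency(df_features.shape[0])
    #     >>> pvalue < 0.05  # True for that example: three factors do not suffice
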