Source code for estimagic.inference.bootstrap

import functools
from dataclasses import dataclass
from functools import cached_property
from typing import Any

import numpy as np
import pandas as pd
from pybaum import leaf_names, tree_flatten, tree_just_flatten, tree_unflatten

from estimagic.batch_evaluators import joblib_batch_evaluator
from estimagic.inference.bootstrap_ci import calculate_ci
from estimagic.inference.bootstrap_helpers import check_inputs
from estimagic.inference.bootstrap_outcomes import get_bootstrap_outcomes
from estimagic.inference.shared import calculate_estimation_summary
from estimagic.parameters.block_trees import matrix_to_block_tree
from estimagic.parameters.tree_registry import get_registry
from estimagic.utilities import get_rng


[docs]def bootstrap( outcome, data, *, existing_result=None, outcome_kwargs=None, n_draws=1_000, cluster_by=None, seed=None, n_cores=1, error_handling="continue", batch_evaluator=joblib_batch_evaluator, ): """Use the bootstrap to calculate inference quantities. Args: outcome (callable): A function that computes the statistic of interest. data (pd.DataFrame): Dataset. existing_result (BootstrapResult): An existing BootstrapResult object from a previous call of bootstrap(). Default is None. outcome_kwargs (dict): Additional keyword arguments for outcome. n_draws (int): Number of bootstrap samples to draw. If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes is used. cluster_by (str): Column name of variable to cluster by or None. seed (Union[None, int, numpy.random.Generator]): If seed is None or int the numpy.random.default_rng is used seeded with seed. If seed is already a Generator instance then that instance is used. n_cores (int): number of jobs for parallelization. error_handling (str): One of "continue", "raise". Default "continue" which means that bootstrap estimates are only calculated for those samples where no errors occur and a warning is produced if any error occurs. batch_evaluator (str or Callable): Name of a pre-implemented batch evaluator (currently 'joblib' and 'pathos_mp') or Callable with the same interface as the estimagic batch_evaluators. See :ref:`batch_evaluators`. Returns: BootstrapResult: A BootstrapResult object storing information on summary statistics, the covariance matrix, and estimated boostrap outcomes. """ if callable(outcome): check_inputs(data=data, cluster_by=cluster_by) if outcome_kwargs is not None: outcome = functools.partial(outcome, **outcome_kwargs) else: raise TypeError("outcome must be a callable.") if existing_result is None: base_outcome = outcome(data) existing_outcomes = [] elif isinstance(existing_result, BootstrapResult): base_outcome = existing_result.base_outcome existing_outcomes = existing_result.outcomes else: raise ValueError("existing_result must be None or a BootstrapResult.") rng = get_rng(seed) n_existing = len(existing_outcomes) if n_draws > n_existing: new_outcomes = get_bootstrap_outcomes( data=data, outcome=outcome, cluster_by=cluster_by, rng=rng, n_draws=n_draws - n_existing, n_cores=n_cores, error_handling=error_handling, batch_evaluator=batch_evaluator, ) all_outcomes = existing_outcomes + new_outcomes else: random_indices = rng.choice(n_existing, n_draws, replace=False) all_outcomes = [existing_outcomes[k] for k in random_indices] # ================================================================================== # Process results # ================================================================================== registry = get_registry(extended=True) flat_outcomes = [ tree_just_flatten(_outcome, registry=registry) for _outcome in all_outcomes ] internal_outcomes = np.array(flat_outcomes) result = BootstrapResult( _base_outcome=base_outcome, _internal_outcomes=internal_outcomes, _internal_cov=np.cov(internal_outcomes, rowvar=False), ) return result
[docs]@dataclass class BootstrapResult: _base_outcome: Any _internal_outcomes: np.ndarray _internal_cov: np.ndarray @cached_property def _se(self): return self.se() @cached_property def _cov(self): return self.cov() @cached_property def _ci(self): return self.ci() @cached_property def _p_values(self): return self.p_values() @cached_property def _summary(self): return self.summary() @property def base_outcome(self): """Returns the base outcome statistic(s). Returns: pytree: Pytree of base outcomes, i.e. the outcome statistic(s) evaluated on the original data set. """ return self._base_outcome @cached_property def outcomes(self): """Returns the estimated bootstrap outcomes. Returns: List[Any]: The boostrap outcomes as a list of pytrees. """ registry = get_registry(extended=True) _, treedef = tree_flatten(self._base_outcome, registry=registry) outcomes = [ tree_unflatten(treedef, out, registry=registry) for out in self._internal_outcomes ] return outcomes
[docs] def se(self): """Calculate standard errors. Returns: Any: The standard errors of the estimated parameters as a block-pytree, numpy.ndarray, or pandas.DataFrame. """ cov = self._internal_cov se = np.sqrt(np.diagonal(cov)) registry = get_registry(extended=True) _, treedef = tree_flatten(self._base_outcome, registry=registry) se = tree_unflatten(treedef, se, registry=registry) return se
[docs] def cov(self, return_type="pytree"): """Calculate the variance-covariance matrix of the estimated parameters. Args: return_type (str): One of "pytree", "array" or "dataframe". Default pytree. If "array", a 2d numpy array with the covariance is returned. If "dataframe", a pandas DataFrame with parameter names in the index and columns are returned. The default is "pytree". Returns: Any: The covariance matrix of the estimated parameters as a block-pytree, numpy.ndarray, or pandas.DataFrame. """ cov = self._internal_cov if return_type == "dataframe": registry = get_registry(extended=True) names = np.array(leaf_names(self._base_outcome, registry=registry)) cov = pd.DataFrame(cov, columns=names, index=names) elif return_type == "pytree": cov = matrix_to_block_tree(cov, self._base_outcome, self._base_outcome) elif return_type != "array": raise ValueError( "return_type must be one of pytree, array, or dataframe, " f"not {return_type}." ) return cov
[docs] def ci(self, ci_method="percentile", ci_level=0.95): """Calculate confidence intervals. Args: ci_method (str): Method of choice for computing confidence intervals. The default is "percentile". ci_level (float): Confidence level for the calculation of confidence intervals. The default is 0.95. Returns: Any: Pytree with the same structure as base_outcome containing lower bounds of confidence intervals. Any: Pytree with the same structure as base_outcome containing upper bounds of confidence intervals. """ registry = get_registry(extended=True) base_outcome_flat, treedef = tree_flatten(self._base_outcome, registry=registry) lower_flat, upper_flat = calculate_ci( base_outcome_flat, self._internal_outcomes, ci_method, ci_level ) lower = tree_unflatten(treedef, lower_flat, registry=registry) upper = tree_unflatten(treedef, upper_flat, registry=registry) return lower, upper
[docs] def p_values(self): """Calculate p-values. Returns: Any: A pytree with the same structure as base_outcome containing p-values for the parameter estimates. """ msg = "Bootstrap p_values are not yet implemented." raise NotImplementedError(msg)
[docs] def summary(self, ci_method="percentile", ci_level=0.95): """Create a summary of bootstrap results. Args: ci_method (str): Method of choice for confidence interval computation. The default is "percentile". ci_level (float): Confidence level for the calculation of confidence intervals. The default is 0.95. Returns: pd.DataFrame: The estimation summary as a DataFrame containing information on the mean, standard errors, as well as the confidence intervals. Soon this will be a pytree. """ registry = get_registry(extended=True) names = leaf_names(self.base_outcome, registry=registry) summary_data = _calulcate_summary_data_bootstrap( self, ci_method=ci_method, ci_level=ci_level ) summary = calculate_estimation_summary( summary_data=summary_data, names=names, free_names=names, ) return summary
def _calulcate_summary_data_bootstrap(bootstrap_result, ci_method, ci_level): lower, upper = bootstrap_result.ci(ci_method=ci_method, ci_level=ci_level) summary_data = { "value": bootstrap_result.base_outcome, "standard_error": bootstrap_result.se(), "ci_lower": lower, "ci_upper": upper, "p_value": np.full(len(lower), np.nan), # p-values are not implemented yet } return summary_data