Source code for estimagic.inference.bootstrap

import functools
from dataclasses import dataclass
from functools import cached_property
from typing import Any

import numpy as np
import pandas as pd
from pybaum import leaf_names, tree_flatten, tree_just_flatten, tree_unflatten

from estimagic.batch_evaluators import joblib_batch_evaluator
from estimagic.inference.bootstrap_ci import calculate_ci
from estimagic.inference.bootstrap_helpers import check_inputs
from estimagic.inference.bootstrap_outcomes import get_bootstrap_outcomes
from estimagic.inference.shared import calculate_estimation_summary
from estimagic.parameters.block_trees import matrix_to_block_tree
from estimagic.parameters.tree_registry import get_registry
from estimagic.utilities import get_rng


[docs]def bootstrap(
    outcome,
    data,
    *,
    existing_result=None,
    outcome_kwargs=None,
    n_draws=1_000,
    cluster_by=None,
    seed=None,
    n_cores=1,
    error_handling="continue",
    batch_evaluator=joblib_batch_evaluator,
):
    """Use the bootstrap to calculate inference quantities.

    Args:
        outcome (callable): A function that computes the statistic of interest.
        data (pd.DataFrame): Dataset.
        existing_result (BootstrapResult): An existing BootstrapResult
            object from a previous call of bootstrap(). Default is None.
        outcome_kwargs (dict): Additional keyword arguments for outcome.
        n_draws (int): Number of bootstrap samples to draw.
            If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes
            is used.
        cluster_by (str): Column name of variable to cluster by or None.
        seed (Union[None, int, numpy.random.Generator]): If seed is None or int the
            numpy.random.default_rng is used seeded with seed. If seed is already a
            Generator instance then that instance is used.
        n_cores (int): number of jobs for parallelization.
        error_handling (str): One of "continue", "raise". Default "continue" which means
            that bootstrap estimates are only calculated for those samples where no
            errors occur and a warning is produced if any error occurs.
        batch_evaluator (str or Callable): Name of a pre-implemented batch evaluator
            (currently 'joblib' and 'pathos_mp') or Callable with the same interface
            as the estimagic batch_evaluators. See :ref:`batch_evaluators`.

    Returns:
        BootstrapResult: A BootstrapResult object storing information on summary
            statistics, the covariance matrix, and estimated boostrap outcomes.

    """
    if callable(outcome):
        check_inputs(data=data, cluster_by=cluster_by)

        if outcome_kwargs is not None:
            outcome = functools.partial(outcome, **outcome_kwargs)
    else:
        raise TypeError("outcome must be a callable.")

    if existing_result is None:
        base_outcome = outcome(data)
        existing_outcomes = []
    elif isinstance(existing_result, BootstrapResult):
        base_outcome = existing_result.base_outcome
        existing_outcomes = existing_result.outcomes
    else:
        raise ValueError("existing_result must be None or a BootstrapResult.")

    rng = get_rng(seed)
    n_existing = len(existing_outcomes)

    if n_draws > n_existing:
        new_outcomes = get_bootstrap_outcomes(
            data=data,
            outcome=outcome,
            cluster_by=cluster_by,
            rng=rng,
            n_draws=n_draws - n_existing,
            n_cores=n_cores,
            error_handling=error_handling,
            batch_evaluator=batch_evaluator,
        )

        all_outcomes = existing_outcomes + new_outcomes
    else:
        random_indices = rng.choice(n_existing, n_draws, replace=False)
        all_outcomes = [existing_outcomes[k] for k in random_indices]

    # ==================================================================================
    # Process results
    # ==================================================================================

    registry = get_registry(extended=True)
    flat_outcomes = [
        tree_just_flatten(_outcome, registry=registry) for _outcome in all_outcomes
    ]
    internal_outcomes = np.array(flat_outcomes)

    result = BootstrapResult(
        _base_outcome=base_outcome,
        _internal_outcomes=internal_outcomes,
        _internal_cov=np.cov(internal_outcomes, rowvar=False),
    )

    return result


[docs]@dataclass
class BootstrapResult:
    _base_outcome: Any
    _internal_outcomes: np.ndarray
    _internal_cov: np.ndarray

    @cached_property
    def _se(self):
        return self.se()

    @cached_property
    def _cov(self):
        return self.cov()

    @cached_property
    def _ci(self):
        return self.ci()

    @cached_property
    def _p_values(self):
        return self.p_values()

    @cached_property
    def _summary(self):
        return self.summary()

    @property
    def base_outcome(self):
        """Returns the base outcome statistic(s).

        Returns:
            pytree: Pytree of base outcomes, i.e. the outcome statistic(s) evaluated
                on the original data set.

        """
        return self._base_outcome

    @cached_property
    def outcomes(self):
        """Returns the estimated bootstrap outcomes.

        Returns:
            List[Any]: The boostrap outcomes as a list of pytrees.

        """
        registry = get_registry(extended=True)
        _, treedef = tree_flatten(self._base_outcome, registry=registry)

        outcomes = [
            tree_unflatten(treedef, out, registry=registry)
            for out in self._internal_outcomes
        ]
        return outcomes

[docs]    def se(self):
        """Calculate standard errors.

        Returns:
            Any: The standard errors of the estimated parameters as a block-pytree,
                numpy.ndarray, or pandas.DataFrame.

        """
        cov = self._internal_cov
        se = np.sqrt(np.diagonal(cov))

        registry = get_registry(extended=True)
        _, treedef = tree_flatten(self._base_outcome, registry=registry)

        se = tree_unflatten(treedef, se, registry=registry)
        return se

[docs]    def cov(self, return_type="pytree"):
        """Calculate the variance-covariance matrix of the estimated parameters.

        Args:
            return_type (str): One of "pytree", "array" or "dataframe". Default pytree.
                If "array", a 2d numpy array with the covariance is returned. If
                "dataframe", a pandas DataFrame with parameter names in the
                index and columns are returned.
                The default is "pytree".

        Returns:
            Any: The covariance matrix of the estimated parameters as a block-pytree,
                numpy.ndarray, or pandas.DataFrame.

        """
        cov = self._internal_cov

        if return_type == "dataframe":
            registry = get_registry(extended=True)
            names = np.array(leaf_names(self._base_outcome, registry=registry))
            cov = pd.DataFrame(cov, columns=names, index=names)
        elif return_type == "pytree":
            cov = matrix_to_block_tree(cov, self._base_outcome, self._base_outcome)
        elif return_type != "array":
            raise ValueError(
                "return_type must be one of pytree, array, or dataframe, "
                f"not {return_type}."
            )
        return cov

[docs]    def ci(self, ci_method="percentile", ci_level=0.95):
        """Calculate confidence intervals.

        Args:
            ci_method (str): Method of choice for computing confidence intervals.
                The default is "percentile".
            ci_level (float): Confidence level for the calculation of confidence
                intervals. The default is 0.95.

        Returns:
            Any: Pytree with the same structure as base_outcome containing lower
                bounds of confidence intervals.
            Any: Pytree with the same structure as base_outcome containing upper
                bounds of confidence intervals.

        """
        registry = get_registry(extended=True)
        base_outcome_flat, treedef = tree_flatten(self._base_outcome, registry=registry)

        lower_flat, upper_flat = calculate_ci(
            base_outcome_flat, self._internal_outcomes, ci_method, ci_level
        )

        lower = tree_unflatten(treedef, lower_flat, registry=registry)
        upper = tree_unflatten(treedef, upper_flat, registry=registry)
        return lower, upper

[docs]    def p_values(self):
        """Calculate p-values.

        Returns:
            Any: A pytree with the same structure as base_outcome containing p-values
                for the parameter estimates.

        """
        msg = "Bootstrap p_values are not yet implemented."
        raise NotImplementedError(msg)

[docs]    def summary(self, ci_method="percentile", ci_level=0.95):
        """Create a summary of bootstrap results.

        Args:
            ci_method (str): Method of choice for confidence interval computation.
                The default is "percentile".
            ci_level (float): Confidence level for the calculation of confidence
                intervals. The default is 0.95.

        Returns:
            pd.DataFrame: The estimation summary as a DataFrame containing information
                on the mean, standard errors, as well as the confidence intervals.
                Soon this will be a pytree.

        """
        registry = get_registry(extended=True)
        names = leaf_names(self.base_outcome, registry=registry)
        summary_data = _calulcate_summary_data_bootstrap(
            self, ci_method=ci_method, ci_level=ci_level
        )
        summary = calculate_estimation_summary(
            summary_data=summary_data,
            names=names,
            free_names=names,
        )
        return summary


def _calulcate_summary_data_bootstrap(bootstrap_result, ci_method, ci_level):
    lower, upper = bootstrap_result.ci(ci_method=ci_method, ci_level=ci_level)
    summary_data = {
        "value": bootstrap_result.base_outcome,
        "standard_error": bootstrap_result.se(),
        "ci_lower": lower,
        "ci_upper": upper,
        "p_value": np.full(len(lower), np.nan),  # p-values are not implemented yet
    }
    return summary_data