Source code for estimagic.visualization.estimation_table

import re
from copy import deepcopy
from functools import partial
from pathlib import Path
from warnings import warn
from estimagic.compat import pd_df_map

import numpy as np
import pandas as pd


# Shared ``np.testing.suppress_warnings`` instance that filters pandas'
# ``PerformanceWarning``. It is applied as a decorator to the public table functions
# defined below.
suppress_performance_warnings = np.testing.suppress_warnings()
suppress_performance_warnings.filter(category=pd.errors.PerformanceWarning)


@suppress_performance_warnings
def estimation_table(
    models,
    *,
    return_type="dataframe",
    render_options=None,
    show_col_names=True,
    show_col_groups=None,
    show_index_names=False,
    show_inference=True,
    show_stars=True,
    show_footer=True,
    custom_param_names=None,
    custom_col_names=None,
    custom_col_groups=None,
    custom_index_names=None,
    custom_notes=None,
    confidence_intervals=False,
    significance_levels=(0.1, 0.05, 0.01),
    append_notes=True,
    notes_label="Note:",
    stats_options=None,
    number_format=("{0:.3g}", "{0:.5f}", "{0:.4g}"),
    add_trailing_zeros=True,
    escape_special_characters=True,
    siunitx_warning=True,
):
    r"""Generate html or LaTeX tables from a list of models.

    The function can create publication quality tables in various formats from
    statsmodels or estimagic results.

    It allows for extensive customization via optional arguments and almost limitless
    flexibility when using a two-stage approach where the ``return_type`` is set to
    ``"render_inputs"``, the resulting dictionary representation of the table is
    modified and that modified version is then passed to ``render_latex`` or
    ``render_html``.

    The formatting of the numbers in the table is completely configurable via the
    ``number_format`` argument. By default we round to three significant digits (i.e.
    the three leftmost non-zero digits are displayed). This is very different from
    other table packages and motivated by the fact that most estimation tables give
    a wrong feeling of precision by showing too many decimal points.

    Args:
        models (list): list of estimation results. The models can come from
            statsmodels or be constructed from the outputs of
            `estimagic.estimate_ml` or `estimagic.estimate_msm`. With a little bit
            of work it is also possible to construct them out of R or other results.
            If a model is not a statsmodels result it must be a dictionary with the
            entries "params" (a DataFrame with value column), "info" (a dictionary
            with summary statistics such as "n_obs", "rsquared", ...) and "name"
            (a string), or a DataFrame with value column. If a model is a
            statsmodels result, model.endog_names is used as name and the rest is
            extracted from corresponding statsmodels attributes. The model names do
            not have to be unique but if they are not, models with the same name
            need to be grouped together.
        return_type (str): Can be "dataframe", "latex", "html", "render_inputs" or a
            file path with the extension .tex or .html. If "render_inputs" is
            passed, a dictionary with the entries "body", "footer" and other
            information is returned. The entries can be modified by the user (e.g.
            change formatting, rename columns or index, ...) and then passed to
            ``render_latex`` or ``render_html``. Default "dataframe".
        render_options (dict): a dictionary with keyword arguments that are passed
            to df.style.to_latex or df.style.to_html, depending on the return_type.
            The default is None.
        show_col_names (bool): If True, the column names are displayed. The default
            column names are the model names if the model names are unique,
            otherwise (1), (2), etc.. Default True.
        show_col_groups (bool): If True, the column groups are displayed. The
            default column groups are the model names if the model names are not
            unique and undefined otherwise. Default None. None means that the
            column groups are displayed if they are defined.
        show_index_names (bool): If True, the index names are displayed. Default
            False. This is mostly relevant when working with estimagic style params
            DataFrames with a MultiIndex.
        show_inference (bool): If True, inference (standard errors or confidence
            intervals) are displayed below parameter values. Default True.
        show_stars (bool): a boolean variable for displaying significance stars.
            Default is True.
        show_footer (bool): a boolean variable for displaying statistics, e.g. R2,
            Obs numbers. Default is True. Which statistics are displayed and how
            they are labeled can be determined via ``stats_options``.
        custom_param_names (dict): Dictionary that is used to rename parameters. The
            keys are the old parameter names or index entries. The values are the
            new names. Default None.
        custom_col_names (dict or list): A list of column names or dict to rename
            the default column names. The default column names are the model names
            if the model names are unique, otherwise (1), (2), etc..
        custom_col_groups (dict or list): A list of column groups or dict to rename
            the default column groups. The default column groups are the model
            names if the model names are not unique and undefined otherwise.
        custom_index_names (dict or list): Dictionary or list to set the names of
            the index levels of the parameters. This is mostly relevant when
            working with estimagic style params DataFrames with a MultiIndex and
            only used if "index_names" is set to True in the render_options.
            Default None.
        custom_notes (list): A list of strings for additional notes. Default is
            None.
        confidence_intervals (bool): If True, display confidence intervals as
            inference values. Display standard errors otherwise. Default False.
        significance_levels (list): a list of floats for p value's significance
            cut-off values. This is used to generate the significance stars.
            Default is [0.1,0.05,0.01].
        append_notes (bool): A boolean variable for printing p value cutoff
            explanation and additional notes, if applicable. Default is True.
        notes_label (str): A string to print as the title of the notes section, if
            applicable. Default is "Note:".
        stats_options (dict): A dictionary that determines which statistics (e.g.
            R-Squared, No. of Observations) are displayed and how they are labeled.
            The keys are the names of the statistics inside the model['info']
            dictionary or attribute names of a statsmodels results object. The
            values are the new labels to be displayed for those statistics, i.e.
            the set of the values is used as row names in the table.
        number_format (int, str, iterable or callable): A callable, iterable,
            integer or string that is used to apply string formatter(s) to floats
            in the table. Default ("{0:.3g}", "{0:.5f}", "{0:.4g}").
        add_trailing_zeros (bool): If True, format floats such that they have same
            number of digits after the decimal point. Default True.
        escape_special_characters (bool): If True, replaces special characters in
            parameter and model names with LaTeX or HTML safe sequences.
        siunitx_warning (bool): If True, print warning about LaTeX preamble to add
            for proper compilation when working with the siunitx package. Default
            True.

    Returns:
        res_table (data frame, str or dictionary): depending on the return type,
            data frame with formatted strings, a string for html or latex tables,
            or a dictionary with statistics and parameters dataframes, and strings
            for footers is returned. If the return type is a path, the function
            saves the resulting table at the given path.

    Raises:
        TypeError: If ``models`` is not a list or tuple.
        ValueError: If ``return_type`` is not one of the supported values.

    """
    if not isinstance(models, (tuple, list)):
        raise TypeError(f"models must be a list or tuple. Not: {type(models)}")
    models = [_process_model(model) for model in models]
    model_names = _get_model_names(models)
    default_col_names, default_col_groups = _get_default_column_names_and_groups(
        model_names
    )
    column_groups = _customize_col_groups(
        default_col_groups=default_col_groups, custom_col_groups=custom_col_groups
    )
    column_names = _customize_col_names(
        default_col_names=default_col_names, custom_col_names=custom_col_names
    )
    show_col_groups = _update_show_col_groups(show_col_groups, column_groups)
    stats_options = _set_default_stats_options(stats_options)
    body, footer = _get_estimation_table_body_and_footer(
        models,
        column_names,
        column_groups,
        custom_param_names,
        custom_index_names,
        significance_levels,
        stats_options,
        show_col_names,
        show_col_groups,
        show_stars,
        show_inference,
        confidence_intervals,
        number_format,
        add_trailing_zeros,
    )

    render_inputs = {
        "body": body,
        "footer": footer,
        "render_options": render_options,
    }
    # "latex" as well as any "*.tex" path end with "tex"; likewise for html.
    if return_type == "render_inputs":
        out = render_inputs
    elif str(return_type).endswith("tex"):
        out = _render_latex(
            **render_inputs,
            show_footer=show_footer,
            append_notes=append_notes,
            notes_label=notes_label,
            significance_levels=significance_levels,
            custom_notes=custom_notes,
            siunitx_warning=siunitx_warning,
            show_index_names=show_index_names,
            show_col_names=show_col_names,
            escape_special_characters=escape_special_characters,
        )
    elif str(return_type).endswith("html"):
        out = render_html(
            **render_inputs,
            show_footer=show_footer,
            append_notes=append_notes,
            notes_label=notes_label,
            custom_notes=custom_notes,
            significance_levels=significance_levels,
            show_index_names=show_index_names,
            show_col_names=show_col_names,
            escape_special_characters=escape_special_characters,
        )
    elif return_type == "dataframe":
        if show_footer:
            # Align the index names so that body and footer can be stacked.
            footer.index.names = body.index.names
            out = pd.concat([body.reset_index(), footer.reset_index()]).set_index(
                body.index.names
            )
        else:
            out = body
    else:
        # The message lists the values that are actually accepted above.
        raise ValueError(
            "Value of return type can be either of "
            "['dataframe', 'render_inputs', 'latex', 'html'] "
            f"or a path ending with '.html' or '.tex'. Not: {return_type}."
        )

    # Only genuine file paths (with a .html or .tex suffix) are written to disk.
    return_type = Path(return_type)
    if return_type.suffix not in (".html", ".tex"):
        return out
    else:
        return_type.write_text(out)
@suppress_performance_warnings
def render_latex(
    body,
    footer,
    render_options=None,
    show_footer=True,
    append_notes=True,
    notes_label="Note:",
    significance_levels=(0.1, 0.05, 0.01),
    custom_notes=None,
    siunitx_warning=True,
    show_index_names=False,
    show_col_names=True,
    show_col_groups=True,
    escape_special_characters=True,
):
    r"""Return estimation table in LaTeX format as string.

    Public entry point that forwards all of its arguments unchanged to
    ``_render_latex``.

    Args:
        body (pandas.DataFrame): DataFrame with formatted strings of parameter
            values, inferences (standard errors or confidence intervals, if
            applicable) and significance stars (if applicable).
        footer (pandas.DataFrame): DataFrame with formatted strings of summary
            statistics (such as number of observations, r-squared, etc.)
        render_options (dict): Custom keyword arguments for
            ``pandas.io.formats.style.Styler.to_latex``; they update the default
            options. For example ``siunitx`` (bool), which is set to True
            internally, structures the table to be compatible with the siunitx
            package.
        show_footer (bool): Whether the footer is displayed. Default True.
        append_notes (bool): Whether the p value cutoff explanation and additional
            notes are printed, if applicable. Default True.
        notes_label (str): Title of the notes section, if applicable. Default
            "Note:".
        significance_levels (list or tuple): Floats that are used as significance
            cut-off values for p values. Default (0.1, 0.05, 0.01).
        custom_notes (list): A list of strings for additional notes. Default None.
        siunitx_warning (bool): If True, warn about the LaTeX preamble that is
            needed for proper compilation with the siunitx package. Default True.
        show_index_names (bool): If True, display index names in the table.
        show_col_names (bool): If True, the column names are displayed.
        show_col_groups (bool): If True, the column groups are displayed.
        escape_special_characters (bool): If True, replaces the characters &, %, $,
            #, _, {, }, ~, ^, and \ in parameter and model names with LaTeX-safe
            sequences.

    Returns:
        latex_str (str): The resulting string with LaTeX tabular code.

    """
    forwarded_options = {
        "render_options": render_options,
        "show_footer": show_footer,
        "append_notes": append_notes,
        "notes_label": notes_label,
        "significance_levels": significance_levels,
        "custom_notes": custom_notes,
        "siunitx_warning": siunitx_warning,
        "show_index_names": show_index_names,
        "show_col_names": show_col_names,
        "show_col_groups": show_col_groups,
        "escape_special_characters": escape_special_characters,
    }
    latex_str = _render_latex(body=body, footer=footer, **forwarded_options)
    return latex_str
def _render_latex(
    body,
    footer,
    render_options=None,
    show_footer=True,
    append_notes=True,
    notes_label="Note:",
    significance_levels=(0.1, 0.05, 0.01),
    custom_notes=None,
    siunitx_warning=True,
    show_index_names=False,
    show_col_names=True,
    show_col_groups=True,
    escape_special_characters=True,
):
    """Build the LaTeX tabular string from a formatted body and footer.

    See docstring of render_latex for a description of the arguments.

    Returns:
        latex_str (str): The LaTeX code of the table.

    Raises:
        ValueError: If the installed pandas version is older than 1.4.

    """
    # Compare (major, minor) numerically. Comparing the raw version strings is
    # lexicographic and would e.g. sort "1.10.0" before "1.4.0".
    pandas_version = tuple(
        int(part) for part in re.findall(r"\d+", pd.__version__)[:2]
    )
    if pandas_version < (1, 4):
        raise ValueError(
            r"""render_latex or estimation_table with return_type="latex" requires
            pandas 1.4.0 or higher. Update to a newer version of pandas or use
            estimation_table with return_type="render_inputs" and manually render
            those results using the DataFrame.to_latex method.
            """
        )
    if siunitx_warning:
        warn(
            r"""Proper LaTeX compilation requires the package siunitx and adding
                    \sisetup{
                        input-symbols            = (),
                        table-align-text-post    = false,
                        group-digits             = false,
                    }
                    to your main tex file. To turn
                    this warning off set value of siunitx_warning = False"""
        )
    # Work on a copy so that the caller's body is never modified.
    body = body.copy(deep=True)
    try:
        # Rows with an empty first index entry are checked for ";" to detect
        # confidence intervals.
        ci_in_body = body.loc[("",)][body.columns[0]].str.contains(";").any()
    except KeyError:
        ci_in_body = False

    if ci_in_body:
        # Wrap those cells in braces.
        body.loc[("",)] = pd_df_map(body.loc[("",)], "{{{}}}".format).values
    if body.columns.nlevels > 1:
        column_groups = body.columns.get_level_values(0)
    else:
        column_groups = None

    group_to_col_position = _create_group_to_col_position(column_groups)
    n_levels = body.index.nlevels
    n_columns = len(body.columns)

    escape_special_characters = "latex" if escape_special_characters else None
    body_styler = _get_updated_styler(
        body,
        show_index_names=show_index_names,
        show_col_names=show_col_names,
        show_col_groups=show_col_groups,
        escape_special_characters=escape_special_characters,
    )
    default_options = {
        "multicol_align": "c",
        "hrules": True,
        "siunitx": True,
        "column_format": "l" * n_levels + "S" * n_columns,
        "multirow_align": "t",
    }
    if render_options:
        default_options.update(render_options)
    latex_str = body_styler.to_latex(**default_options)

    if group_to_col_position:
        # One \cmidrule per column group, inserted right after the first table row.
        temp_str = "\n"
        for positions in group_to_col_position.values():
            max_col = max(positions) + n_levels + 1
            min_col = min(positions) + n_levels + 1
            temp_str += f"\\cmidrule(lr){{{min_col}-{max_col}}}"
            temp_str += "\n"
        first_row, remainder = latex_str.split("\\\\", 1)
        latex_str = first_row + "\\\\" + temp_str + remainder

    # Drop everything from \bottomrule on; footer, notes and the closing lines are
    # appended manually below.
    latex_str = latex_str.split("\\bottomrule")[0]
    if show_footer:
        footer = footer.copy(deep=True)
        footer = footer.apply(_center_align_integers_and_non_numeric_strings, axis=1)
        stats_str = footer.style.to_latex(**default_options)
        # Keep only the rows of the footer table, i.e. the part between its header
        # rule and its \bottomrule.
        if "\\midrule" in stats_str:
            footer_rows = stats_str.split("\\midrule")[1]
        else:
            footer_rows = stats_str.split("\\toprule")[1]
        latex_str += "\\midrule" + footer_rows.split("\\bottomrule")[0]

    notes = _generate_notes_latex(
        append_notes, notes_label, significance_levels, custom_notes, body
    )
    latex_str += notes
    latex_str += "\\bottomrule\n\\end{tabular}\n"
    if latex_str.startswith("\\begin{table}"):
        latex_str += "\n\\end{table}\n"
    return latex_str
def render_html(
    body,
    footer,
    render_options=None,
    show_footer=True,
    append_notes=True,
    notes_label="Note:",
    custom_notes=None,
    significance_levels=(0.1, 0.05, 0.01),
    show_index_names=False,
    show_col_names=True,
    show_col_groups=True,
    escape_special_characters=True,
    **kwargs,  # noqa: ARG001
):
    """Return estimation table in html format as string.

    Args:
        body (pandas.DataFrame): DataFrame with formatted strings of parameter
            values, inferences (standard errors or confidence intervals, if
            applicable) and significance stars (if applicable).
        footer (pandas.DataFrame): DataFrame with formatted strings of summary
            statistics (such as number of observations, r-squared, etc.)
        render_options (dict): A dictionary with custom kwargs that are passed to
            the ``to_html`` method of the pandas Styler and update the default
            options.
        show_footer (bool): a boolean variable for displaying footer_df. Default
            True.
        append_notes (bool): A boolean variable for printing p value cutoff
            explanation and additional notes, if applicable. Default is True.
        notes_label (str): A string to print as the title of the notes section, if
            applicable. Default is "Note:".
        custom_notes (list): A list of strings for additional notes. Default is
            None.
        significance_levels (list or tuple): a list of floats for p value's
            significance cut-off values. Default is (0.1, 0.05, 0.01).
        show_index_names (bool): If True, display index names in the table.
        show_col_names (bool): If True, the column names are displayed.
        show_col_groups (bool): If True, the column groups are displayed.
        escape_special_characters (bool): If True, replace the characters &, <, >,
            ', and " in parameter and model names with HTML-safe sequences.
        **kwargs: Accepted and ignored.

    Returns:
        html_str (str): The resulting string with html tabular code.

    Raises:
        ValueError: If the installed pandas version is older than 1.4.

    """
    # Compare (major, minor) numerically. Comparing the raw version strings is
    # lexicographic and would e.g. sort "1.10.0" before "1.4.0".
    pandas_version = tuple(
        int(part) for part in re.findall(r"\d+", pd.__version__)[:2]
    )
    if pandas_version < (1, 4):
        raise ValueError(
            r"""render_html or estimation_table with return_type="html" requires
            pandas 1.4.0 or higher. Update to a newer version of pandas or use
            estimation_table with return_type="render_inputs" and manually render
            those results using the DataFrame.to_html method.
            """
        )
    n_levels = body.index.nlevels
    n_columns = len(body.columns)

    escape_special_characters = "html" if escape_special_characters else None
    body_styler = _get_updated_styler(
        body,
        show_index_names=show_index_names,
        show_col_names=show_col_names,
        show_col_groups=show_col_groups,
        escape_special_characters=escape_special_characters,
    )
    default_options = {"exclude_styles": True}
    if render_options:
        default_options.update(render_options)
    # Cut off the closing tags; they are re-added after footer and notes.
    html_str = body_styler.to_html(**default_options).split("</tbody>\n</table>")[0]
    if show_footer:
        # Horizontal line that spans the whole table and separates body and footer.
        stats_str = """<tr><td colspan="{}" style="border-bottom: 1px solid black">
            </td></tr>""".format(
            n_levels + n_columns
        )
        # Only the rows of the footer table are kept, not its header or closing tags.
        stats_str += (
            footer.style.to_html(**default_options)
            .split("</thead>\n")[1]
            .split("</tbody>\n</table>")[0]
        )
        # Strip curly braces that are adjacent to digits, parentheses or braces.
        stats_str = re.sub(r"(?<=[\d)}{)])}", "", re.sub(r"{(?=[}\d(])", "", stats_str))
        html_str += stats_str
    notes = _generate_notes_html(
        append_notes, notes_label, significance_levels, custom_notes, body
    )
    html_str += notes
    html_str += "</tbody>\n</table>"
    return html_str
def _process_model(model): """Check model validity, convert to dictionary. Args model: Estimation result. See docstring of estimation_table for more info. Returns: processed_model: A dictionary with keys params, info and name. """ if isinstance(model, dict): params = model["params"].copy(deep=True) info = model.get("info", {}) name = model.get("name", "") elif isinstance(model, pd.DataFrame): params = model.copy(deep=True) info = {} name = None else: try: params = _extract_params_from_sm(model) info = {**_extract_info_from_sm(model)} name = info.pop("name") except (KeyboardInterrupt, SystemExit): raise except Exception as e: raise TypeError( f"""Model can be of type dict, pd.DataFrame or a statsmodels result. Model {model} is of type {type(model)}.""" ) from e if "pvalue" in params.columns: params = params.rename(columns={"pvalue": "p_value"}) processed_model = {"params": params, "info": info, "name": name} return processed_model def _get_estimation_table_body_and_footer( models, column_names, column_groups, custom_param_names, custom_index_names, significance_levels, stats_options, show_col_names, show_col_groups, show_stars, show_inference, confidence_intervals, number_format, add_trailing_zeros, ): """Create body and footer blocs with significance stars and inference values. Applies number formatting to parameters and summary statitistics. Concatinates infere values to parameter values if applicable, Adds significance stars if applicable. Args: models (list): List of dictionaries with keys 'params', 'info' and 'name'. column_names (list): List of strigs to display as names of the model columns in estimation table. column_groups (list or NoneType): If defined, list of strings to display as names of groups of model columns in estimation table. custom_param_names (dict or list): A list of strings to display as parameter names or a mapping from original to custom paramter names. 
custom_index_names (dict or list): Dictionary or list to set the names of the index levels of the parameters. significance_levels (list): a list of floats for p value's significance cutt-off values. stats_options (dict): A dictionary with displayed statistics names as keys, and statistics names to be retrieved from model['info'] as values show_col_names (bool): If True, the column names are displayed. show_col_groups (bool): If True, the column groups are displayed. show_stars (bool): a boolean variable for printing significance stars. show_inference(bool): If True, inference (standard errors or confidence intervals) below param values. confidence_intervals (bool): If True, display confidence intervals as inference values. number_format (int, str, iterable or callable): A callable, iterable, integer or callable that is used to apply string formatter(s) to floats in the table. add_trailing_zeros (bool): If True, format floats such that they have same number of digits after the decimal point. Returns: body (DataFrame): DataFrame data frame with formatted strings of parameter and inference values and significance stars to display in estimation table. footer (DataFrame): DataFrame with formatted strings of summary statistics to display at the bottom of estimation table. 
""" body, max_trail = _build_estimation_table_body( models, column_names, column_groups, custom_param_names, custom_index_names, show_col_names, show_col_groups, show_inference, show_stars, confidence_intervals, significance_levels, number_format, add_trailing_zeros, ) footer = _build_estimation_table_footer( models, stats_options, significance_levels, show_stars, number_format, add_trailing_zeros, max_trail, ) footer.columns = body.columns return body, footer def _build_estimation_table_body( models, column_names, column_groups, custom_param_names, custom_index_names, show_col_names, show_col_groups, show_inference, show_stars, confidence_intervals, significance_levels, number_format, add_trailing_zeros, ): """Create body bloc significance stars and inference values. Applies number formatting to parameters. Concatinates inference values to parameter values if applicable. Adds significance stars if applicable. Args: models (list): List of dictionaries with keys 'params', 'info' and 'name'. column_names (list): List of strigs to display as names of the model columns in estimation table. column_groups (list or NoneType): If defined, list of strings to display as names of groups of model columns in estimation table. custom_param_names (dict or list): A list of strings to display as parameter names or a mapping from original to custom paramter names. custom_index_names (dict or list): Dictionary or list to set the names of the index levels of the parameters. significance_levels (list): a list of floats for p value's significance cutt-off values. show_col_names (bool): If True, the column names are displayed. show_col_groups (bool): If True, the column groups are displayed. show_stars (bool): a boolean variable for printing significance stars. show_inference(bool): If True, inference (standard errors or confidence intervals) below param values. confidence_intervals (bool): If True, display confidence intervals as inference values. 
number_format (int, str, iterable or callable): A callable, iterable, integer or callable that is used to apply string formatter(s) to floats in the table. add_trailing_zeros (bool): If True, format floats such that they have same number of digits after the decimal point. Returns: body (DataFrame): DataFrame data frame with formatted strings of parameter and inference values and significance stars to display in estimation table. max_trail (int): Integer that shows the maximum number of digits after a decimal point in the parameters DataFrame. Is passed to `_build_estimation_table_footer` to get same number of trailing zeros as in parameters DataFrame and torender_latex for formatting tables in siunitx package. """ dfs, max_trail = _reindex_and_float_format_params( models, show_inference, confidence_intervals, number_format, add_trailing_zeros ) to_convert = [] if show_stars: for df, mod in zip(dfs, models): to_convert.append( pd.concat([df, mod["params"].reindex(df.index)["p_value"]], axis=1) ) else: to_convert = dfs # convert DataFrames to string series with inference and siginificance # information. to_concat = [ _convert_frame_to_string_series( df, significance_levels, show_stars, ) for df in to_convert ] df = pd.concat(to_concat, axis=1) df = _process_frame_indices( df=df, custom_param_names=custom_param_names, custom_index_names=custom_index_names, show_col_names=show_col_names, show_col_groups=show_col_groups, column_names=column_names, column_groups=column_groups, ) return df, max_trail def _build_estimation_table_footer( models, stats_options, significance_levels, show_stars, number_format, add_trailing_zeros, max_trail, ): """Create footer bloc of estimation table. Applies number formatting to parameters and summary statitistics. Concatinates infere values to parameter values if applicable, Adds significance stars if applicable. Args: models (list): List of dictionaries with keys 'params', 'info' and 'name'. 
stats_options (dict): A dictionary with displayed statistics names as keys, and statistics names to be retrieved from model['info'] as values significance_levels (list): a list of floats for p value's significance cutt-off values. number_format (int, str, iterable or callable): A callable, iterable, integer or callable that is used to apply string formatter(s) to floats in the table. add_trailing_zeros (bool): If True, format floats such that they haave same number of digits after the decimal point. max_trail (int): If add_trailing_zeros is True, add corresponding number of trailing zeros to floats in the stats DataFrame to have number of digits after a decimal point equal to max_trail for each float. Returns: footer (DataFrame): DataFrame with formatted strings of summary statistics to display at the bottom of estimation table. """ to_concat = [ _create_statistics_sr( mod, stats_options, significance_levels, show_stars, number_format, add_trailing_zeros, max_trail, ) for mod in models ] stats = pd.concat(to_concat, axis=1) return stats def _reindex_and_float_format_params( models, show_inference, confidence_intervals, number_format, add_trailing_zeros ): """Reindex all params DataFrames with a common index and apply number formatting.""" dfs = _get_params_frames_with_common_index(models) cols_to_format = _get_cols_to_format(show_inference, confidence_intervals) formatted_frames, max_trail = _apply_number_formatting_frames( dfs, cols_to_format, number_format, add_trailing_zeros ) return formatted_frames, max_trail def _get_params_frames_with_common_index(models): """Get a list of params frames, reindexed with a common index.""" dfs = [model["params"] for model in models] common_index = _get_common_index(dfs) out = [model["params"].reindex(common_index) for model in models] return out def _get_common_index(dfs): """Get common index from a list of DataFrames.""" common_index = [] for d_ in dfs: common_index += [ind for ind in d_.index.to_list() if ind not in 
common_index] return common_index def _get_cols_to_format(show_inference, confidence_intervals): """Get the list of names of columns that need to be formatted. By default, formatting is applied to parameter values. If inference values need to displayed, adds confidence intervals or standard erros to the list. """ cols = ["value"] if show_inference: if confidence_intervals: cols += ["ci_lower", "ci_upper"] else: cols.append("standard_error") return cols def _apply_number_formatting_frames(dfs, columns, number_format, add_trailing_zeros): """Apply string formatter to specific columns of a list of DataFrames.""" raw_formatted = [ _apply_number_format(df[columns], number_format, format_integers=False) for df in dfs ] max_trail = int(max([_get_digits_after_decimal(df) for df in raw_formatted])) if add_trailing_zeros: formatted = [ _apply_number_format(df, max_trail, format_integers=True) for df in raw_formatted ] else: formatted = raw_formatted return formatted, max_trail def _update_show_col_groups(show_col_groups, column_groups): """Set the value of show_col_groups to False or True given column_groups. Updates the default None to True if column_groups is not None. Sets to False otherwise. """ if show_col_groups is None: if column_groups is not None: show_col_groups = True else: show_col_groups = False return show_col_groups def _set_default_stats_options(stats_options): """Define some default summary statistics to display in estimation table.""" if stats_options is None: stats_options = { "n_obs": "Observations", "rsquared": "R$^2$", "rsquared_adj": "Adj. R$^2$", "resid_std_err": "Residual Std. Error", "fvalue": "F Statistic", } else: if not isinstance(stats_options, dict): raise TypeError( f"""stats_options can be of types dict or NoneType. Not: {type(stats_options)}.""" ) return stats_options def _get_model_names(processed_models): """Get names of model names if defined, set based on position otherwise. 
def _check_order_of_model_names(model_names):
    """Check identically named models are adjacent.

    Args:
        model_names (list): List of model names.

    Raises:
        ValueError: if models that share a name are not next to each other.

    """
    group_to_col_index = _create_group_to_col_position(model_names)
    for positions in group_to_col_index.values():
        # Adjacent repetitions occupy a gap-free run of column positions.
        if positions != list(range(positions[0], positions[-1] + 1)):
            raise ValueError(
                "If there are repetitions in model_names, models with the "
                f"same name need to be adjacent. You provided: {model_names}"
            )


def _get_default_column_names_and_groups(model_names):
    """Get column names and groups to display in the estimation table.

    Args:
        model_names (list): List of model names.

    Returns:
        col_names (list): List of estimation column names to display in estimation
            table. Same as model_names if model_names are unique. Given by column
            position (counting from 1) in parentheses otherwise.
        col_groups (list or NoneType): If defined, list of strings unique values of
            which will define column groups. Not defined if model_names are unique.

    """
    if len(set(model_names)) == len(model_names):
        col_groups = None
        col_names = model_names
    else:
        col_groups = model_names
        col_names = [f"({i + 1})" for i in range(len(model_names))]
    return col_names, col_groups


def _customize_col_groups(default_col_groups, custom_col_groups):
    """Change default (inferred) column group titles using custom column groups.

    Args:
        default_col_groups (list or NoneType): The inferred column groups.
        custom_col_groups (list or dict): Dictionary mapping default column group
            titles to custom column group titles, if the default column groups are
            defined. Must be a list otherwise.

    Returns:
        col_groups (list): Column groups to display in estimation table.

    Raises:
        ValueError: if there are no default column groups and custom_col_groups is
            not a list.
        TypeError: if default column groups exist and custom_col_groups is neither
            a list nor a dictionary.

    """
    if not custom_col_groups:
        return default_col_groups

    if not default_col_groups:
        if not isinstance(custom_col_groups, list):
            raise ValueError(
                "With unique model names, multiple models can't be grouped under "
                "common group name. Provide list of unique group names instead, "
                "if you wish to add column level."
            )
        return custom_col_groups

    if isinstance(custom_col_groups, list):
        return custom_col_groups
    if isinstance(custom_col_groups, dict):
        return pd.Series(default_col_groups).replace(custom_col_groups).to_list()

    # Report the type of the offending argument. The result variable is not
    # assigned on this path and must not be referenced in the message.
    raise TypeError(
        "Invalid type for custom_col_groups. Can be either list or dictionary, "
        f"or NoneType. Not: {type(custom_col_groups)}."
    )


def _customize_col_names(default_col_names, custom_col_names):
    """Change default (inferred) column names using custom column names.

    Args:
        default_col_names (list): The default (inferred) column names.
        custom_col_names (list or dict): Dictionary mapping default column names to
            custom column names, or list to display as the name of each model
            column.

    Returns:
        column_names (list): The column names to display in the estimation table.

    Raises:
        ValueError: if custom_col_names is a list of the wrong length or a nested
            list.
        TypeError: if custom_col_names is neither a list, a dictionary nor falsy.

    """
    if not custom_col_names:
        col_names = default_col_names
    elif isinstance(custom_col_names, dict):
        col_names = list(pd.Series(default_col_names).replace(custom_col_names))
    elif isinstance(custom_col_names, list):
        if not len(custom_col_names) == len(default_col_names):
            raise ValueError(
                "If provided as a list, custom_col_names should have same length "
                "as default_col_names. Lenght of custom_col_names "
                f"{len(custom_col_names)} !=length of default_col_names "
                f"{len(default_col_names)}"
            )
        elif any(isinstance(i, list) for i in custom_col_names):
            raise ValueError("Custom_col_names cannot be a nested list")
        col_names = custom_col_names
    else:
        # The result variable is unassigned here, so the message reports the
        # type of the argument that was actually passed.
        raise TypeError(
            "Invalid type for custom_col_names. Can be either list or dictionary, "
            f"or NoneType. Not: {type(custom_col_names)}."
        )
    return col_names


def _create_group_to_col_position(column_groups):
    """Get mapping from column groups to column positions.

    Args:
        column_groups (list or NoneType): The column groups to display in the
            estimation table.

    Returns:
        group_to_col_index (dict or NoneType): The mapping from column group titles
            to the list of column positions carrying that title, with keys in
            order of first appearance. None if column_groups is None.

    """
    if column_groups is None:
        return None
    group_to_col_index = {}
    for position, group in enumerate(column_groups):
        group_to_col_index.setdefault(group, []).append(position)
    return group_to_col_index


def _convert_frame_to_string_series(
    df,
    significance_levels,
    show_stars,
):
    """Return processed value series with significance stars and inference information.

    Args:
        df (DataFrame): params DataFrame of the model. The "value" column and, if
            present, the "ci_lower"/"ci_upper" or "standard_error" columns are
            concatenated with strings, so they have to hold strings. "p_value" is
            only read if show_stars is True.
        significance_levels (list): see main docstring
        show_stars (bool): see main docstring

    Returns:
        sr (pd.Series): string series with values and inferences.

    """
    value_sr = df["value"]
    if show_stars:
        sig_bins = [-1, *sorted(significance_levels), 2]
        star_labels = [
            "*" * (len(significance_levels) - i)
            for i in range(len(significance_levels) + 1)
        ]
        stars = (
            pd.cut(df["p_value"], bins=sig_bins, labels=star_labels)
            .astype("str")
            .replace("nan", "")
            .replace(np.nan, "")
        )
        # Build a new series instead of using "+=", which would write the stars
        # back into the caller's DataFrame column.
        value_sr = value_sr + "$^{" + stars + " }$"
    if "ci_lower" in df:
        inference_sr = "(" + df["ci_lower"] + r";" + df["ci_upper"] + ")"
        sr = _combine_series(value_sr, inference_sr)
    elif "standard_error" in df:
        inference_sr = "(" + df["standard_error"] + ")"
        sr = _combine_series(value_sr, inference_sr)
    else:
        sr = value_sr
    # replace entries that contain no digit (e.g. empty braces) with empty string
    sr = sr.where(sr.apply(lambda x: bool(re.search(r"\d", x))), "")
    sr.name = ""
    return sr


def _combine_series(value_sr, inference_sr):
    """Merge value and inference series.

    Return string series with parameter values and precision values below
    respective param values.

    Args:
        value_sr (Series): string series of estimated parameter values
        inference_sr (Series): string series of inference values

    Returns:
        series: combined string series of param and inference values

    """
    value_df = value_sr.to_frame(name="")
    original_cols = value_df.columns
    value_df = value_df.reset_index(drop=False)
    # Every column added by reset_index is a former index level.
    index_names = [item for item in value_df.columns if item not in original_cols]
    # set the index to even numbers, starting at 0
    value_df.index = value_df.index * 2
    inference_df = inference_sr.to_frame(name="").reset_index(drop=False)
    # set the index to odd numbers, starting at 1
    inference_df.index = (inference_df.index * 2) + 1
    # Blank the innermost index level on the inference rows.
    inference_df[index_names[-1]] = ""
    # Sorting the interleaved even/odd positions puts each inference row directly
    # below its value row.
    df = pd.concat([value_df, inference_df]).sort_index()
    df = df.set_index(index_names, drop=True)
    return df[""]


def _create_statistics_sr(
    model,
    stats_options,
    significance_levels,
    show_stars,
    number_format,
    add_trailing_zeros,
    max_trail,
):
    """Process statistics values, return string series.

    Args:
        model (estimation result): see main docstring
        stats_options (dict): see main docstring
        significance_levels (list): see main docstring
        show_stars (bool): see main docstring
        number_format (int, str, iterable or callable): see main docstring
        add_trailing_zeros (bool): If True, format floats such that they have same
            number of digits after the decimal point.
        max_trail (int): If add_trailing_zeros is True, add corresponding number of
            trailing zeros to floats in the stats DataFrame to have number of digits
            after a decimal point equal to max_trail for each float.

    Returns:
        series: string series with summary statistics values and additional info
            if applicable.

    """
    # Work on a copy because "show_dof" is popped from the options below.
    stats_options = deepcopy(stats_options)
    show_dof = stats_options.pop("show_dof") if "show_dof" in stats_options else None
    info = model["info"]
    stats_values = {
        label: info.get(key, np.nan) for key, label in stats_options.items()
    }
    raw_formatted = _apply_number_format(
        pd.DataFrame(pd.Series(stats_values)), number_format, format_integers=False
    )
    if add_trailing_zeros:
        formatted = _apply_number_format(
            raw_formatted, max_trail, format_integers=False
        )
    else:
        formatted = raw_formatted
    stats_values = formatted.to_dict()[0]
    if "fvalue" in info and "F Statistic" in stats_values:
        if show_stars and "f_pvalue" in info:
            sig_bins = [-1, *sorted(significance_levels), 2]
            sig_icon_fstat = "*" * (
                len(significance_levels) - np.digitize(info["f_pvalue"], sig_bins) + 1
            )
            stats_values["F Statistic"] = (
                stats_values["F Statistic"] + "$^{" + sig_icon_fstat + "}$"
            )
        if show_dof:
            fstat_str = "{{{}(df={};{})}}"
            stats_values["F Statistic"] = fstat_str.format(
                stats_values["F Statistic"],
                int(info["df_model"]),
                int(info["df_resid"]),
            )
    if "resid_std_err" in info and "Residual Std. Error" in stats_values:
        if show_dof:
            rse_str = "{{{}(df={})}}"
            stats_values["Residual Std. Error"] = rse_str.format(
                stats_values["Residual Std. Error"], int(info["df_resid"])
            )
    stat_sr = pd.Series(stats_values)
    # the following is to make sure statistics dataframe has as many levels of
    # indices as the parameters dataframe.
    stat_ind = np.empty((len(stat_sr), model["params"].index.nlevels - 1), dtype=str)
    stat_ind = np.concatenate(
        [stat_sr.index.values.reshape(len(stat_sr), 1), stat_ind], axis=1
    ).T
    stat_sr.index = pd.MultiIndex.from_arrays(stat_ind)
    return stat_sr.astype("str").replace("nan", "")


def _process_frame_indices(
    df,
    custom_param_names,
    custom_index_names,
    show_col_names,
    show_col_groups,
    column_names,
    column_groups,
):
    """Process body DataFrame, customize the header.

    The DataFrame is modified in place and returned.

    Args:
        df (DataFrame): string DataFrame with parameter values and inferences.
        custom_param_names (dict): see main docstring
        custom_index_names (list or dict): see main docstring
        show_col_names (bool): see main docstring
        show_col_groups (bool): see main docstring
        column_names (list): List of column names to display in estimation table.
        column_groups (list): List of column group titles to display in estimation
            table.

    Returns:
        processed_df (DataFrame): string DataFrame with customized header.

    Raises:
        TypeError: if custom_index_names is truthy but neither a list nor a dict.

    """
    # The column names of the df are empty strings.
    # If show_col_names is True, rename columns using column_names.
    # Add column level if show col_groups is True.
    if show_col_names:
        if show_col_groups:
            df.columns = pd.MultiIndex.from_tuples(
                list(zip(column_groups, column_names))
            )
        else:
            df.columns = column_names
    if custom_index_names:
        if isinstance(custom_index_names, list):
            df.index.names = custom_index_names
        elif isinstance(custom_index_names, dict):
            df.rename_axis(index=custom_index_names, inplace=True)
        else:
            # The exception has to be raised, not merely constructed; otherwise
            # an invalid argument is silently ignored.
            raise TypeError(
                "Invalid custom_index_names can be of type either list or dict, "
                f"or NoneType. Not: {type(custom_index_names)}."
            )
    if custom_param_names:
        ind = df.index.to_frame()
        ind = ind.replace(custom_param_names)
        df.index = pd.MultiIndex.from_frame(ind)
    return df


def _generate_notes_latex(
    append_notes, notes_label, significance_levels, custom_notes, df
):
    """Generate the LaTex script of the notes section.

    Args:
        append_notes (bool): see main docstring
        notes_label (str): see main docstring
        significance_levels (list): see main docstring
        custom_notes (str or list): see main docstring
        df (DataFrame): params DataFrame of estimation model

    Returns:
        notes_latex (str): a string with LaTex script

    Raises:
        ValueError: if custom_notes is a list containing non-string entries.
        TypeError: if custom_notes is truthy but neither a string nor a list.

    """
    n_levels = df.index.nlevels
    n_columns = len(df.columns)
    significance_levels = sorted(significance_levels)
    notes_text = ""
    if append_notes:
        notes_text += "\\midrule\n"
        notes_text += "\\textit{{{}}} & \\multicolumn{{{}}}{{r}}{{".format(
            notes_label, str(n_columns + n_levels - 1)
        )
        # iterate over all but the last significance level since the last item of
        # the legend is not followed by a semicolon
        for i, level in enumerate(significance_levels[:-1]):
            star = "*" * (len(significance_levels) - i)
            notes_text += f"$^{{{star}}}$p$<${level};"
        notes_text += "$^{*}$p$<$" + str(significance_levels[-1]) + "} \\\\\n"
    if custom_notes:
        amp_n = "&" * n_levels
        if isinstance(custom_notes, list):
            if not all(isinstance(n, str) for n in custom_notes):
                not_str_notes = [n for n in custom_notes if not isinstance(n, str)]
                not_str_notes_types = [type(n) for n in not_str_notes]
                raise ValueError(
                    "Each custom note can only be of string type. The following "
                    f"notes: {not_str_notes} are of types {not_str_notes_types} "
                    "respectively."
                )
            for n in custom_notes:
                notes_text += (
                    " {}\\multicolumn{{{}}}{{r}}\\textit{{{}}}\\\\\n".format(
                        amp_n, n_columns, n
                    )
                )
        elif isinstance(custom_notes, str):
            notes_text += "{}\\multicolumn{{{}}}{{r}}\\textit{{{}}}\\\\\n".format(
                amp_n, n_columns, custom_notes
            )
        else:
            raise TypeError(
                "Custom notes can be either a string or a list of strings. Not: "
                f"{type(custom_notes)}."
            )
    return notes_text


def _generate_notes_html(
    append_notes, notes_label, significance_levels, custom_notes, df
):
    """Generate the html script of the notes section of the estimation table.

    Args:
        append_notes (bool): see main docstring
        notes_label (str): see main docstring
        significance_levels (list): see main docstring
        custom_notes (str or list): see main docstring
        df (DataFrame): params DataFrame of estimation model

    Returns:
        notes_html (str): a string with html script

    Raises:
        ValueError: if custom_notes is a list containing non-string entries.
        TypeError: if custom_notes is truthy but neither a string nor a list.

    """
    n_levels = df.index.nlevels
    n_columns = len(df.columns)
    significance_levels = sorted(significance_levels)
    notes_text = (
        '<tr><td colspan="{}" style="border-bottom: 1px solid black"> </td></tr>'
    ).format(n_columns + n_levels)
    if append_notes:
        notes_text += (
            ' <tr><td style="text-align: left">{}</td>'
            '<td colspan="{}" style="text-align: right">'
        ).format(notes_label, n_columns + n_levels - 1)
        for i, level in enumerate(significance_levels[:-1]):
            stars = "*" * (len(significance_levels) - i)
            notes_text += f"<sup>{stars}</sup>p&lt;{level}; "
        notes_text += f"<sup>*</sup>p&lt;{significance_levels[-1]} </td>"
    if custom_notes:
        # The first note row has no padding before the note text while later rows
        # do; both templates are kept so the emitted markup stays unchanged.
        first_row = (
            ' <tr><td></td><td colspan="{}"style="text-align: right">{}</td></tr> '
        )
        later_row = (
            ' <tr><td></td><td colspan="{}"style="text-align: right"> {}</td></tr> '
        )
        if isinstance(custom_notes, list):
            if not all(isinstance(n, str) for n in custom_notes):
                not_str_notes = [n for n in custom_notes if not isinstance(n, str)]
                not_str_notes_types = [type(n) for n in not_str_notes]
                raise ValueError(
                    "Each custom note can only be of string type. The following "
                    f"notes: {not_str_notes} are of types {not_str_notes_types} "
                    "respectively."
                )
            for position, note in enumerate(custom_notes):
                template = first_row if position == 0 else later_row
                notes_text += template.format(n_columns + n_levels - 1, note)
        elif isinstance(custom_notes, str):
            notes_text += first_row.format(n_columns + n_levels - 1, custom_notes)
        else:
            raise TypeError(
                "Custom notes can be either a string or a list of strings, not "
                f"{type(custom_notes)}."
            )
    return notes_text


def _extract_params_from_sm(model):
    """Convert statsmodels like estimation result to estimagic like params dataframe.

    Args:
        model: object exposing ``params``, ``pvalues`` and ``bse`` attributes and a
            ``conf_int()`` method.

    Returns:
        params_df (DataFrame): DataFrame with the columns "value", "p_value",
            "standard_error", "ci_lower" and "ci_upper".

    """
    to_concat = [getattr(model, col) for col in ("params", "pvalues", "bse")]
    to_concat.append(model.conf_int())
    params_df = pd.concat(to_concat, axis=1)
    params_df.columns = ["value", "p_value", "standard_error", "ci_lower", "ci_upper"]
    return params_df


def _extract_info_from_sm(model):
    """Process statsmodels estimation result to retrieve summary statistics as dict.

    Args:
        model: object exposing the attributes read below (``rsquared``,
            ``rsquared_adj``, ``fvalue``, ``f_pvalue``, ``df_model``, ``df_resid``,
            ``scale`` and ``model.endog_names``).

    Returns:
        info (dict): summary statistics plus the entries "name", "resid_std_err"
            and "n_obs".

    """
    key_values = [
        "rsquared",
        "rsquared_adj",
        "fvalue",
        "f_pvalue",
        "df_model",
        "df_resid",
    ]
    info = {kv: getattr(model, kv) for kv in key_values}
    info["name"] = model.model.endog_names
    info["resid_std_err"] = np.sqrt(model.scale)
    info["n_obs"] = model.df_model + model.df_resid + 1
    return info


def _apply_number_format(df_raw, number_format, format_integers):
    """Apply string format to DataFrame cells.

    Args:
        df_raw (DataFrame): The DataFrame with float values to format.
        number_format (str, list, tuple, callable or int): User defined number
            format to apply to the DataFrame.
        format_integers (bool): Apply number format also to integers

    Returns:
        df_formatted (DataFrame): Formatted DataFrame.

    """
    processed_format = _process_number_format(number_format)
    df_raw = df_raw.copy(deep=True)
    if isinstance(processed_format, (list, tuple)):
        # Apply the formatters one after another, converting back to float in
        # between; only the result of the last formatter is kept as strings.
        df_formatted = df_raw.copy(deep=True).astype("float")
        for formatter in processed_format[:-1]:
            df_formatted = pd_df_map(df_formatted, formatter.format).astype("float")
        df_formatted = pd_df_map(
            df_formatted.astype("float"), processed_format[-1].format
        )
    elif isinstance(processed_format, str):
        df_formatted = pd_df_map(
            df_raw.astype("str"),
            partial(_format_non_scientific_numbers, format_string=processed_format),
        )
    elif callable(processed_format):
        df_formatted = pd_df_map(df_raw, processed_format)

    # Don't format integers: set to original value
    if not format_integers:
        integer_locs = pd_df_map(df_raw, _is_integer)
        df_formatted[integer_locs] = pd_df_map(
            df_raw[integer_locs].astype(float), "{:.0f}".format
        )
    return df_formatted


def _format_non_scientific_numbers(number_string, format_string):
    """Apply number format if the number string is not in scientific format.

    Args:
        number_string (str): string representation of a number.
        format_string (str): format string applied via ``str.format``.

    Returns:
        out (str): number_string unchanged if it contains "e", the formatted
            number otherwise.

    """
    if "e" in number_string:
        return number_string
    return format_string.format(float(number_string))


def _process_number_format(raw_format):
    """Process the user define formatter.

    Reduces cases for number format in apply_number_format.

    Args:
        raw_format (str, int, list, tuple or callable): user defined number format.

    Returns:
        processed_format: a one-element list for a string input, a fixed-point
            format string for an int input, and the input itself for a callable,
            list or tuple.

    Raises:
        TypeError: if raw_format has none of the supported types.

    """
    if isinstance(raw_format, str):
        processed_format = [raw_format]
    elif isinstance(raw_format, int):
        processed_format = f"{{0:.{raw_format}f}}"
    elif callable(raw_format) or isinstance(raw_format, (list, tuple)):
        processed_format = raw_format
    else:
        raise TypeError(
            "Number format can be either of [str, int, tuple, list, callable] "
            f"types. Not: {type(raw_format)}."
        )
    return processed_format


def _get_digits_after_decimal(df):
    """Get the maximum number of digits after a decimal point in a DataFrame.

    Entries whose string representation contains "e" are ignored.

    Args:
        df (DataFrame): DataFrame whose entries are inspected as strings.

    Returns:
        max_trail (int): largest number of characters found after the first "."
            over all columns; 0 if no entry contains a ".".

    """
    max_trail = 0
    for c in df.columns:
        as_str = df[c].astype("str")
        try:
            trail_length = (
                df[c][~as_str.str.contains("e")]
                .astype("str")
                .str.split(".", expand=True)[1]
                .astype("str")
                .replace("None", "")
                .str.len()
                .max()
            )
        except KeyError:
            # No entry in this column has a "." so the split has no second column.
            trail_length = 0
        if trail_length > max_trail:
            max_trail = trail_length
    return max_trail


def _center_align_integers_and_non_numeric_strings(sr):
    """Align integer numbers and strings at the center of model column.

    Args:
        sr (Series): string series.

    Returns:
        sr (Series): copy of the input in which integers and entries that are not
            plain numbers are wrapped in a centered ``\\multicolumn``.

    """
    sr = deepcopy(sr)
    for i in sr.index:
        if _is_integer(sr[i]):
            sr[i] = f"\\multicolumn{{1}}{{c}}{{{str(int(float(sr[i])))}}}"
        else:
            # Only the part before the first "$" (i.e. without stars) is checked.
            string_without_stars = sr[i].split("$", 1)[0]
            if not string_without_stars.replace(".", "").isnumeric():
                sr[i] = f"\\multicolumn{{1}}{{c}}{{{sr[i]}}}"
    return sr


def _get_updated_styler(
    df, show_index_names, show_col_names, show_col_groups, escape_special_characters
):
    """Return pandas.Styler object based on the data and styling options.

    Args:
        df (DataFrame): the DataFrame to style.
        show_index_names (bool): if False, index names are hidden.
        show_col_names (bool): if False, the column header is hidden.
        show_col_groups (bool): if False, level 0 of the column header is hidden.
        escape_special_characters: passed as ``escape`` to ``Styler.format_index``
            for both axes.

    Returns:
        styler (pandas.Styler): the configured styler.

    """
    styler = df.style
    if not show_index_names:
        styler = styler.hide(names=True)
    if not show_col_names:
        styler = styler.hide(axis=1)
    if not show_col_groups:
        styler = styler.hide(axis=1, level=0)
    for ax in [0, 1]:
        styler = styler.format_index(escape=escape_special_characters, axis=ax)
    return styler


def _is_integer(num):
    """Check if number is an integer (including a float with only zeros as digits).

    Args:
        num: value to check; anything accepted by ``float`` can be an integer.

    Returns:
        out (bool): True if num converts to a float with integral value. False
            otherwise, including for values that cannot be converted (e.g. text or
            None), NaN and infinity.

    """
    try:
        out = int(float(num)) == float(num)
    except (ValueError, OverflowError, TypeError):
        # ValueError: non-numeric text or NaN; OverflowError: infinity;
        # TypeError: objects float() cannot handle, such as None.
        out = False
    return out