Source code for synthgauge.metrics.propensity

"""Propensity-based utility metrics."""

import warnings
from collections import namedtuple

import numpy as np
import pandas as pd
import scipy.special
from scipy.stats import ks_2samp
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier

from ..utils import df_combine


def _combine_encode_and_pop(real, synth):
    """Get the combined, encoded real and synthetic data, and their
    origins.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.

    Returns
    -------
    combined : pandas.DataFrame
        The combined data with categorical columns one-hot encoded.
    indicator : numpy.ndarray
        An indicator for whether the data is real (0) or synthetic (1).
    """

    combined = df_combine(real, synth, source_val_real=0, source_val_synth=1)
    combined = pd.get_dummies(combined, drop_first=True)
    indicator = combined.pop("source").values

    return combined, indicator


def _get_propensity_scores(data, labels, method, **kwargs):
    """Fit a propensity model to the data and extract its scores.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe to fit propensity model to.
    labels : numpy.ndarray
        Indicator for which data are real (0) or synthetic (1).
    method : {"cart", "logr"}
        Which propensity model to use.
    **kwargs : dict, optional
        Keyword arguments passed to propensity model.

    Returns
    -------
    scores : numpy.ndarray
        Propensity score for each point in `data`.
    """

    if method == "logr":
        model = LogisticRegression
        data = PolynomialFeatures(
            2, interaction_only=True, include_bias=False
        ).fit_transform(data)

    if method == "cart":
        model = DecisionTreeClassifier

    scores = model(**kwargs).fit(data, labels).predict_proba(data)[:, 1]

    return scores


[docs]def pmse(combined, indicator, method, **kwargs):
    """Calculate the propensity score mean-squared error (pMSE).

    Parameters
    ----------
    combined : pandas.DataFrame
        The combined set of real and synthetic data.
    indicator : numpy.ndarray
        An indicator for which data are real (0) or synthetic (1).
    method : {"cart", "logr"}
        Which propensity model to use. Must be either CART (`"cart"`) or
        logistic regression with first-order interactions (`"logr"`).
    **kwargs : dict, optional
        Keyword arguments passed to propensity model.

    Returns
    -------
    float
        Propensity score mean-squared error.

    See Also
    --------
    sklearn.linear_model.LogisticRegression
    sklearn.tree.DecisionTreeClassifier

    Notes
    -----
    Propensity scores represent probabilities of group membership. By
    modelling whether an example is synthetic or not, we can use
    propensity scores as a measure of utility.

    This returns zero if the distributions are identical, and is bounded
    above by :math:`1 - c` if they are nothing alike, where :math:`c` is
    the proportion of the data that is synthetic. This method is
    therefore good for comparing multiple synthetic datasets. However,
    as this is not a test, there is no threshold distance below which we
    can claim the distributions are statistically the same.

    This function assumes that some preprocessing has been carried out
    so that the data is ready to be passed to the classification
    function. Encoding of categorical data is performed, but, for
    example, scaling is not. Without this, erroneous results may be
    returned. The logistic regression can fail to converge if many
    variables are considered. Anecdotally, this doesn't seem to
    drastically impact the propensity scores, although this should be
    investigated formally.

    Using a CART model as a classifier is recommended in the literature
    however we also support the use of logistic regression. For further
    details, see: https://doi.org/10.1111/rssa.12358
    """

    scores = _get_propensity_scores(combined, indicator, method, **kwargs)
    ideal = indicator.mean()

    return np.mean(np.square(scores - ideal))


def _pmse_logr_statistics(combined, indicator):
    """Calculate the location and scale of pMSE in the null case where
    the real and synthetic datasets are formed from identical processes.

    Parameters
    ----------

    combined : pandas.DataFrame
        Dataframe containing the combined real and synthetic data.
    indicator : numpy.ndarray
        Indicator for whether data are real (0) or synthetic (1).

    Returns
    -------
    loc : float
        Expectation of pMSE in the null case.
    scale : float
        Standard deviation of pMSE in the null case.

    Notes
    -----
    It has been shown that the null case is distributed as a multiple of
    a :math:`\\chi^2` distribution with :math:`k-1` degrees of freedom.

    Therefore, its expectation is:

    .. math::

        E(pMSE) = \\frac{(k - 1)(1 - c)^2}{N}

    and its standard deviation is:

    .. math::

        sd(pMSE) = \\frac{c \\sqrt{2(k - 1)} (1 - c)^2}{N}

    where :math:`k` is the number of predictors used in the model,
    :math:`c` is the proportion of synthetic data, and :math:`N` is the
    total number of data points.

    Here, all features and first-order interactions are used in the
    model. Let :math:`m` be the number of features, then:

    .. math::

        k = m + \\binom{m}{2}

    Further explanation and derivation of these results can be found at:
    https://doi.org/10.1111/rssa.12358
    """

    num_rows, num_cols = combined.shape
    num_predictors = num_cols + scipy.special.binom(num_cols, 2)
    prop_synth = indicator.mean()

    loc = (num_predictors - 1) * (1 - prop_synth) ** 2 / num_rows
    scale = (
        prop_synth
        * np.sqrt(2 * (num_predictors - 1))
        * (1 - prop_synth) ** 2
        / num_rows
    )

    return loc, scale


def _pmse_cart_stats_boot(combined, indicator, trials, **kwargs):
    """Null statistic estimation for CART propensity via bootstrapping.

    Estimate the location and scale of pMSE in the null case by taking
    several bootstrapped samples of the real data, each being twice the
    length of the original data. Half of the data are listed as real
    while the other half are synthetic. These labelled data are then run
    through the usual CART pMSE calculation procedure to produce a set.

    The set of pMSE values are then summarised to give the estimated
    mean and standard deviation.

    Parameters
    ----------
    combined : pandas.DataFrame
        Dataframe containing the stacked real and synthetic data.
    trials : int
        Number of samples to take in the estimate.
    **kwargs : dict, optional
        Keyword arguments passed to
        `sklearn.tree.DecisionTreeClassifier`.

    Returns
    -------
    loc : float
        Estimated expectation of pMSE in the null case.
    scale : float
        Estimated standard deviation of pMSE in the null case.

    Notes
    -----
    This estimation method removes any dependency on the synthetic data
    being assessed, improving on the permutation-based method set out in
    https://doi.org/10.1111/rssa.12358.

    A description of this method and its justification are presented in
    https://doi.org/10.29012/jpc.748. This implementation is adapted
    from the source code submitted with the aforementioned paper:
    https://github.com/ClaireMcKayBowen/Code-for-NIST-PSCR-Differential-Privacy-Synthetic-Data-Challenge

    Note that the `random_state` keyword argument is used to
    (independently) bootstrap samples and to fit the CART model. Without
    specifying this, the results will not be reproducible.
    """

    rng = np.random.default_rng(kwargs.get("random_state", None))
    original = combined[indicator == 0]
    rows = len(original)

    pmses = []
    for perm in range(trials):
        sampled = original.iloc[rng.integers(rows, size=2 * rows), :]
        indicator = np.repeat([0, 1], rows)
        pmses.append(pmse(sampled, indicator, method="cart", **kwargs))

    return np.mean(pmses), np.std(pmses)


def _pmse_cart_stats_perm(combined, indicator, num_perms, **kwargs):
    """Null statistic estimation for CART propensity via permutations.

    Estimate the location and scale of pMSE in the null case by
    repeating pMSE calculations on permuations of the indicator column
    using a CART model. The set of calculations are then summarised
    using the mean or standard deviation, respectively.

    Parameters
    ----------
    combined : pandas.DataFrame
        Dataframe containing the combined real and synthetic data.
    indicator : numpy.ndarray
        Indicator for whether data are real (0) or synthetic (1).
    num_perms : int
        The number of permutations to consider.
    **kwargs : dict, optional
        Keyword arguments passed to
        `sklearn.tree.DecisionTreeClassifer`.

    Returns
    -------
    loc : float
        Estimated expectation of pMSE in the null case.
    scale : float
        Estimated standard deviation of pMSE in the null case.

    Notes
    -----
    When using a CART propensity model, the number of predictors is
    unknown and the results used in `_pmse_logr_statistics` do not
    apply. To circumvent this, taking several permutations should
    approximate the results for "properly" synthesised data without
    knowing :math:`k` a priori. Further details of this approach are
    available in https://doi.org/10.1111/rssa.12358.

    Note that the `random_state` keyword argument is used to
    (independently) create the permutations and to fit the CART model.
    Without specifying this, the results will not be reproducible.
    """

    rng = np.random.default_rng(kwargs.get("random_state", None))

    pmses = []
    for _ in range(num_perms):
        rng.shuffle(indicator)
        pmses.append(pmse(combined, indicator, method="cart", **kwargs))

    return np.mean(pmses), np.std(pmses)


def _pmse_cart_statistics(combined, indicator, num_perms, estimator, **kwargs):
    """Null statistic estimation for CART-based propensity scores.

    Estimation can either be carried out using permutations of the
    combined indicator or via bootstrapping. The former is the original
    method presented in https://doi.org/10.1111/rssa.12358, while the
    latter (from https://doi.org/10.29012/jpc.748) improves on this by
    removing dependence on the synthetic data.

    Parameters
    ----------
    combined : pandas.DataFrame
        Dataframe containing the real and synthetic data.
    indicator : numpy.ndarray
        Indicator for whether data are real (0) or synthetic (1).
    num_perms : int
        Number of trials to consider in the estimate.
    estimator : {"perm", "boot"}
        Which estimation process to use. By default, permutations are
        used to ensure back-compatibility.
    **kwargs : dict, optional
        Keyword arguments to be passed to
        `sklearn.tree.DecisionTreeClassifier`.

    Returns
    -------
    loc : float
        Estimated expectation of pMSE in the null case.
    scale : float
        Estimated standard deviation of pMSE in the null case.

    Warns
    -----
    FutureWarning
        The permutation method is flawed and will be removed in a
        future minor version. See https://doi.org/10.29012/jpc.748 for
        details.

    Raises
    ------
    ValueError
        If the estimation method, `estimator`, is not valid.
    """

    if estimator == "perm":
        message = (
            "The permutation method is flawed and will be removed in a future "
            "release. Consider using `estimator='boot'` instead."
        )
        warnings.warn(message, FutureWarning)

        loc, scale = _pmse_cart_stats_perm(
            combined, indicator, num_perms, **kwargs
        )

    elif estimator == "boot":
        loc, scale = _pmse_cart_stats_boot(
            combined, indicator, num_perms, **kwargs
        )

    else:
        raise ValueError("Estimation method must be `perm` or `boot`.")

    return loc, scale


[docs]def pmse_ratio(
    combined, indicator, method, num_perms=None, estimator="perm", **kwargs
):
    """The propensity score mean-squared error ratio.

    This is the ratio of observed pMSE to that expected under the null
    case, i.e.

    .. math::

        ratio(pMSE) = \\frac{pMSE}{E(pMSE)}

    Parameters
    ----------
    combined : pandas.DataFrame
        Dataframe containing the combined real and synthetic data.
    indicator : numpy.ndarray
        Indicator for whether data are real (0) or synthetic (1).
    method : {"cart", "logr"}
        Which propensity model to use. Must be either CART (`"cart"`) or
        logistic regression with first-order interactions (`"logr"`).
    num_perms : int, optional
        Number of permutations to consider when estimating the null case
        statistics with a CART model.
    estimator : {"perm", "boot"}
        Which estimation process to use with a CART model. By default,
        permutations are used to ensure back-compatibility.
    **kwargs : dict, optional
        Keyword arguments passed to the propensity model classifier.

    Returns
    -------
    float
        The observed-to-null pMSE ratio.

    Notes
    -----
    The interpretation of this metric makes more sense for synthetic
    data. The pMSE alone gives better utility as the value gets closer
    to zero, which is only attainable when the datasets are identical.
    However, when generating synthetic data, we do not want to produce
    identical entries. Rather, we want to achieve similarity between the
    distributions of the real and synthetic datasets.

    This ratio tends towards one when this is achieved, and increases
    otherwise.

    Note that the `random_state` keyword argument is used to
    (independently) create the permutations and to fit the model when
    using a CART model. Without specifying this, the results will not be
    reproducible.
    """

    observed = pmse(combined, indicator, method, **kwargs)

    if method == "logr":
        loc, _ = _pmse_logr_statistics(combined, indicator)
    if method == "cart":
        loc, _ = _pmse_cart_statistics(
            combined, indicator, num_perms, estimator, **kwargs
        )

    return observed / loc


[docs]def pmse_standardised(
    combined, indicator, method, num_perms=None, estimator="perm", **kwargs
):
    """The standardised propensity score mean-squared error.

    This takes the observed pMSE and standardises it against the null
    case, i.e.

    .. math::

        stand(pMSE) = (pMSE - E(pMSE)) / sd(pMSE)

    Parameters
    ----------
    combined : pandas.DataFrame
        Dataframe containing the combined real and synthetic data.
    indicator : numpy.ndarray
        Indicator for whether data are real (0) or synthetic (1).
    method : {"cart", "logr"}
        Which propensity model to use. Must be either CART (`"cart"`) or
        logistic regression with first-order interactions (`"logr"`).
    num_perms : int, optional
        Number of permutations to consider when estimating the null case
        statistics with a CART model.
    estimator : {"perm", "boot"}
        Which estimation process to use with a CART model. By default,
        permutations are used to ensure back-compatibility.
    **kwargs : dict, optional
        Keyword arguments passed to the propensity model.

    Returns
    -------
    float
        The null-standardised pMSE.

    Notes
    -----
    The interpretation of this metric makes more sense for synthetic
    data. The pMSE alone indicates better utility as it gets closer to
    zero, which is only attainable when the datasets are identical.
    However, when generating synthetic data, we do not want to produce
    identical entries. Rather, we want to achieve similarity between the
    distributions of the real and synthetic datasets.

    This standardised value tends towards zero when this is achieved,
    and increases in magnitude otherwise.

    Note that the `random_state` keyword argument is used to
    (independently) create the permutations and to fit the model when
    using a CART model. Without specifying this, the results will not be
    reproducible.
    """

    observed = pmse(combined, indicator, method, **kwargs)

    if method == "logr":
        loc, scale = _pmse_logr_statistics(combined, indicator)
    if method == "cart":
        loc, scale = _pmse_cart_statistics(
            combined, indicator, num_perms, estimator, **kwargs
        )

    return (observed - loc) / scale


[docs]def propensity_metrics(
    real,
    synth,
    method="cart",
    feats=None,
    num_perms=20,
    estimator="perm",
    **kwargs,
):
    """Propensity score-based metrics.

    This function calculates three metrics based on the propensity score
    mean-squared error (pMSE), all of which quantify utility by
    measuring the distinguishability of the synthetic data. That is, how
    readily real and synthetic data can be identified.

    To do this, the datasets are combined and their origins tracked by a
    boolean indicator. This combined dataset is then used to fit a
    binary classification model (CART or logistic regression with
    first-order interactions) with the indicator as the target. The
    propensity score for each row is then extracted and summarised to
    give a metric.

    The returned metrics are the observed pMSE along with the pMSE ratio
    and standardised pMSE. These second two metrics are given relative
    to the null case where the real and synthetic data are produced from
    identical processes.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    method : {"cart", "logr"}, default "cart"
        Which propensity model to use. Must be either CART (`"cart"`) or
        logistic regression with first-order interactions (`"logr"`).
    feats : list of str or None, default None
        List of features in the dataset to be used in the propensity
        model. If `None` (default), all common features are used.
    num_perms : int, default 20
        Number of permutations to consider when estimating the null case
        statistics with a CART model.
    estimator : {"perm", "boot"}
        Which estimation process to use with a CART model. By default,
        permutations are used to ensure back-compatibility.
    **kwargs : dict, optional
        Keyword arguments passed to the propensity model.

    Returns
    -------
    observed : float
        The observed pMSE.
    standard : float
        The null-standardised pMSE.
    ratio : float
        The observed-null pMSE ratio.

    Raises
    ------
    ValueError
        If `method` is not one of `'cart'` or `'logr'`.

    See Also
    --------
    sklearn.linear_model.LogisticRegression
    sklearn.tree.DecisionTreeClassifier
    synthgauge.metrics.propensity.pmse
    synthgauge.metrics.propensity.pmse_ratio
    synthgauge.metrics.propensity.pmse_standardised

    Notes
    -----
    For the CART model, `sklearn.tree.DecisionTreeClassifier` is used.
    Meanwhile, the logistic regression model uses
    `sklearn.linear_model.LogisticRegression`.

    Note that the `random_state` keyword argument is used to
    (independently) create the permutations and to fit the model when
    using a CART model. Without specifying this, the results will not be
    reproducible.

    Details on these metrics can be found at:
    https://doi.org/10.1111/rssa.12358
    """

    if method not in ("cart", "logr"):
        raise ValueError(
            f"Propensity method must be 'cart' or 'logr' not {method}."
        )

    feats = feats or real.columns.intersection(synth.columns)
    combined, indicator = _combine_encode_and_pop(real[feats], synth[feats])

    if method == "logr":
        loc, scale = _pmse_logr_statistics(combined, indicator)
    if method == "cart":
        loc, scale = _pmse_cart_statistics(
            combined, indicator, num_perms, estimator, **kwargs
        )

    observed = pmse(combined, indicator, method, **kwargs)
    standard = (observed - loc) / scale
    ratio = observed / loc

    PropensityResult = namedtuple(
        "PropensityResult",
        ("pmse", "pmse_standardised", "pmse_ratio"),
    )

    return PropensityResult(observed, standard, ratio)


[docs]def specks(real, synth, classifier, **kwargs):
    """Propensity score comparison via the Kolmogorov-Smirnov distance.

    The SPECKS metric was originally presented in
    https://arxiv.org/pdf/1803.06763.pdf and works as follows:

        1. Stack the real and synthetic data, and create a variable
           indicating whether each record is real (0) or synthetic (1).
        2. Calculate the propensity score for each record using a binary
           classifier on the indicator variable.
        3. Compute the Kolmogorov-Smirnov distance between the empirical
           CDFs for the real and synthetic propensity scores.

    The Kolmogorov-Smirnov distance is defined as the maximum difference
    between two empirical distributions. Therefore, it is bounded
    between zero and one. If the synthetic data properly resembles the
    original data then they will be indistinguishable, leading to close
    empirical CDFs.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    classifier : scikit-learn estimator
        Any `scikit-learn`-style classifier class with a `predict_proba`
        method.
    **kwargs : dict, optional
        Keyword arguments to be passed to `classifer`.

    Returns
    -------
    float
        The Kolmogorov-Smirnov distance between the real and synthetic
        propensity score CDFs.

    Notes
    -----
    The combined dataset is one-hot-encoded before being passed to the
    classifier so categorical features can be handled.

    The paper introducing SPECKS has also been published in METRON:
    https://doi.org/10.1007/s40300-021-00201-0.
    """

    combined, indicator = _combine_encode_and_pop(real, synth)
    scores = (
        classifier(**kwargs)
        .fit(combined, indicator)
        .predict_proba(combined)[:, 1]
    )

    return ks_2samp(scores[indicator == 0], scores[indicator == 1]).statistic