"""Propensity-based utility metrics."""
import warnings
from collections import namedtuple
import numpy as np
import pandas as pd
import scipy.special
from scipy.stats import ks_2samp
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from ..utils import df_combine
def _combine_encode_and_pop(real, synth):
"""Get the combined, encoded real and synthetic data, and their
origins.
Parameters
----------
real : pandas.DataFrame
Dataframe containing the real data.
synth : pandas.DataFrame
Dataframe containing the synthetic data.
Returns
-------
combined : pandas.DataFrame
The combined data with categorical columns one-hot encoded.
indicator : numpy.ndarray
An indicator for whether the data is real (0) or synthetic (1).
"""
combined = df_combine(real, synth, source_val_real=0, source_val_synth=1)
combined = pd.get_dummies(combined, drop_first=True)
indicator = combined.pop("source").values
return combined, indicator
def _get_propensity_scores(data, labels, method, **kwargs):
"""Fit a propensity model to the data and extract its scores.
Parameters
----------
data : pandas.DataFrame
Dataframe to fit propensity model to.
labels : numpy.ndarray
Indicator for which data are real (0) or synthetic (1).
method : {"cart", "logr"}
Which propensity model to use.
**kwargs : dict, optional
Keyword arguments passed to propensity model.
Returns
-------
scores : numpy.ndarray
Propensity score for each point in `data`.
"""
if method == "logr":
model = LogisticRegression
data = PolynomialFeatures(
2, interaction_only=True, include_bias=False
).fit_transform(data)
if method == "cart":
model = DecisionTreeClassifier
scores = model(**kwargs).fit(data, labels).predict_proba(data)[:, 1]
return scores
[docs]def pmse(combined, indicator, method, **kwargs):
"""Calculate the propensity score mean-squared error (pMSE).
Parameters
----------
combined : pandas.DataFrame
The combined set of real and synthetic data.
indicator : numpy.ndarray
An indicator for which data are real (0) or synthetic (1).
method : {"cart", "logr"}
Which propensity model to use. Must be either CART (`"cart"`) or
logistic regression with first-order interactions (`"logr"`).
**kwargs : dict, optional
Keyword arguments passed to propensity model.
Returns
-------
float
Propensity score mean-squared error.
See Also
--------
sklearn.linear_model.LogisticRegression
sklearn.tree.DecisionTreeClassifier
Notes
-----
Propensity scores represent probabilities of group membership. By
modelling whether an example is synthetic or not, we can use
propensity scores as a measure of utility.
This returns zero if the distributions are identical, and is bounded
above by :math:`1 - c` if they are nothing alike, where :math:`c` is
the proportion of the data that is synthetic. This method is
therefore good for comparing multiple synthetic datasets. However,
as this is not a test, there is no threshold distance below which we
can claim the distributions are statistically the same.
This function assumes that some preprocessing has been carried out
so that the data is ready to be passed to the classification
function. Encoding of categorical data is performed, but, for
example, scaling is not. Without this, erroneous results may be
returned. The logistic regression can fail to converge if many
variables are considered. Anecdotally, this doesn't seem to
drastically impact the propensity scores, although this should be
investigated formally.
Using a CART model as a classifier is recommended in the literature
however we also support the use of logistic regression. For further
details, see: https://doi.org/10.1111/rssa.12358
"""
scores = _get_propensity_scores(combined, indicator, method, **kwargs)
ideal = indicator.mean()
return np.mean(np.square(scores - ideal))
def _pmse_logr_statistics(combined, indicator):
"""Calculate the location and scale of pMSE in the null case where
the real and synthetic datasets are formed from identical processes.
Parameters
----------
combined : pandas.DataFrame
Dataframe containing the combined real and synthetic data.
indicator : numpy.ndarray
Indicator for whether data are real (0) or synthetic (1).
Returns
-------
loc : float
Expectation of pMSE in the null case.
scale : float
Standard deviation of pMSE in the null case.
Notes
-----
It has been shown that the null case is distributed as a multiple of
a :math:`\\chi^2` distribution with :math:`k-1` degrees of freedom.
Therefore, its expectation is:
.. math::
E(pMSE) = \\frac{(k - 1)(1 - c)^2}{N}
and its standard deviation is:
.. math::
sd(pMSE) = \\frac{c \\sqrt{2(k - 1)} (1 - c)^2}{N}
where :math:`k` is the number of predictors used in the model,
:math:`c` is the proportion of synthetic data, and :math:`N` is the
total number of data points.
Here, all features and first-order interactions are used in the
model. Let :math:`m` be the number of features, then:
.. math::
k = m + \\binom{m}{2}
Further explanation and derivation of these results can be found at:
https://doi.org/10.1111/rssa.12358
"""
num_rows, num_cols = combined.shape
num_predictors = num_cols + scipy.special.binom(num_cols, 2)
prop_synth = indicator.mean()
loc = (num_predictors - 1) * (1 - prop_synth) ** 2 / num_rows
scale = (
prop_synth
* np.sqrt(2 * (num_predictors - 1))
* (1 - prop_synth) ** 2
/ num_rows
)
return loc, scale
def _pmse_cart_stats_boot(combined, indicator, trials, **kwargs):
"""Null statistic estimation for CART propensity via bootstrapping.
Estimate the location and scale of pMSE in the null case by taking
several bootstrapped samples of the real data, each being twice the
length of the original data. Half of the data are listed as real
while the other half are synthetic. These labelled data are then run
through the usual CART pMSE calculation procedure to produce a set.
The set of pMSE values are then summarised to give the estimated
mean and standard deviation.
Parameters
----------
combined : pandas.DataFrame
Dataframe containing the stacked real and synthetic data.
trials : int
Number of samples to take in the estimate.
**kwargs : dict, optional
Keyword arguments passed to
`sklearn.tree.DecisionTreeClassifier`.
Returns
-------
loc : float
Estimated expectation of pMSE in the null case.
scale : float
Estimated standard deviation of pMSE in the null case.
Notes
-----
This estimation method removes any dependency on the synthetic data
being assessed, improving on the permutation-based method set out in
https://doi.org/10.1111/rssa.12358.
A description of this method and its justification are presented in
https://doi.org/10.29012/jpc.748. This implementation is adapted
from the source code submitted with the aforementioned paper:
https://github.com/ClaireMcKayBowen/Code-for-NIST-PSCR-Differential-Privacy-Synthetic-Data-Challenge
Note that the `random_state` keyword argument is used to
(independently) bootstrap samples and to fit the CART model. Without
specifying this, the results will not be reproducible.
"""
rng = np.random.default_rng(kwargs.get("random_state", None))
original = combined[indicator == 0]
rows = len(original)
pmses = []
for perm in range(trials):
sampled = original.iloc[rng.integers(rows, size=2 * rows), :]
indicator = np.repeat([0, 1], rows)
pmses.append(pmse(sampled, indicator, method="cart", **kwargs))
return np.mean(pmses), np.std(pmses)
def _pmse_cart_stats_perm(combined, indicator, num_perms, **kwargs):
"""Null statistic estimation for CART propensity via permutations.
Estimate the location and scale of pMSE in the null case by
repeating pMSE calculations on permuations of the indicator column
using a CART model. The set of calculations are then summarised
using the mean or standard deviation, respectively.
Parameters
----------
combined : pandas.DataFrame
Dataframe containing the combined real and synthetic data.
indicator : numpy.ndarray
Indicator for whether data are real (0) or synthetic (1).
num_perms : int
The number of permutations to consider.
**kwargs : dict, optional
Keyword arguments passed to
`sklearn.tree.DecisionTreeClassifer`.
Returns
-------
loc : float
Estimated expectation of pMSE in the null case.
scale : float
Estimated standard deviation of pMSE in the null case.
Notes
-----
When using a CART propensity model, the number of predictors is
unknown and the results used in `_pmse_logr_statistics` do not
apply. To circumvent this, taking several permutations should
approximate the results for "properly" synthesised data without
knowing :math:`k` a priori. Further details of this approach are
available in https://doi.org/10.1111/rssa.12358.
Note that the `random_state` keyword argument is used to
(independently) create the permutations and to fit the CART model.
Without specifying this, the results will not be reproducible.
"""
rng = np.random.default_rng(kwargs.get("random_state", None))
pmses = []
for _ in range(num_perms):
rng.shuffle(indicator)
pmses.append(pmse(combined, indicator, method="cart", **kwargs))
return np.mean(pmses), np.std(pmses)
def _pmse_cart_statistics(combined, indicator, num_perms, estimator, **kwargs):
"""Null statistic estimation for CART-based propensity scores.
Estimation can either be carried out using permutations of the
combined indicator or via bootstrapping. The former is the original
method presented in https://doi.org/10.1111/rssa.12358, while the
latter (from https://doi.org/10.29012/jpc.748) improves on this by
removing dependence on the synthetic data.
Parameters
----------
combined : pandas.DataFrame
Dataframe containing the real and synthetic data.
indicator : numpy.ndarray
Indicator for whether data are real (0) or synthetic (1).
num_perms : int
Number of trials to consider in the estimate.
estimator : {"perm", "boot"}
Which estimation process to use. By default, permutations are
used to ensure back-compatibility.
**kwargs : dict, optional
Keyword arguments to be passed to
`sklearn.tree.DecisionTreeClassifier`.
Returns
-------
loc : float
Estimated expectation of pMSE in the null case.
scale : float
Estimated standard deviation of pMSE in the null case.
Warns
-----
FutureWarning
The permutation method is flawed and will be removed in a
future minor version. See https://doi.org/10.29012/jpc.748 for
details.
Raises
------
ValueError
If the estimation method, `estimator`, is not valid.
"""
if estimator == "perm":
message = (
"The permutation method is flawed and will be removed in a future "
"release. Consider using `estimator='boot'` instead."
)
warnings.warn(message, FutureWarning)
loc, scale = _pmse_cart_stats_perm(
combined, indicator, num_perms, **kwargs
)
elif estimator == "boot":
loc, scale = _pmse_cart_stats_boot(
combined, indicator, num_perms, **kwargs
)
else:
raise ValueError("Estimation method must be `perm` or `boot`.")
return loc, scale
[docs]def pmse_ratio(
combined, indicator, method, num_perms=None, estimator="perm", **kwargs
):
"""The propensity score mean-squared error ratio.
This is the ratio of observed pMSE to that expected under the null
case, i.e.
.. math::
ratio(pMSE) = \\frac{pMSE}{E(pMSE)}
Parameters
----------
combined : pandas.DataFrame
Dataframe containing the combined real and synthetic data.
indicator : numpy.ndarray
Indicator for whether data are real (0) or synthetic (1).
method : {"cart", "logr"}
Which propensity model to use. Must be either CART (`"cart"`) or
logistic regression with first-order interactions (`"logr"`).
num_perms : int, optional
Number of permutations to consider when estimating the null case
statistics with a CART model.
estimator : {"perm", "boot"}
Which estimation process to use with a CART model. By default,
permutations are used to ensure back-compatibility.
**kwargs : dict, optional
Keyword arguments passed to the propensity model classifier.
Returns
-------
float
The observed-to-null pMSE ratio.
Notes
-----
The interpretation of this metric makes more sense for synthetic
data. The pMSE alone gives better utility as the value gets closer
to zero, which is only attainable when the datasets are identical.
However, when generating synthetic data, we do not want to produce
identical entries. Rather, we want to achieve similarity between the
distributions of the real and synthetic datasets.
This ratio tends towards one when this is achieved, and increases
otherwise.
Note that the `random_state` keyword argument is used to
(independently) create the permutations and to fit the model when
using a CART model. Without specifying this, the results will not be
reproducible.
"""
observed = pmse(combined, indicator, method, **kwargs)
if method == "logr":
loc, _ = _pmse_logr_statistics(combined, indicator)
if method == "cart":
loc, _ = _pmse_cart_statistics(
combined, indicator, num_perms, estimator, **kwargs
)
return observed / loc
[docs]def pmse_standardised(
combined, indicator, method, num_perms=None, estimator="perm", **kwargs
):
"""The standardised propensity score mean-squared error.
This takes the observed pMSE and standardises it against the null
case, i.e.
.. math::
stand(pMSE) = (pMSE - E(pMSE)) / sd(pMSE)
Parameters
----------
combined : pandas.DataFrame
Dataframe containing the combined real and synthetic data.
indicator : numpy.ndarray
Indicator for whether data are real (0) or synthetic (1).
method : {"cart", "logr"}
Which propensity model to use. Must be either CART (`"cart"`) or
logistic regression with first-order interactions (`"logr"`).
num_perms : int, optional
Number of permutations to consider when estimating the null case
statistics with a CART model.
estimator : {"perm", "boot"}
Which estimation process to use with a CART model. By default,
permutations are used to ensure back-compatibility.
**kwargs : dict, optional
Keyword arguments passed to the propensity model.
Returns
-------
float
The null-standardised pMSE.
Notes
-----
The interpretation of this metric makes more sense for synthetic
data. The pMSE alone indicates better utility as it gets closer to
zero, which is only attainable when the datasets are identical.
However, when generating synthetic data, we do not want to produce
identical entries. Rather, we want to achieve similarity between the
distributions of the real and synthetic datasets.
This standardised value tends towards zero when this is achieved,
and increases in magnitude otherwise.
Note that the `random_state` keyword argument is used to
(independently) create the permutations and to fit the model when
using a CART model. Without specifying this, the results will not be
reproducible.
"""
observed = pmse(combined, indicator, method, **kwargs)
if method == "logr":
loc, scale = _pmse_logr_statistics(combined, indicator)
if method == "cart":
loc, scale = _pmse_cart_statistics(
combined, indicator, num_perms, estimator, **kwargs
)
return (observed - loc) / scale
[docs]def propensity_metrics(
real,
synth,
method="cart",
feats=None,
num_perms=20,
estimator="perm",
**kwargs,
):
"""Propensity score-based metrics.
This function calculates three metrics based on the propensity score
mean-squared error (pMSE), all of which quantify utility by
measuring the distinguishability of the synthetic data. That is, how
readily real and synthetic data can be identified.
To do this, the datasets are combined and their origins tracked by a
boolean indicator. This combined dataset is then used to fit a
binary classification model (CART or logistic regression with
first-order interactions) with the indicator as the target. The
propensity score for each row is then extracted and summarised to
give a metric.
The returned metrics are the observed pMSE along with the pMSE ratio
and standardised pMSE. These second two metrics are given relative
to the null case where the real and synthetic data are produced from
identical processes.
Parameters
----------
real : pandas.DataFrame
Dataframe containing the real data.
synth : pandas.DataFrame
Dataframe containing the synthetic data.
method : {"cart", "logr"}, default "cart"
Which propensity model to use. Must be either CART (`"cart"`) or
logistic regression with first-order interactions (`"logr"`).
feats : list of str or None, default None
List of features in the dataset to be used in the propensity
model. If `None` (default), all common features are used.
num_perms : int, default 20
Number of permutations to consider when estimating the null case
statistics with a CART model.
estimator : {"perm", "boot"}
Which estimation process to use with a CART model. By default,
permutations are used to ensure back-compatibility.
**kwargs : dict, optional
Keyword arguments passed to the propensity model.
Returns
-------
observed : float
The observed pMSE.
standard : float
The null-standardised pMSE.
ratio : float
The observed-null pMSE ratio.
Raises
------
ValueError
If `method` is not one of `'cart'` or `'logr'`.
See Also
--------
sklearn.linear_model.LogisticRegression
sklearn.tree.DecisionTreeClassifier
synthgauge.metrics.propensity.pmse
synthgauge.metrics.propensity.pmse_ratio
synthgauge.metrics.propensity.pmse_standardised
Notes
-----
For the CART model, `sklearn.tree.DecisionTreeClassifier` is used.
Meanwhile, the logistic regression model uses
`sklearn.linear_model.LogisticRegression`.
Note that the `random_state` keyword argument is used to
(independently) create the permutations and to fit the model when
using a CART model. Without specifying this, the results will not be
reproducible.
Details on these metrics can be found at:
https://doi.org/10.1111/rssa.12358
"""
if method not in ("cart", "logr"):
raise ValueError(
f"Propensity method must be 'cart' or 'logr' not {method}."
)
feats = feats or real.columns.intersection(synth.columns)
combined, indicator = _combine_encode_and_pop(real[feats], synth[feats])
if method == "logr":
loc, scale = _pmse_logr_statistics(combined, indicator)
if method == "cart":
loc, scale = _pmse_cart_statistics(
combined, indicator, num_perms, estimator, **kwargs
)
observed = pmse(combined, indicator, method, **kwargs)
standard = (observed - loc) / scale
ratio = observed / loc
PropensityResult = namedtuple(
"PropensityResult",
("pmse", "pmse_standardised", "pmse_ratio"),
)
return PropensityResult(observed, standard, ratio)
[docs]def specks(real, synth, classifier, **kwargs):
"""Propensity score comparison via the Kolmogorov-Smirnov distance.
The SPECKS metric was originally presented in
https://arxiv.org/pdf/1803.06763.pdf and works as follows:
1. Stack the real and synthetic data, and create a variable
indicating whether each record is real (0) or synthetic (1).
2. Calculate the propensity score for each record using a binary
classifier on the indicator variable.
3. Compute the Kolmogorov-Smirnov distance between the empirical
CDFs for the real and synthetic propensity scores.
The Kolmogorov-Smirnov distance is defined as the maximum difference
between two empirical distributions. Therefore, it is bounded
between zero and one. If the synthetic data properly resembles the
original data then they will be indistinguishable, leading to close
empirical CDFs.
Parameters
----------
real : pandas.DataFrame
Dataframe containing the real data.
synth : pandas.DataFrame
Dataframe containing the synthetic data.
classifier : scikit-learn estimator
Any `scikit-learn`-style classifier class with a `predict_proba`
method.
**kwargs : dict, optional
Keyword arguments to be passed to `classifer`.
Returns
-------
float
The Kolmogorov-Smirnov distance between the real and synthetic
propensity score CDFs.
Notes
-----
The combined dataset is one-hot-encoded before being passed to the
classifier so categorical features can be handled.
The paper introducing SPECKS has also been published in METRON:
https://doi.org/10.1007/s40300-021-00201-0.
"""
combined, indicator = _combine_encode_and_pop(real, synth)
scores = (
classifier(**kwargs)
.fit(combined, indicator)
.predict_proba(combined)[:, 1]
)
return ks_2samp(scores[indicator == 0], scores[indicator == 1]).statistic