Source code for synthgauge.metrics.density

"""Mean absolute difference in feature densities."""

import numpy as np

from .. import utils


def _feature_density_diff(real, synth, feature, bins=10):
    """Per-bin difference between synthetic and real feature densities.

    The real and synthetic data are pooled for `feature`, encoded with
    `synthgauge.utils.cat_encode` (which, per the original
    documentation, converts the values to numeric), and then split back
    into their two parts. A single set of bin edges is derived from the
    pooled column, and a density histogram is computed over those edges
    for each part.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    feature : str
        Name of the feature whose density is compared.
    bins : str or int, default 10
        Binning specification, forwarded unchanged to
        `numpy.histogram_bin_edges`. Default uses 10 bins.

    Returns
    -------
    hist_diff : numpy.ndarray
        Synthetic density minus real density for each bin.
    bin_edges : numpy.ndarray
        The bin edges shared by both histograms.
    """

    stacked = utils.df_combine(real, synth, feats=[feature])
    numeric, _ = utils.cat_encode(stacked, feats=[feature], return_all=True)
    real_part, synth_part = utils.df_separate(numeric, "source")

    # Edges come from the pooled column so that both histograms are
    # evaluated over exactly the same bins.
    edges = np.histogram_bin_edges(numeric[feature], bins=bins)

    real_density, synth_density = (
        np.histogram(part[feature], bins=edges, density=True)[0]
        for part in (real_part, synth_part)
    )

    return synth_density - real_density, edges


def feature_density_mad(real, synth, feats=None, bins=10):
    """Mean absolute difference of feature densities.

    For each feature the difference between the density across the bins
    within `real` and `synth` is calculated. Finally the MAE across all
    features and bins is calculated. A value close to 0 indicates that
    the real and synthetic datasets have a similar set of feature
    distributions.

    Parameters
    ----------
    real : pandas.DataFrame
        DataFrame containing the real data.
    synth : pandas.DataFrame
        DataFrame containing the synthetic data.
    feats : list of str or None, default None
        The features that will be used to compute the densities. If
        `None` (default) or empty, all common features are used. Any
        iterable of names is accepted, including `pandas.Index` and
        `numpy.ndarray`.
    bins : str or int, default 10
        Binning method for discretising the data. Can be anything
        accepted by `numpy.histogram_bin_edges`. Default uses 10 bins.

    Returns
    -------
    float
        Mean absolute error of feature densities.
    """

    if feats is not None:
        # Materialise first: evaluating the truth value of a pandas Index
        # or a multi-element NumPy array raises ValueError, whereas a
        # plain list can be tested for emptiness safely.
        feats = list(feats)

    if not feats:
        # No explicit selection: fall back to the columns present in both
        # datasets.
        feats = real.columns.intersection(synth.columns)

    diffs = [
        _feature_density_diff(real, synth, feat, bins)[0] for feat in feats
    ]

    return np.mean(np.abs(np.concatenate(diffs)))