"""Privacy metrics."""

import numpy as np
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors

from .. import utils


def _get_weap_scores(synth, key, target):
    """Within Equivalence class Attribution Probabilities (WEAP).

    For each record in the synthetic dataset, this function returns
    the proportion of records sharing that record's `key` values which
    also share its `target` value.

    Parameters
    ----------
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    key : list of str
        List of features in `synth` to use as the key.
    target : str
        Feature to use as the target.

    Returns
    -------
    pandas.Series
        A series object containing the WEAP scores for each record in
        `synth`.

    Notes
    -----
    This function is intended only for use within `tcap_score()` to
    determine which synthetic records are most likely to pose an
    attribution risk.
    """

    synth = synth.copy()
    key_and_target = [*key, target]

    # Count the occurrences of each (key, target) combination and of
    # each key combination across the synthetic data.
    key_target_vc = synth.value_counts(subset=key_and_target).reset_index()
    key_target_vc.columns = key_and_target + ["key_target_frequency"]
    key_vc = synth.value_counts(key).reset_index()
    key_vc.columns = key + ["key_frequency"]

    # Attach both frequencies to every synthetic record so their ratio
    # can be taken row-wise.
    synth = synth.merge(key_target_vc)
    synth = synth.merge(key_vc)

    return synth["key_target_frequency"] / synth["key_frequency"]
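

# A worked example of the WEAP calculation (the data below are
# hypothetical and for illustration only): three synthetic records
# share the key value "A"; two of them have target "x" and one has
# "y", so their WEAP scores are 2/3, 2/3 and 1/3 respectively.
#
#     >>> import pandas as pd
#     >>> toy = pd.DataFrame(
#     ...     {"job": ["A", "A", "A"], "salary": ["x", "x", "y"]}
#     ... )
#     >>> weap = _get_weap_scores(toy, key=["job"], target="salary")
#     >>> sorted(weap.round(2).tolist())
#     [0.33, 0.67, 0.67]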


def tcap_score(real, synth, key, target):
    """Target Correct Attribution Probability (TCAP) score.

    This privacy metric calculates the average chance that the
    key-target pairings in a synthetic dataset reveal the true
    key-target pairings in the associated real dataset.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    key : list of str
        List of features in `synth` to use as the key.
    target : str
        Feature to use as the target.

    Returns
    -------
    float
        The average TCAP score across the dataset.

    Notes
    -----
    This metric provides an estimate of how well an intruder could
    infer attributes of groups in the real dataset by studying the
    synthetic. The choices for `key` and `target` will vary depending
    on the dataset in question, but we suggest that the `key` features
    be those that could be readily available to an outsider, and that
    the `target` feature be one we would not want them to discover,
    such as a protected characteristic.

    This method only works with categorical data, so binning of
    continuous data may be required.

    Full details may be found in:

    Taub and Elliott (2019). The Synthetic Data Challenge. The Hague,
    The Netherlands: Joint UNECE/Eurostat Work Session on Statistical
    Data Confidentiality, Session 3.
    """

    scores = _get_weap_scores(synth, key, target)

    # If no record has a WEAP score of one, the synthetic data poses
    # no attribution risk under this metric.
    if sum(scores == 1) == 0:
        return 0

    # Keep only the synthetic key-target pairs whose key uniquely
    # determines the target, then use them as the intruder's guesses.
    synth_reduced = (
        synth[scores == 1][[*key, target]]
        .drop_duplicates()
        .rename(columns={target: target + "_synthguess"})
    )

    combined = real.merge(synth_reduced, how="left", on=key)
    target_matches = combined[target] == combined[target + "_synthguess"]

    return np.mean(target_matches)
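

# A hedged usage sketch for `tcap_score` (the dataframes and column
# names here are hypothetical): the key columns mimic attributes an
# intruder might already know, while the target is the sensitive
# attribute we want to protect.
#
#     >>> score = tcap_score(
#     ...     real,
#     ...     synth,
#     ...     key=["age_band", "region"],
#     ...     target="health_status",
#     ... )
#
# A score close to one suggests the synthetic data frequently gives
# away true key-target pairings; a score close to zero suggests it
# does not.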


def _find_outliers(data, threshold, n_neighbours):
    """Identify local outliers using the nearest-neighbour principle.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe to be assessed for outliers.
    threshold : float
        Float influencing the classification of outliers. Increasing
        this threshold means that fewer points are considered
        outliers.
    n_neighbours : int
        Number of neighbours to consider in outlier detection.

    Returns
    -------
    outlier_bool : numpy.ndarray of bool
        Boolean mask indicating which rows of `data` are outliers.

    Notes
    -----
    Most inliers will have an outlier factor of less than one.
    However, there are no clear rules that determine when a data point
    is an outlier. This is likely to vary from dataset to dataset and,
    as such, we recommend tuning `threshold` to suit.
    """

    lof = LocalOutlierFactor(n_neighbors=n_neighbours)
    lof.fit(data)

    # `negative_outlier_factor_` is negated so that larger values
    # indicate stronger outliers: inliers score close to one and
    # outliers substantially more.
    outlier_factor = -lof.negative_outlier_factor_

    # Flag points whose factor exceeds the threshold. Raising the
    # threshold therefore flags fewer points, as documented above.
    outlier_bool = outlier_factor > threshold

    return outlier_bool
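

# An illustrative sketch of how `_find_outliers` behaves (hypothetical
# data): four points cluster near zero and one sits far away, so only
# the distant point should exceed the outlier-factor threshold.
#
#     >>> import pandas as pd
#     >>> points = pd.DataFrame({"x": [0.0, 0.1, 0.2, 0.1, 10.0]})
#     >>> mask = _find_outliers(points, threshold=2, n_neighbours=2)
#     >>> points[mask]  # keeps only the point at 10.0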


def min_nearest_neighbour(
    real,
    synth,
    feats=None,
    outliers_only=True,
    threshold=2,
    n_neighbours=5,
):
    """Minimum nearest-neighbour distance.

    This privacy metric returns the smallest distance between any
    point in the real dataset and any point in the synthetic dataset.
    There is an option to only consider the outliers in the real
    dataset, as these perhaps pose more of a privacy concern.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    feats : list of str or None, default None
        Features in `real` and `synth` to use when calculating
        distances. If `None` (default), all common features are used.
    outliers_only : bool, default True
        Boolean indicating whether to filter out the real data inliers
        (default) or not.
    threshold : number, default 2
        Outlier decision threshold. Increase to include fewer points
        from `real` in the nearest-neighbour calculations.
    n_neighbours : int, default 5
        Number of neighbours to consider when identifying local
        outliers.

    Returns
    -------
    float
        Minimum Manhattan distance between the `real` and `synth`
        data.

    Notes
    -----
    This privacy metric provides an insight into whether the synthetic
    dataset is too similar to the real dataset. It does this by
    calculating the minimum distance between the real records and the
    synthetic records.

    This metric assumes that categorical data is ordinal during
    distance calculations, or that it has already been suitably
    one-hot-encoded.
    """

    # Encode categorical features consistently across both datasets
    # before computing distances.
    combined = utils.df_combine(real, synth, feats=feats)
    combined_recode, _ = utils.cat_encode(combined, return_all=True)
    real, synth = utils.df_separate(
        combined_recode,
        source_col_name="source",
        source_val_real=0,
        source_val_synth=1,
    )

    if outliers_only:
        outliers = _find_outliers(real, threshold, n_neighbours)
        real = real[outliers]

    # For each synthetic record, find its nearest real record under
    # the Manhattan metric (p=1), then take the overall minimum.
    neigh = NearestNeighbors(n_neighbors=1, radius=100, p=1).fit(real)
    distances, _ = neigh.kneighbors(synth, return_distance=True)

    return np.min(distances)
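

# A hedged usage sketch (the `real` and `synth` dataframes are assumed
# to share their column names): a very small minimum distance,
# particularly with `outliers_only=True`, suggests that some synthetic
# records sit close to unusual real records, which may indicate a
# disclosure risk.
#
#     >>> distance = min_nearest_neighbour(
#     ...     real, synth, outliers_only=True, threshold=2, n_neighbours=5
#     ... )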


def _get_sample(data, feats, n_samples, seed, label):
    """Take a sample from the data and count the feature frequencies."""

    return (
        data[feats]
        .sample(n_samples, random_state=seed)
        .assign(**{f"{label}_count": 1})
        .groupby(feats)
        .count()
        .reset_index()
    )
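

# For illustration (hypothetical column names): `_get_sample` returns
# one row per distinct combination of `feats` in the sample, with a
# "<label>_count" column holding that combination's frequency.
#
#     >>> counts = _get_sample(real, ["age", "sex"], 100, 0, "real")
#     >>> list(counts.columns)
#     ['age', 'sex', 'real_count']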


def sample_overlap_score(
    real,
    synth,
    feats=None,
    sample_size=0.2,
    runs=5,
    seed=None,
    score_type="unique",
):
    """Return the percentage of overlap between the real and synthetic
    data, based on random sampling.

    Samples from both the real and synthetic datasets are compared for
    similarity. This similarity, or overlap score, is based on exact
    matches of real data records within the synthetic data.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    feats : list of str or None, default None
        The features used to match records. If `None` (default), all
        common features are used.
    sample_size : float or int, default 0.2
        The proportion (if `sample_size` is between 0 and 1) or count
        (if `sample_size` > 1) of records to sample. Default is 0.2
        (20%).
    runs : int, default 5
        The number of sampling runs to use when computing the score.
    seed : int, optional
        Random number seed used for sampling.
    score_type : {"unique", "sample"}, default "unique"
        Method used for calculating the overlap score. If "unique"
        (default), the score is the percentage of unique records in
        the real sample that have a match within the synthetic data.
        If "sample", the score is the percentage of all records within
        the real sample that have a match within the synthetic sample.

    Returns
    -------
    overlap_score : float
        Estimated overlap score between `real` and `synth`.
    """

    feats = feats or real.columns.intersection(synth.columns).to_list()

    min_num_rows = min(real.shape[0], synth.shape[0])
    if 0 <= sample_size <= 1:
        n_samples = int(min_num_rows * sample_size)
    else:
        n_samples = int(min(min_num_rows, sample_size))

    scores = []
    for run in range(runs):
        # Offset the seed on each run so that a fixed `seed` still
        # gives `runs` distinct, reproducible samples.
        run_seed = None if seed is None else seed + run
        sample_real = _get_sample(real, feats, n_samples, run_seed, "real")
        sample_synth = _get_sample(synth, feats, n_samples, run_seed, "synth")

        # Left-join the synthetic counts onto the real sample's
        # distinct records; the merge indicator marks which of them
        # found a match.
        duplicates = sample_real.merge(
            sample_synth,
            how="left",
            on=feats,
            suffixes=("_real", "_synth"),
            indicator="_match",
        )

        if score_type == "unique":
            score = duplicates["_match"].value_counts(normalize=True)["both"]
        elif score_type == "sample":
            score = (
                duplicates.loc[
                    duplicates["_match"] == "both", "real_count"
                ].sum()
                / n_samples
            )
        scores.append(score)

    return np.mean(scores)
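

# A hedged usage sketch (assuming `real` and `synth` share columns):
# compare 20% samples over five runs with a fixed seed for
# reproducibility.
#
#     >>> overlap = sample_overlap_score(
#     ...     real, synth, sample_size=0.2, runs=5, seed=42
#     ... )
#
# An overlap score near one means most sampled real records have exact
# matches in the synthetic data, which may itself be a privacy
# concern.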