Source code for synthgauge.metrics.classification

"""Utility metrics using `scikit-learn`-style classifiers."""

import inspect
from collections import namedtuple

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def _make_preprocessor(data, feats):
    """Make a pre-processing pipe for transforming numeric and
    categorical columns.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset containing at least the columns in `feats`.
    feats : list of str
        A list of columns in `data` to be separated by data type.

    Returns
    -------
    preprocessor : sklearn.compose.ColumnTransformer
        The pre-processing transformer.
    """

    numeric = data[feats].select_dtypes(include="number").columns
    categorical = data[feats].select_dtypes(exclude="number").columns

    numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
    categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder())])

    preprocessor = ColumnTransformer(
        transformers=[
            ("numeric", numeric_transformer, numeric),
            ("categorical", categorical_transformer, categorical),
        ]
    )

    return preprocessor

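# A minimal sketch of `_make_preprocessor` in use. The toy dataframe and
# column names here are illustrative assumptions, not part of the library:
#
#     >>> import pandas as pd
#     >>> df = pd.DataFrame({"age": [25, 32, 47], "sex": ["F", "M", "F"]})
#     >>> preprocessor = _make_preprocessor(df, ["age", "sex"])
#     >>> preprocessor.fit_transform(df)  # scales "age", one-hot-encodes "sex"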

def _make_pipeline(classifier, seed, preprocessor, **kwargs):
    """Create the pipeline of data pre-processing and classification.

    Parameters
    ----------
    classifier : scikit-learn estimator
        The `scikit-learn`-style class to be used as the classifier.
    seed : int
        Random seed to use for reproducibility. Only used if
        `random_state` is a parameter of `classifier`.
    preprocessor : sklearn.compose.ColumnTransformer
        The pre-processing transformer.
    **kwargs : dict, optional
        Keyword arguments for `classifier`.

    Returns
    -------
    pipeline : sklearn.pipeline.Pipeline
        A complete classification pipeline.
    """

    # only pass the seed on if the classifier accepts a `random_state` argument
    classifier_params = inspect.signature(classifier).parameters
    if "random_state" in classifier_params:
        kwargs["random_state"] = seed

    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", classifier(**kwargs)),
        ]
    )

    return pipeline

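# A hedged sketch of `_make_pipeline`, reusing the toy dataframe and
# preprocessor above. The classifier and its keyword argument are assumptions
# chosen for illustration; any scikit-learn classifier class should work:
#
#     >>> from sklearn.ensemble import RandomForestClassifier
#     >>> pipe = _make_pipeline(
#     ...     RandomForestClassifier, 0, preprocessor, n_estimators=10
#     ... )
#     >>> # `random_state=0` is injected because the classifier accepts it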

def _get_scores(test, pred):
    """Calculate the precision, recall and f1 scores for a set of
    predicted values.

    Parameters
    ----------
    test : array_like
        Labels from the test set.
    pred : array_like
        Predicted labels.

    Returns
    -------
    scores : list
        The precision, recall and f1 score given the test set and
        predicted labels.
    """

    scores = [
        score_func(test, pred, average="macro")
        for score_func in (precision_score, recall_score, f1_score)
    ]

    return scores

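# Illustrative call to `_get_scores` with made-up labels. For these inputs
# the macro-averaged scores work out to roughly [0.75, 0.75, 0.67]:
#
#     >>> _get_scores(["a", "b", "b"], ["a", "b", "a"])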

def classification_comparison(
    real,
    synth,
    feats,
    target,
    classifier,
    test_prop=0.2,
    random_state=None,
    **kwargs,
):
    """Classification utility metric.

    This metric fits two (identical) classification models to `real`
    and `synth`, and then tests them both against withheld `real`
    data. We obtain utility scores by subtracting the precision,
    recall and f1 scores of the "synthetic" model predictions from
    those of the "real" model.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    feats : list of str
        List of column names to use as the input in the
        classification.
    target : str
        Column to use as the target in the classification.
    classifier : scikit-learn estimator
        Classifier class with `fit` and `predict` methods.
    test_prop : float or int, default 0.2
        If `float`, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the test split. If
        `int`, represents the absolute number of test samples.
    random_state : int, optional
        Random seed for shuffling during the train-test split, and
        for the classification algorithm itself.
    **kwargs : dict, optional
        Keyword arguments passed to the classifier.

    Returns
    -------
    precision_difference : float
        Precision of the real model minus that of the synthetic
        model.
    recall_difference : float
        Recall of the real model minus that of the synthetic model.
    f1_difference : float
        f1 score of the real model minus that of the synthetic model.

    Notes
    -----
    Some preprocessing is carried out before the models are trained.
    Numeric features are scaled and categorical features are
    one-hot-encoded.

    A score of zero tells us the synthetic data is just as good as
    the real data at training the given classification model.
    Increases in these scores indicate poorer utility.
    """

    real_X_train, real_X_test, real_y_train, real_y_test = train_test_split(
        real[feats],
        real[target],
        test_size=test_prop,
        random_state=random_state,
    )
    synth_X_train, _, synth_y_train, _ = train_test_split(
        synth[feats],
        synth[target],
        test_size=test_prop,
        random_state=random_state,
    )

    # construct the preprocessor from the real data's column types
    preprocessor = _make_preprocessor(real, feats)

    # train model on real data, test on withheld real data
    real_pipeline = _make_pipeline(
        classifier, random_state, preprocessor, **kwargs
    ).fit(real_X_train, real_y_train.values.ravel())
    y_real_predicts_real = real_pipeline.predict(real_X_test)

    # train model on synthetic data, test on withheld real data
    synth_pipeline = _make_pipeline(
        classifier, random_state, preprocessor, **kwargs
    ).fit(synth_X_train, synth_y_train.values.ravel())
    y_synth_predicts_real = synth_pipeline.predict(real_X_test)

    # compare the scores of the two models
    real_scores = _get_scores(real_y_test, y_real_predicts_real)
    synth_scores = _get_scores(real_y_test, y_synth_predicts_real)

    score_differences = np.subtract(real_scores, synth_scores)

    ClassificationResult = namedtuple(
        "ClassificationResult",
        ("precision_difference", "recall_difference", "f1_difference"),
    )

    return ClassificationResult(*score_differences)
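
# An end-to-end sketch of `classification_comparison`. The dataframes,
# feature names, target and classifier below are all assumptions for
# illustration:
#
#     >>> from sklearn.tree import DecisionTreeClassifier
#     >>> result = classification_comparison(
#     ...     real_df,
#     ...     synth_df,
#     ...     feats=["age", "height", "weight"],
#     ...     target="blood_type",
#     ...     classifier=DecisionTreeClassifier,
#     ...     random_state=42,
#     ... )
#     >>> result.f1_difference  # near zero suggests comparable utility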