Source code for synthgauge.utils

"""Utility functions for handling real and synthetic data."""

import warnings

import pandas as pd


def df_combine(
    real,
    synth,
    feats=None,
    source_col_name="source",
    source_val_real="real",
    source_val_synth="synth",
):
    """Combine separate dataframes of real and synthetic data.

    The dataframes are concatenated along the first axis (rows) and a
    source column is added to distinguish the real data from the
    synthetic data. Optionally, specific features can be selected.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    feats : iterable of str or None, default None
        Features to combine. Any iterable of column labels is accepted
        (list, tuple, `pandas.Index`, array). If `None` (default) or
        empty, all common features are used.
    source_col_name : str, default "source"
        Name of the source column. This is added to the combined dataset
        and filled with the `source_val_real` and `source_val_synth`
        values to signify the real and synthetic data respectively.
        If a selected feature already has this name, its values are
        overwritten. Defaults to `"source"`.
    source_val_real : any, default "real"
        Value to use in `source_col_name` column to signify the real
        data. Defaults to `"real"`.
    source_val_synth : any, default "synth"
        Value to use in `source_col_name` column to signify the
        synthetic data. Defaults to `"synth"`.

    Returns
    -------
    combined : pandas.DataFrame
        The combined dataframe, with the real rows first and a fresh
        integer index. The inputs are not modified.
    """

    # Normalise to a plain list up front. Evaluating the truthiness of a
    # `pandas.Index` or array raises `ValueError`, and a tuple would be
    # treated as a single column key when used to index a dataframe.
    feats = [] if feats is None else list(feats)
    if not feats:
        feats = list(real.columns.intersection(synth.columns))

    # Work on copies so that the callers' dataframes are left untouched.
    real = real[feats].copy()
    real[source_col_name] = source_val_real

    synth = synth[feats].copy()
    synth[source_col_name] = source_val_synth

    return pd.concat([real, synth], ignore_index=True)


def df_separate(
    data,
    source_col_name,
    feats=None,
    source_val_real="real",
    source_val_synth="synth",
    drop_source_col=True,
):
    """Separate a dataframe into real and synthetic data.

    The dataframe is split using a source column and real and synthetic
    flags. Optionally, specific features can be selected.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe to split into real and synthetic components.
    source_col_name : str
        Name of the column used to signify real versus synthetic data.
    feats : list of str or None, default None
        Features to separate. If `None` (default), uses all features.
    source_val_real : any, default "real"
        Value in `source_col_name` column signifying the real data.
        Defaults to `"real"`.
    source_val_synth : any, default "synth"
        Value in `source_col_name` column signifying the synthetic data.
        Defaults to `"synth"`.
    drop_source_col : bool, default True
        Indicates whether the `source_col_name` column should be
        dropped from the outputs (default) or not.

    Returns
    -------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.

    Notes
    -----
    Rows whose source value matches neither flag appear in neither
    output. The original row index is preserved in both outputs.
    """

    feats = list(feats) if feats is not None else list(data.columns)

    # Only add the source column when it is not already selected.
    # Otherwise it would be selected twice and, with
    # `drop_source_col=False`, appear twice in the outputs.
    if source_col_name not in feats:
        feats.append(source_col_name)

    source = data[source_col_name]
    real = data.loc[source == source_val_real, feats].copy()
    synth = data.loc[source == source_val_synth, feats].copy()

    if drop_source_col:
        real = real.drop(columns=source_col_name, errors="ignore")
        synth = synth.drop(columns=source_col_name, errors="ignore")

    return real, synth


def launder(real, synth, feats=None, suffix_real="real", suffix_synth="synth"):
    """Launder feature names and optionally filter.

    To provide clear distinction between the real and synthetic
    features, each dataframe is updated to append suffixes to the
    feature names. Optionally, specific features can be selected.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    feats : iterable of str or None, default None
        Features to launder. Any iterable of column labels is accepted
        (list, tuple, `pandas.Index`, array). If `None` (default) or
        empty, all common features are used.
    suffix_real : str, default "real"
        Suffix to append to columns in `real`. Default is `"real"`.
    suffix_synth : str, default "synth"
        Suffix to append to columns in `synth`. Default is `"synth"`.

    Returns
    -------
    real : pandas.DataFrame
        Dataframe containing the laundered real data.
    synth : pandas.DataFrame
        Dataframe containing the laundered synthetic data.

    Notes
    -----
    Suffixes are joined to the original names with an underscore, so
    feature `"age"` becomes `"age_real"` and `"age_synth"` by default.
    The inputs are not modified.
    """

    # Normalise to a plain list up front. Evaluating the truthiness of a
    # `pandas.Index` or array raises `ValueError`, and a tuple would be
    # treated as a single column key when used to index a dataframe.
    feats = [] if feats is None else list(feats)
    if not feats:
        feats = list(real.columns.intersection(synth.columns))

    # Work on copies so that renaming does not touch the callers' data.
    real = real[feats].copy()
    synth = synth[feats].copy()

    real.columns = [f"{col}_{suffix_real}" for col in real.columns]
    synth.columns = [f"{col}_{suffix_synth}" for col in synth.columns]

    return real, synth


def cat_encode(
    df, feats=None, return_all=False, convert_only=False, force=False
):
    """Convert object features to categories.

    Generates a new version of the input dataframe with the specified
    features categorically encoded with integer labels. Optionally, the
    features can be returned as `category` data type with no encoding.

    Before performing the conversion, a check is made to identify any
    specified features that are not `object`- or `category`-type and
    thus less suited to categorical encoding. A warning is raised for
    these features and they will be ignored from subsequent encoding
    steps unless `force` is set to `True`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe to be converted. It is not modified.
    feats : list of str or None, default None
        Features in `df` to convert to categorical. If `None` (default),
        all object-type and category-type columns are selected.
    return_all : bool, default False
        If `True`, all features in `df` will be returned regardless of
        whether they were converted. If `False` (default), only the
        converted features are returned.
    convert_only : bool, default False
        If `True`, the features will only be converted to the `category`
        data-type without being integer-encoded.
    force : bool, default False
        If `True`, all features in `feats` will be encoded regardless of
        their data-type.

    Warns
    -----
    UserWarning
        A warning is given if any of the features in `feats` are not of
        an `object` or `category` data type.

    Returns
    -------
    out_df : pandas.DataFrame
        Dataframe with (at least) the converted features.
    cat_dict : dict or NoneType
        A dictionary mapping each encoded feature to its categories. If
        `convert_only=True`, returns as `None`.
    """

    all_cat_cols = df.select_dtypes(include=("object", "category")).columns
    feats = pd.Index(feats) if feats is not None else all_cat_cols

    # Check for non-object type features. Test the *size* of the index:
    # `Index.any()` looks at the truthiness of the labels themselves, so
    # a column named `0` or `""` would slip through without a warning.
    non_cat_cols = feats.difference(all_cat_cols)
    if len(non_cat_cols) > 0:
        warnings.warn(
            "Selected features include non-object types: "
            f"{non_cat_cols.to_list()}."
            "\nIs this intended? If so, rerun with `force=True`. "
            "If not, they will be dropped, unless `return_all=True`, "
            "where they will pass through unchanged.",
            UserWarning,
            stacklevel=2,
        )

    cat_dict = None if convert_only else {}

    feats_to_encode = feats if force else feats.difference(non_cat_cols)
    out_df = df.copy() if return_all else df[feats_to_encode].copy()

    for feature in feats_to_encode:
        out_df[feature] = out_df[feature].astype("category")

        if not convert_only:
            # Grab the accessor before overwriting the column with its
            # integer codes, so the categories can still be recorded.
            feature_cat = out_df[feature].cat
            cat_dict[feature] = feature_cat.categories
            out_df[feature] = feature_cat.codes

    return out_df, cat_dict