# Source code for synthgauge.utils

"""Utility functions for handling real and synthetic data."""

import warnings

import pandas as pd


def df_combine(
    real,
    synth,
    feats=None,
    source_col_name="source",
    source_val_real="real",
    source_val_synth="synth",
):
    """Combine separate dataframes of real and synthetic data.

    The dataframes are concatenated along the first axis (rows) and a
    source column is added to distinguish the real data from the
    synthetic data. Optionally, specific features can be selected.
    The input dataframes are not modified; copies are taken before the
    source column is added.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    feats : list-like of str or None, default None
        Features to combine. If `None` (default) or empty, all common
        features are used. Array-like selections such as a
        `pandas.Index` are accepted.
    source_col_name : str, default "source"
        Name of the source column. This is added to the combined dataset
        and filled with the `source_val_real` and `source_val_synth`
        values to signify the real and synthetic data respectively.
    source_val_real : any, default "real"
        Value to use in `source_col_name` column to signify the real
        data.
    source_val_synth : any, default "synth"
        Value to use in `source_col_name` column to signify the
        synthetic data.

    Returns
    -------
    combined : pandas.DataFrame
        The combined dataframe, with a fresh integer index.
    """

    # Test for "no selection" explicitly rather than with `feats or ...`:
    # array-likes such as `pandas.Index` raise on truth-testing.
    if feats is None or len(feats) == 0:
        feats = real.columns.intersection(synth.columns)

    real = real[feats].copy()
    real[source_col_name] = source_val_real

    synth = synth[feats].copy()
    synth[source_col_name] = source_val_synth

    return pd.concat([real, synth], ignore_index=True)
def df_separate(
    data,
    source_col_name,
    feats=None,
    source_val_real="real",
    source_val_synth="synth",
    drop_source_col=True,
):
    """Separate a dataframe into real and synthetic data.

    The dataframe is split using a source column and real and synthetic
    flags. Optionally, specific features can be selected.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe to split into real and synthetic components.
    source_col_name : str
        Name of the column used to signify real versus synthetic data.
    feats : list of str or None, default None
        Features to separate. If `None` (default), uses all features.
    source_val_real : any, default "real"
        Value in `source_col_name` column signifying the real data.
    source_val_synth : any, default "synth"
        Value in `source_col_name` column signifying the synthetic data.
    drop_source_col : bool, default True
        Indicates whether the `source_col_name` column should be dropped
        from the outputs (default) or not.

    Returns
    -------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    """

    feats = list(feats) if feats is not None else list(data.columns)

    # Only add the source column when it is not already selected.
    # Appending unconditionally selects it twice (always the case when
    # `feats` is None) and leaves a duplicated column in the outputs
    # when `drop_source_col=False`.
    if source_col_name not in feats:
        feats.append(source_col_name)

    source = data[source_col_name]
    real = data.loc[source == source_val_real, feats].copy()
    synth = data.loc[source == source_val_synth, feats].copy()

    if drop_source_col:
        real = real.drop(columns=source_col_name)
        synth = synth.drop(columns=source_col_name)

    return real, synth
def launder(real, synth, feats=None, suffix_real="real", suffix_synth="synth"):
    """Launder feature names and optionally filter.

    To provide clear distinction between the real and synthetic
    features, each dataframe is updated to append suffixes to the
    feature names. Optionally, specific features can be selected.
    The input dataframes are not modified; renamed copies are returned.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    feats : list-like of str or None, default None
        Features to launder. If `None` (default) or empty, all common
        features are used. Array-like selections such as a
        `pandas.Index` are accepted.
    suffix_real : str, default "real"
        Suffix to append to columns in `real`, joined with an
        underscore.
    suffix_synth : str, default "synth"
        Suffix to append to columns in `synth`, joined with an
        underscore.

    Returns
    -------
    real : pandas.DataFrame
        Dataframe containing the laundered real data.
    synth : pandas.DataFrame
        Dataframe containing the laundered synthetic data.
    """

    # Test for "no selection" explicitly rather than with `feats or ...`:
    # array-likes such as `pandas.Index` raise on truth-testing.
    if feats is None or len(feats) == 0:
        feats = real.columns.intersection(synth.columns)

    real = real[feats].copy()
    synth = synth[feats].copy()

    real.columns = [f"{c}_{suffix_real}" for c in real.columns]
    synth.columns = [f"{c}_{suffix_synth}" for c in synth.columns]

    return real, synth
def cat_encode(
    df, feats=None, return_all=False, convert_only=False, force=False
):
    """Convert object features to categories.

    Generates a new version of the input dataframe with the specified
    features categorically encoded with integer labels. Optionally, the
    features can be returned as `category` data type with no encoding.

    Before performing the conversion, a check is made to identify any
    specified features that are not `object`- or `category`-type and
    thus less suited to categorical encoding. A warning is raised for
    these features and they will be ignored from subsequent encoding
    steps unless `force` is set to `True`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe to be converted. It is not modified.
    feats : list of str or None, default None
        Features in `df` to convert to categorical. If `None` (default),
        all object- and category-type columns are selected.
    return_all : bool, default False
        If `True`, all features in `df` will be returned regardless of
        whether they were converted. If `False` (default), only the
        converted features are returned.
    convert_only : bool, default False
        If `True`, the features will only be converted to the `category`
        data-type without being integer-encoded.
    force : bool, default False
        If `True`, all features in `feats` will be encoded regardless of
        their data-type.

    Warns
    -----
    UserWarning
        A warning is given if any of the features in `feats` are not of
        an `object` or `category` data type.

    Returns
    -------
    out_df : pandas.DataFrame
        Dataframe with (at least) the converted features.
    cat_dict : dict or NoneType
        A dictionary mapping each encoded feature to its categories. If
        `convert_only=True`, returns as `None`.
    """

    all_cat_cols = df.select_dtypes(include=("object", "category")).columns
    feats = pd.Index(feats) if feats is not None else all_cat_cols

    # Check for non-object type features. Emptiness is tested directly:
    # `Index.any()` looks at the truthiness of the labels themselves, so
    # a non-object feature named `0` or `""` would never be reported.
    non_cat_cols = feats.difference(all_cat_cols)
    if not non_cat_cols.empty:
        warnings.warn(
            "Selected features include non-object types: "
            f"{non_cat_cols.to_list()}."
            "\nIs this intended? If so, rerun with `force=True`. "
            "If not, they will be dropped, unless `return_all=True`, "
            "where they will pass through unchanged.",
            stacklevel=2,  # attribute the warning to the caller
        )

    cat_dict = {} if not convert_only else None
    feats_to_encode = feats if force else feats.difference(non_cat_cols)
    out_df = df.copy() if return_all else df[feats_to_encode].copy()

    for feature in feats_to_encode:
        out_df[feature] = out_df[feature].astype("category")

        if not convert_only:
            # Record the categories before overwriting the column with
            # its integer codes.
            feature_cat = out_df[feature].cat
            cat_dict[feature] = feature_cat.categories
            out_df[feature] = feature_cat.codes

    return out_df, cat_dict