# Source code for synthgauge.plot

"""Functions for visually evaluating synthetic data."""

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.core.dtypes.common import is_numeric_dtype

from .metrics.correlation import _pairwise_cramers_v
from .metrics.density import _feature_density_diff
from .utils import cat_encode

# Apply seaborn's default theme when this module is imported.
sns.set_theme()


def plot_histograms(df, feats=None, groupby=None, figcols=2, figsize=None):
    """Plot feature distributions.

    Plot a histogram (or countplot for categorical data) for each
    feature. Where multiple features are provided a grid will be
    generated to store all the plots.

    Optionally, a groupby feature can be specified to apply a grouping
    prior to calculating the distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the feature(s) to be plotted.
    feats : list of str or None, default None
        Features to plot. If `None` (default) or empty, all features
        are used. A single feature name given as a string is treated as
        a one-element list.
    groupby : str, optional
        Feature on which to group data.
    figcols : int, default 2
        Number of columns to use in the figure.
    figsize : tuple of float, optional
        Size of figure in inches `(width, height)`. Defaults to
        `matplotlib.pyplot.rcParams["figure.figsize"]`.

    Returns
    -------
    matplotlib.figure.Figure
    """
    if isinstance(feats, str):
        feats = [feats]
    # Explicit check rather than `feats or ...` so that array-likes such
    # as `pandas.Index` do not raise on truth-value evaluation.
    if feats is None or len(feats) == 0:
        feats = df.columns

    n_rows = int(np.ceil(len(feats) / figcols))

    # squeeze=False always yields a 2D array of axes; without it a 1x1
    # grid returns a bare Axes object, which has no `ravel`.
    fig, axes = plt.subplots(n_rows, figcols, figsize=figsize, squeeze=False)
    flat_axes = axes.ravel()

    for feat, ax in zip(feats, flat_axes):
        plotter = sns.histplot if is_numeric_dtype(df[feat]) else sns.countplot
        plotter(data=df, x=feat, ax=ax, hue=groupby)

    # Turn off axes with no data
    for ax in flat_axes:
        if not ax.has_data():
            ax.set_visible(False)

    fig.tight_layout()

    return fig
def _order_categorical(feat):
    """Return an ordered version of a categorical feature.

    Features that do not expose the pandas `cat` accessor are handed
    back untouched.

    Parameters
    ----------
    feat : pd.Series
        The feature to be ordered.

    Returns
    -------
    pd.Series
        The ordered feature, or `feat` itself when it is not
        categorical.
    """
    # Guard clause: anything without a `.cat` accessor is passed through.
    if not hasattr(feat, "cat"):
        return feat

    return feat.cat.as_ordered()
def plot_joint(
    df, x, y, groupby=None, x_bins="auto", y_bins="auto", figsize=6
):
    """Draw a bivariate histogram with a univariate plot on each margin.

    This is a thin convenience wrapper around `seaborn`. For finer
    control use `seaborn.JointGrid` or `seaborn.jointplot` directly.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the feature(s) to plot.
    x : str
        Feature to plot on the x-axis and -margin.
    y : str
        Feature to plot on the y-axis and -margin.
    groupby : str, optional
        Feature on which to group data.
    x_bins, y_bins : array_like or int or str, default "auto"
        Binning method for the marginal histogram of a numeric feature.
        If `array_like`, must be sequence of bin edges. If `int`,
        specifies the number of bins to use. If `str`, can be anything
        accepted by `numpy.histogram_bin_edges`. Defaults to `"auto"`.
    figsize: int, default 6
        Size of each side of the figure in inches (it will be square).
        Defaults to six inches.

    Returns
    -------
    seaborn.axisgrid.JointGrid
    """
    joint_grid = sns.JointGrid(height=figsize)

    # Central panel: 2D histogram, with categorical features ordered.
    joint_x = _order_categorical(df[x])
    joint_y = _order_categorical(df[y])
    sns.histplot(
        data=df,
        x=joint_x,
        y=joint_y,
        hue=groupby,
        alpha=0.5,
        ax=joint_grid.ax_joint,
    )

    # Margins: histogram for numeric features, countplot otherwise.
    # Their legends are removed.
    margins = (
        ("x", x, x_bins, joint_grid.ax_marg_x),
        ("y", y, y_bins, joint_grid.ax_marg_y),
    )
    for axis_name, feature, bin_spec, margin_ax in margins:
        options = {axis_name: feature, "ax": margin_ax, "hue": groupby}

        if is_numeric_dtype(df[feature]):
            options["bins"] = bin_spec
            drawn_ax = sns.histplot(data=df, **options)
        else:
            drawn_ax = sns.countplot(data=df, **options)

        marginal_legend = drawn_ax.get_legend()
        if marginal_legend is not None:
            marginal_legend.remove()

    return joint_grid
def plot_histogram3d(df, x, y, x_bins="auto", y_bins="auto", figsize=None):
    """Plot 3D histogram of two features.

    This is similar to a 2D histogram plot with an extra axis added to
    display the count for each feature-wise pair as 3D bars.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the feature(s) to plot.
    x : str
        Feature to plot on the x-axis.
    y : str
        Feature to plot on the y-axis.
    x_bins, y_bins : array_like or int or str, default "auto"
        Binning method for axis. If the corresponding feature is
        categorical, the bins will be set to the cardinality of that
        feature. If `array_like`, must be sequence of bin edges. If
        `int`, specifies the number of bins to use. If `str`, can be
        anything accepted by `numpy.histogram_bin_edges`. Defaults to
        `"auto"`.
    figsize: tuple of float, optional
        Size of figure in inches `(width, height)`. Defaults to
        `matplotlib.pyplot.rcParams["figure.figsize"]`.

    Returns
    -------
    matplotlib.figure.Figure
    """
    # Encode categorical data. The check is on the number of categorical
    # columns, not on the truthiness of their labels.
    cat_feats = df.select_dtypes(include=("object", "category")).columns
    cat_labels = {}
    if len(cat_feats) > 0:
        df, cat_labels = cat_encode(df, cat_feats, return_all=True)

    # Determine bins: one bin per distinct value for categorical features
    bins_xy = []
    for feat, bins in zip((x, y), (x_bins, y_bins)):
        if feat in cat_feats:
            bins = df[feat].nunique()
        bins_xy.append(np.histogram_bin_edges(df[feat], bins))

    # Create figure
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111, projection="3d")

    # Compute 2D histogram
    hist, xedges, yedges = np.histogram2d(df[x], df[y], bins=bins_xy)

    # Determine bar anchors and thickness. Widths are computed per bin so
    # that user-supplied, non-uniform bin edges are drawn correctly.
    xpos, ypos = np.meshgrid(xedges[:-1], yedges[:-1], indexing="ij")
    dx, dy = np.meshgrid(
        np.diff(xedges) * 0.8, np.diff(yedges) * 0.8, indexing="ij"
    )
    dz = hist.ravel()
    zpos = np.zeros_like(dz) + 0.1

    # Plot the 3D bars
    ax.bar3d(
        xpos.ravel(),
        ypos.ravel(),
        zpos,
        dx.ravel(),
        dy.ravel(),
        dz,
        zsort="average",
    )

    # Set title and labels
    ax.set_xlabel(x, fontsize=16)
    ax.set_ylabel(y, fontsize=16)
    ax.set_zlabel("$count$", fontsize=16, rotation=1)
    ax.set_title("3D Histogram")

    # Set tick labels for categorical features
    if x in cat_labels:
        ax.set_xticks(xedges[1:])
        ax.set_xticklabels(cat_labels[x])
    if y in cat_labels:
        ax.set_yticks(yedges[1:])
        ax.set_yticklabels(cat_labels[y])

    return fig
def plot_correlation(
    *dfs,
    feats=None,
    method="pearson",
    plot_diff=False,
    figcols=2,
    figsize=None,
    **kwargs,
):
    """Plot correlation between features in a dataframe.

    For each dataframe provided a subplot is generated showing a
    correlation heatmap of the features. For numeric features, the
    method can be any named method supported by
    `pandas.DataFrame.corr`; for categorical or object feature types
    `"cramers_v"` must be specified. If the method does not match the
    data type, an error is raised.

    The `plot_diff` parameter will also include a difference plot, i.e.
    correlation difference between two dataframes. This is only used
    when two dataframes are provided.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Any number of dataframes to plot. At least one is required.
    feats : list of str or None, default None
        Features to plot. Must be present in all elements of `dfs`. If
        `None` (default) or empty, uses features common to all
        dataframes, in the column order of the first dataframe.
    method : {"pearson", "spearman", "kendall", "cramers_v"}, \
default "pearson"
        Correlation method (case-insensitive). See
        `pandas.DataFrame.corr` for more details on `"pearson"`,
        `"spearman"` and `"kendall"`. When `"cramers_v"` is specified,
        correlation is calculated using
        `synthgauge.metrics.correlation.cramers_v`.
    plot_diff : bool, default False
        If `True` and exactly two dataframes are provided, will also
        plot a heatmap of the absolute differences between the
        respective datasets' correlations. Ignored otherwise.
    figcols : int, default 2
        Number of columns to use in the figure. Only used when more
        than one heatmap is drawn.
    figsize : tuple of float, optional
        Size of figure in inches `(width, height)`. Defaults to
        `matplotlib.pyplot.rcParams["figure.figsize"]`.
    **kwargs : dict, optional
        Any other keyword arguments to be passed to `seaborn.heatmap`.
        For example `annot=True` will turn on cell annotations. See
        documentation for more examples.

    Raises
    ------
    ValueError
        If no dataframe is given, if `method` is not supported, or if
        `method` does not match the data type(s) of `feats`.

    Returns
    -------
    matplotlib.figure.Figure
    """
    if not dfs:
        raise ValueError("At least one dataframe must be provided.")

    if isinstance(feats, str):
        feats = [feats]
    if feats is None or len(feats) == 0:
        # Preserve the first dataframe's column order so the heatmap
        # layout is deterministic (a set intersection is unordered).
        feats = [
            col
            for col in dfs[0].columns
            if all(col in other.columns for other in dfs[1:])
        ]
    else:
        feats = list(feats)

    # Normalise once so the check and the pandas call agree on the name
    method_key = method.lower()
    if method_key in ("pearson", "spearman", "kendall"):
        dtype_filter, dtype_desc = "number", "numeric"
    elif method_key == "cramers_v":
        dtype_filter, dtype_desc = ["object", "category"], "categorical"
    else:
        raise ValueError(f"Unsupported correlation method: {method}")

    corr_results = []
    for df in dfs:
        data = df[feats].select_dtypes(include=dtype_filter)
        if len(data.columns) == 0:
            raise ValueError(
                f"No {dtype_desc} columns available for method: {method}"
            )

        if method_key == "cramers_v":
            corr = _pairwise_cramers_v(data)
        else:
            corr = data.corr(method=method_key)

        corr_results.append(
            corr.dropna(axis=0, how="all").dropna(axis=1, how="all")
        )

    # Get min and max to set a consistent colourbar. Values are pooled
    # and NaNs removed so that matrices of differing shape or with
    # missing entries do not break the shared scale.
    pooled = np.concatenate(
        [np.asarray(corr, dtype=float).ravel() for corr in corr_results]
    )
    pooled = pooled[~np.isnan(pooled)]
    vmin = pooled.min() if pooled.size else None
    vmax = pooled.max() if pooled.size else None

    # For now only perform diff if 2 DataFrames are given; no more.
    # TODO: Allow all diff permutations?
    has_diff = bool(plot_diff) and len(corr_results) == 2
    if has_diff:
        corr_results.append((corr_results[0] - corr_results[1]).abs())

    n_subplots = len(corr_results)
    ncols = 1 if n_subplots == 1 else figcols
    nrows = int(np.ceil(n_subplots / ncols))

    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    flat_axes = axes.ravel()

    for ax_num, (ax, corr) in enumerate(zip(flat_axes, corr_results)):
        # Only treat the last panel as the difference plot when one was
        # actually computed, not merely because `plot_diff` was set.
        if has_diff and ax_num == n_subplots - 1:
            sp_title = "Correlation Difference"
            # Ignore vmin and vmax for this plot as scale will be
            # different to the others
            panel_vmin, panel_vmax = None, None
        else:
            sp_title = f"DataFrame {ax_num + 1} Correlation"
            panel_vmin, panel_vmax = vmin, vmax

        sns.heatmap(corr, ax=ax, vmin=panel_vmin, vmax=panel_vmax, **kwargs)
        ax.set_title(sp_title)

    # Turn off axes with no data
    for ax in flat_axes:
        if not ax.has_data():
            ax.set_visible(False)

    fig.tight_layout()

    return fig
def _cut_with_shared_bins(real_feat, synth_feat, bins):
    """Discretise a numeric feature of both datasets on common bin edges."""
    combined = pd.concat((real_feat, synth_feat))
    edges = np.histogram_bin_edges(combined.dropna(), bins)

    # The first edge equals the minimum value; `include_lowest=True`
    # keeps that value in the first bin instead of turning it into NaN.
    return (
        pd.cut(real_feat, edges, include_lowest=True),
        pd.cut(synth_feat, edges, include_lowest=True),
    )


def plot_crosstab(
    real,
    synth,
    x,
    y,
    x_bins="auto",
    y_bins="auto",
    figsize=None,
    cmap="rocket",
    **kwargs,
):
    """Plot cross-tabulation heatmap for two features.

    The two-feature crosstab calculation is performed and plotted as a
    heatmap. One heatmap is shown for the real data and one for the
    synthetic. Numeric features are discretised using the `*_bins`
    arguments, with bin edges computed from the real and synthetic
    values combined.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    x : str
        Feature to plot on the x-axis.
    y : str
        Feature to plot on the y-axis.
    x_bins, y_bins : array_like or int or str, default "auto"
        Binning method for axis. If `array_like`, must be sequence of
        bin edges. If `int`, specifies the number of bins to use. If
        `str`, can be anything accepted by
        `numpy.histogram_bin_edges`. Defaults to `"auto"`. Only used
        for numeric features.
    figsize : tuple of float, optional
        Size of figure in inches `(width, height)`. Defaults to
        `matplotlib.pyplot.rcParams["figure.figsize"]`.
    cmap : str, default "rocket"
        Palette name for heatmap and colour bar. See the documentation
        for `seaborn.color_palette` on available palette formats.
        Defaults to `"rocket"`.
    **kwargs : dict, optional
        Any other keyword arguments to be passed to `seaborn.heatmap`.
        For example, `annot=True` will turn on cell annotations. See
        documentation for more examples.

    Returns
    -------
    matplotlib.figure.Figure
    """
    # Collect x and y values
    real_x, real_y = real[x], real[y]
    synth_x, synth_y = synth[x], synth[y]

    # Discretise numeric features
    if is_numeric_dtype(pd.concat((real_x, synth_x))):
        real_x, synth_x = _cut_with_shared_bins(real_x, synth_x, x_bins)
    if is_numeric_dtype(pd.concat((real_y, synth_y))):
        real_y, synth_y = _cut_with_shared_bins(real_y, synth_y, y_bins)

    freq_real = pd.crosstab(real_x, real_y, dropna=False)
    freq_synth = pd.crosstab(synth_x, synth_y, dropna=False)

    fig, axes = plt.subplots(1, 2, figsize=figsize, constrained_layout=True)

    # Use same scale for real and synth
    vmin = min(freq_real.values.min(), freq_synth.values.min())
    vmax = max(freq_real.values.max(), freq_synth.values.max())

    # One shared colour bar is drawn from this mappable; the per-heatmap
    # colour bars are disabled below.
    cmap = sns.color_palette(cmap, as_cmap=True)
    mpbl = mpl.cm.ScalarMappable(mpl.colors.Normalize(vmin, vmax), cmap=cmap)

    for freq, ax, title in zip(
        (freq_real, freq_synth), axes, ("REAL", "SYNTH")
    ):
        sns.heatmap(
            freq.T,
            vmin=vmin,
            vmax=vmax,
            cmap=cmap,
            cbar=False,
            ax=ax,
            **kwargs,
        )
        ax.set_title(title)

    fig.colorbar(mpbl, ax=axes, shrink=0.8)

    return fig
def plot_qq(real, synth, feature, n_quantiles=None, figsize=None):
    """Generate a Q-Q plot for a feature of real and synthetic data.

    Quantile-quantile (Q-Q) plots are used to visualise two sets of
    numeric data to see if they are generated from the same
    distribution. In this case, it is used to provide some insight into
    the feature distributions for the synthetic and real data. If the
    scatter plot shows a straight line, then it can be inferred that
    the two distributions are similar and therefore the synthetically
    generated data follows the same distribution as the real data.

    Missing values are ignored when computing the quantiles.

    See `Q-Q Plot <https://en.wikipedia.org/wiki/Q-Q_plot>`_ for more
    information.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    feature : str
        Feature to plot. Must be in `real` and `synth`.
    n_quantiles : int or None, default None
        Number of quantiles to calculate. If `None` (default), uses the
        length of `real`.
    figsize: tuple of float, optional
        Size of figure in inches `(width, height)`. Defaults to
        `matplotlib.pyplot.rcParams["figure.figsize"]`.

    Raises
    ------
    TypeError
        If `feature` is not a numeric data type feature in either
        `real` or `synth`.

    Returns
    -------
    matplotlib.figure.Figure
    """
    # Validate both datasets, not just the real one
    for data in (real, synth):
        dtype = data[feature].dtype
        if not is_numeric_dtype(dtype):
            raise TypeError(
                f"The feature must be numeric not of type: {dtype}"
            )

    n_quantiles = n_quantiles or len(real)
    qs = np.linspace(0, 1, n_quantiles)

    # Drop missing values first: a single NaN would otherwise make every
    # quantile NaN and leave the plot empty.
    real_values = real[feature].dropna().to_numpy(dtype=float)
    synth_values = synth[feature].dropna().to_numpy(dtype=float)
    x = np.quantile(real_values, qs)
    y = np.quantile(synth_values, qs)

    fig, ax = plt.subplots(1, 1, figsize=figsize)
    ax.scatter(x, y)
    ax.set_xlabel("real data quantiles")
    ax.set_ylabel("synth data quantiles")
    ax.set_title(f"Q-Q Plot for {feature}")

    # Plot X = Y as the reference line for identical distributions
    min_xy = min(x.min(), y.min())
    max_xy = max(x.max(), y.max())
    ax.plot([min_xy, max_xy], [min_xy, max_xy])

    return fig
def plot_feat_density_diff(
    real, synth, feats=None, feat_bins=10, diff_bins=10, figsize=None
):
    """Plot real and synth feature density differences.

    For each feature, the density difference between `real` and `synth`
    is calculated using `metrics.density._feature_density_diff`.

    If a single feature is provided in `feats`, the plot shows the raw
    density differences for each bin in that feature.

    Where multiple features are provided, the density differences are
    concatenated into a flattened array and a histogram plotted. The
    histogram represents the distribution of differences in densities
    across all features and bins.

    Parameters
    ----------
    real : pandas.DataFrame
        Dataframe containing the real data.
    synth : pandas.DataFrame
        Dataframe containing the synthetic data.
    feats : list of str or None, default None
        Features used to compute the densities. If `None` (default) or
        empty, all common features are used. A single feature name
        given as a string is treated as a one-element list.
    feat_bins : str or int, default 10
        Bins to use for computing the feature densities. This value is
        passed to `numpy.histogram_bin_edges` so can be any value
        accepted by that function. By default, uses 10 bins.
    diff_bins : str or int, default 10
        Bins to use when computing the multiple-feature difference
        histogram. This value is passed to `numpy.histogram_bin_edges`
        so can be any value accepted by that function. By default, uses
        10 bins.
    figsize : tuple of float, optional
        Size of figure in inches `(width, height)`. Defaults to
        `matplotlib.pyplot.rcParams["figure.figsize"]`.

    Returns
    -------
    matplotlib.figure.Figure
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(feats, str):
        feats = [feats]
    # Explicit check rather than `feats or ...` so that array-likes such
    # as `pandas.Index` do not raise on truth-value evaluation.
    if feats is None or len(feats) == 0:
        feats = real.columns.intersection(synth.columns)

    if len(feats) == 1:
        feature = feats[0]
        diff_hist, diff_edges = _feature_density_diff(
            real, synth, feature, feat_bins
        )
        xlabel = f"{feature} Binned"
        ylabel = "Density Difference"
        title = f"Feature Density Difference for {feature}"

    else:
        # TODO: option to have different bins for each feature
        diffs = [
            _feature_density_diff(real, synth, feat, feat_bins)[0]
            for feat in feats
        ]
        diff_hist, diff_edges = np.histogram(
            np.concatenate(diffs), bins=diff_bins
        )

        xlabel = "Difference Bins"
        ylabel = "Count"
        title = "Histogram of Density Differences"

    fig, ax = plt.subplots(figsize=figsize)

    # default bar width is too large so use scaled bin size
    bar_width = (diff_edges[1] - diff_edges[0]) * 0.8
    ax.bar(diff_edges[:-1], diff_hist, align="edge", width=bar_width)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    return fig