Source code for traintestdiff.core

from __future__ import division
from itertools import product

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

__alll__ = [
    "datasets_from_frame", "categorical_longform", "continuous_longform",
    "plot_categorical_diff", "plot_continuous_diff", "TrainTestDiff"
]


def _check_features_presence(datasets, features):
    for feature in features:
        for name, dataset in datasets.items():
            if feature not in dataset.columns:
                message = "`{}` feature missing in `{}`".format(feature, name)
                raise KeyError(message)


[docs]def datasets_from_frame(dataframe, feature):
    """Creates a dict dataset from a dataframe

    Given a categorical feature it creates a dict where each key is
    a level of the feature and each value is a dataframe, then you
    can use this datasets dict to plot graphs

    Args:
        dataframe (pandas.DataFrame): the frame that you're going to
            use to create a dict datasets
        feature (str): this feature will be used for grouping and
            creating the datasets dict

    Returns:
        dict: A :class:`dict` where keys are levels of ``feature`` and values
        are :class:`pandas.core.frame.DataFrame` from a ``dataframe.groupby(feature)``

    Raises:
        KeyError: if ``feature`` is not present in ``dataframe``
    """
    grouped = dataframe.groupby([feature])
    datasets = dict(e for e in grouped)

    return datasets


# Long Form Data


def _cat_longform(dataset, name, feature):
    data = dataset[feature].value_counts().reset_index()
    data.columns = ['level', 'count']
    data['feature'] = feature
    data['dataset'] = name
    data['prop'] = data['count'] / dataset.shape[0]

    return data


def _cont_longform(dataset, name, feature):
    data = pd.DataFrame()
    data['dataset'] = np.repeat(name, dataset.shape[0])
    data['feature'] = np.repeat(feature, dataset.shape[0])
    data['value'] = dataset[feature]

    return data


def _longform_frame(datasets, features, func):
    _check_features_presence(datasets, features)

    data_grid = product(datasets.items(), features)
    data = [func(d, n, f) for (n, d), f in data_grid]
    data = pd.concat(data)

    return data


[docs]def categorical_longform(datasets, features):
    """Given datasets and features it returns a long form representation of it

    Args:
        datasets (dict): each key is a dataset name and each value is a ``pandas.DataFrame``
        features (list): a list of string features present in the datasets

    Returns:
        pandas.core.frame.DataFrame: A tidy data long form

    Raises:
        KeyError: if any of the ``features`` isn't present in the ``datasets`` dict
    """
    longform = _longform_frame(datasets, features, _cat_longform)

    return longform


[docs]def continuous_longform(datasets, features):
    """Given datasets and features it returns a long form representation of it

    Args:
        datasets (dict): each key is a dataset name and each value is a ``pandas.DataFrame``
        features (list): a list of string features present in the datasets

    Returns:
        pandas.core.frame.DataFrame: A tidy data longform dataframe

    Raises:
        KeyError: if any of the ``features`` isn't present in the ``datasets`` dict
    """
    longform = _longform_frame(datasets, features, _cont_longform)

    return longform


# Plot Style config

TITLE_FONTSIZE = 20
TITLE_YSPACE = 1.06


[docs]def plot_continuous_diff(datasets,
                         features,
                         kind="box",
                         col_wrap=3,
                         size=4,
                         aspect=1,
                         title=None):
    """Plots the distribution differences of continuous features in each dataset

    Args:
        datasets (dict): a dict where the keys are names and the values
            are ``pandas.DataFrame``
        features (list): a list of continuous features present in every
            dataset of ``datasets``
        kind (str): {point, bar, box, violin, strip}
            The kind of plot to draw.
        col_wrap (int): how many charts you want per row
        size (float): Height (in inches)
        aspect (float): Aspect ratio of each facet, so that aspect * size gives the width
            of each facet in inches
        title (str): the title of the figure

    Returns:
        (pandas.core.frameDataFrame, matplotlib.Figure): a tuple with a longform data frame
        and matplotlib figure to customize

    Raises:
        KeyError: if any of the ``features`` isn't present in the ``datasets`` dict
    """
    if title is None:
        title = "{} differences".format("/".join(datasets.keys()))

    data = continuous_longform(datasets, features)
    grid = sns.factorplot(
        x="dataset",
        y="value",
        col="feature",
        data=data,
        kind=kind,
        sharey=False,
        size=size,
        aspect=aspect,
        col_wrap=col_wrap)

    grid.fig.suptitle(title, y=TITLE_YSPACE, fontsize=TITLE_FONTSIZE)

    return data, grid.fig


[docs]def plot_categorical_diff(datasets,
                          features,
                          kind="prop",
                          col_wrap=4,
                          size=4,
                          aspect=1,
                          title=None):
    """Plots the distribution differences of categorical features in each dataset

    Args:
        datasets (dict): a dict where the keys are names and the values
            are ``pandas.DataFrame``
        features (list): a list of categorical features present in every
            dataset of ``datasets``
        kind (Optional[str]): {count, prop}
            Use "count" for count of unique values for every level of a feature
            in every dataset present in ``datasets``
            Use "prop" for the proportion of that level of a feature
        col_wrap (int): how many charts you want per row
        size (float): Height (in inches)
        aspect (float): Aspect ratio of each facet, so that aspect * size gives the
            width of each facet in inches
        title (str): the title of the figure

    Returns:
        (pandas.core.frameDataFrame, matplotlib.Figure): a tuple with a longform data frame
        and matplotlib figure to customize


    Raises:
        KeyError: if any of the ``features`` isn't present in the ``datasets`` dict
    """
    if title is None:
        title = "{} differences".format("/".join(datasets.keys()))

    longform_data = categorical_longform(datasets, features)

    # Group longform and sort by `features` order
    grouped_features = longform_data.groupby(['feature'])

    grouped_features = sorted(
        grouped_features, key=lambda x: features.index(x[0]))

    ncol = col_wrap
    n_axes = len(features)
    nrow = int(np.ceil(n_axes / col_wrap))

    figsize = (ncol * size * aspect, nrow * size)
    fig = plt.figure(figsize=figsize)
    fig.suptitle(title, y=TITLE_YSPACE, fontsize=TITLE_FONTSIZE)
    plt.subplots_adjust(wspace=0.5, hspace=0.35)

    axes = np.empty(n_axes, object)
    for i in range(n_axes):
        axes[i] = fig.add_subplot(nrow, ncol, i + 1)

    data_grid = zip(grouped_features, axes)
    for (name, data), ax in data_grid:
        sns.barplot(
            x="level", y=kind, hue="dataset", data=data, ax=ax).set_title(name)

    return longform_data, fig


[docs]class TrainTestDiff(object):
    """ Helper class to ease distribution analysis on the same datasets"""

    def __init__(self, datasets):
        self.datasets = datasets

[docs]    def plot_cont_diff(self,
                       features,
                       kind="box",
                       col_wrap=3,
                       size=4,
                       aspect=1,
                       title=None):
        """ See :func:`plot_continuous_diff`"""
        return plot_continuous_diff(self.datasets, features, kind, col_wrap,
                                    size, aspect, title)

[docs]    def plot_cat_diff(self, features, col_wrap=3, kind="prop", title=None):
        """ See :func:`plot_categorical_diff`"""
        return plot_categorical_diff(
            self.datasets, features, kind=kind, col_wrap=col_wrap, title=title)