# Source code for henchman.diagnostics

# -*- coding: utf-8 -*-

'''The diagnostics module. Describe a particular dataset.
'''
import numpy as np
import pandas as pd


def title(string):
    '''Print ``string`` inside an ASCII box, preceded by a blank line.

    Args:
        string: The text to show; it is rendered with ``str.format``.
    '''
    boxed = '|  {}  |'.format(string)
    # The rule spans the boxed text, with a '+' replacing each end bar.
    rule = '+{}+'.format('-' * (len(boxed) - 2))
    for line in ('', rule, boxed, rule):
        print(line)


def subtitle(string):
    '''Print ``string`` as a ``## ... ##`` heading after a blank line.

    Args:
        string: The text to show; it is rendered with ``str.format``.
    '''
    heading = '## {} ##'.format(string)
    print('\n' + heading)


def overview(data):
    '''Give a brief data overview.

    Prints four boxed sections: the data shape, missing-value counts,
    memory usage and the number of columns per dtype.

    Args:
        data (pd.DataFrame): The dataframe for which to give an overview.

    Example:
        >>> from henchman.diagnostics import overview
        >>> overview(df)
    '''
    n_rows, n_cols = data.shape

    title('Data Shape')
    print('Number of columns: {}'.format(n_cols))
    print('Number of rows: {}'.format(n_rows))

    title('Missing Values')
    missing_values = data.isnull().sum()
    # Take the maximum directly instead of sorting and indexing with
    # ``[-1]``: the Series is indexed by column label, so ``[-1]`` is a
    # label lookup that fails for integer column labels and relies on a
    # deprecated positional fallback for string labels.
    print('Most values missing from column: {}'.format(missing_values.max()))
    print('Average missing values by column: {:.2f}'.format(
        missing_values.mean()))

    title('Memory Usage')
    # Bytes -> megabytes.
    memory_used = data.memory_usage(deep=True) / 1000000
    print('Total memory used: {:.2f} MB'.format(memory_used.sum()))
    print('Average memory by column: {:.2f} MB'.format(memory_used.mean()))

    title('Data Types')
    # One row per column holding its dtype; grouping on that value and
    # counting yields the number of columns of each dtype.
    dtypes = pd.DataFrame([data[col].dtype for col in data])
    print(dtypes.reset_index().groupby(0).count())
def _find_duplicates(data):
    '''Print the number of duplicated rows, if there are any.

    Args:
        data (pd.DataFrame): The dataframe to check.
    '''
    n_duplicates = int(data.duplicated().sum())
    if n_duplicates > 0:
        print('DataFrame has {} duplicates'.format(n_duplicates))


def _find_correlations(data, corr_thresh):
    '''Print every pair of columns whose correlation exceeds a threshold.

    Each unordered pair is reported once, with the column that appears
    first in the dataframe named first.

    Args:
        data (pd.DataFrame): The dataframe to check.
        corr_thresh (float): Report pairs whose absolute correlation is
            strictly greater than this value.
    '''
    # Restrict to numeric and boolean columns up front: recent pandas
    # versions raise in ``corr`` when the frame holds e.g. string columns.
    numeric = data.select_dtypes(include=['number', 'bool'])
    correlations = numeric.corr()
    columns = list(correlations.columns)
    values = correlations.values

    # Walk one triangle of the symmetric matrix so each pair is visited
    # exactly once and the diagonal (a column against itself) is skipped
    # by position rather than by testing for a correlation of exactly 1,
    # which would also hide perfectly correlated distinct columns.
    for i, col in enumerate(columns):
        for j in range(i + 1, len(columns)):
            value = values[j, i]
            # NaN correlations compare False and are silently skipped.
            if abs(value) > corr_thresh:
                print('{} and {} are linearly correlated: {:.3f}'.format(
                    col, columns[j], value))


def _find_missing(data, missing_thresh):
    '''Print the columns whose share of missing values is too high.

    Args:
        data (pd.DataFrame): The dataframe to check.
        missing_thresh (float): Report columns in which more than this
            fraction of the rows is missing.
    '''
    n_rows = data.shape[0]
    # ``items`` rather than ``iteritems``, which pandas 2.0 removed.
    for index, value in data.isnull().sum().items():
        if value > (n_rows * missing_thresh):
            print('{} has {} missing values: ({}% of total)'.format(
                index, value, 100 * value / n_rows))


def _find_high_card(data, card_thresh):
    '''Print the object-dtype columns that have many distinct values.

    Args:
        data (pd.DataFrame): The dataframe to check.
        card_thresh (int): Report object columns with more than this
            many unique values.
    '''
    objects = [col for col in data if data[col].dtype == 'O']
    # ``items`` rather than ``iteritems``, which pandas 2.0 removed.
    for index, value in data[objects].nunique().items():
        if value > card_thresh:
            print('{} has many unique values: {}'.format(index, value))
def warnings(data, corr_thresh=.9, missing_thresh=.1, card_thresh=50):
    '''Warn about common dataset problems.

    Runs four checks in order and prints their findings under a
    'Warnings' title: duplicated rows, highly linearly correlated
    columns, columns with many missing values and categorical columns
    with many unique values.

    Args:
        data (pd.DataFrame): The dataframe to warn about.
        corr_thresh (float): Warn above this threshold (Default .9)
        missing_thresh (float): Warn above this threshold (Default .1)
        card_thresh (int): Warn above this threshold (Default 50).

    Example:
        >>> from henchman.diagnostics import warnings
        >>> warnings(df, corr_thresh=.5)
    '''
    title('Warnings')
    _find_duplicates(data)
    # The remaining checks each take the dataframe plus one threshold.
    thresholded_checks = (
        (_find_correlations, corr_thresh),
        (_find_missing, missing_thresh),
        (_find_high_card, card_thresh),
    )
    for check, threshold in thresholded_checks:
        check(data, threshold)
def _print_missing(datacol):
    '''Print the number of missing values in ``datacol`` if non-zero.'''
    missing = datacol.isnull().sum()
    if missing > 0:
        print('Missing: {}'.format(missing))


def _object_column_summary(data, objects):
    '''Print unique count, mode and missing count for object columns.

    Args:
        data (pd.DataFrame): The dataframe holding the columns.
        objects (list): Labels of the columns to summarise.
    '''
    title('Object Column Summary')
    for col in objects:
        subtitle(col)
        datacol = data[col]
        print('Unique: {}'.format(len(datacol.unique())))
        mode = datacol.mode().values
        if len(mode) != 1:
            # Several values tie for most frequent, or there is no value
            # at all (an all-missing or zero-row column yields an empty
            # mode, which previously raised IndexError on ``mode[0]``).
            print('Mode: No Mode')
        else:
            mode = mode[0]
            nummode = (100.0 * datacol[datacol == mode].shape[0]
                       / datacol.shape[0])
            print('Mode: {}, (matches {:.1f}% of rows)'.format(mode, nummode))
        _print_missing(datacol)


def _time_column_summary(data, times):
    '''Print the latest and earliest value of each time column.

    Args:
        data (pd.DataFrame): The dataframe holding the columns.
        times (list): Labels of the columns to summarise.
    '''
    title('Time Column Summary')
    for col in times:
        subtitle(col)
        datacol = data[col]
        print('Last Time: {}'.format(datacol.max()))
        print('First Time: {}'.format(datacol.min()))


def _boolean_column_summary(data, bools):
    '''Print true/false counts and percentages for boolean columns.

    Args:
        data (pd.DataFrame): The dataframe holding the columns.
        bools (list): Labels of the columns to summarise.
    '''
    title('Boolean Column Summary')
    for col in bools:
        subtitle(col)
        datacol = data[col]
        numtrue = float(datacol.sum())
        total = datacol.shape[0]
        # A zero-row column has no meaningful percentage; report NaN
        # instead of raising ZeroDivisionError.
        perctrue = 100 * numtrue / total if total else float('nan')
        print('Number True: {}, Number False: {}, Mean: {:.2f}'.format(
            numtrue, total - numtrue, datacol.mean()))
        print('Percent True: {:.1f}% | Percent False: {:.1f}%'.format(
            perctrue, 100 - perctrue))
        _print_missing(datacol)


def _numeric_column_summary(data, numbers):
    '''Print extremes, mean, quartiles and missing count per column.

    Args:
        data (pd.DataFrame): The dataframe holding the columns.
        numbers (list): Labels of the columns to summarise.
    '''
    title('Numeric Column Summary')
    for col in numbers:
        subtitle(col)
        datacol = data[col]
        print('Maximum: {}, Minimum: {}, Mean: {:.2f}'.format(
            datacol.max(), datacol.min(), datacol.mean()))
        # NOTE(review): the two literals concatenate without a space, so
        # the output reads 'Median: x| Quartile 1'. Left unchanged to
        # keep the printed text stable.
        print('Quartile 3: {:.2f} | Median: {:.2f}'
              '| Quartile 1: {:.2f}'.format(datacol.quantile(.75),
                                            datacol.quantile(.5),
                                            datacol.quantile(.25)))
        _print_missing(datacol)
def column_report(data):
    '''Give column summaries according to pandas dtype.

    Columns are sorted into objects, times, booleans and numbers, and a
    summary section is printed for every group that is not empty, in
    that order. Columns of any other dtype are not reported.

    Args:
        data (pd.DataFrame): The dataframe on which to report.

    Example:
        >>> from henchman.diagnostics import column_report
        >>> column_report(df)
    '''
    types = pd.api.types
    objects, times, bools, numbers = [], [], [], []
    for col in data:
        dtype = data[col].dtype
        if dtype == 'O':
            objects.append(col)
        elif types.is_datetime64_any_dtype(dtype):
            # Also matches timezone-aware and non-nanosecond datetimes,
            # which a comparison against '<M8[ns]' misses.
            times.append(col)
        elif dtype == 'bool':
            bools.append(col)
        elif types.is_integer_dtype(dtype) or types.is_float_dtype(dtype):
            # Covers every integer and float width (int8, unsigned,
            # nullable, ...) rather than a hard-coded list of six names.
            numbers.append(col)

    if objects:
        _object_column_summary(data, objects)
    if times:
        _time_column_summary(data, times)
    if bools:
        _boolean_column_summary(data, bools)
    if numbers:
        _numeric_column_summary(data, numbers)
def profile(data, corr_thresh=.9, missing_thresh=.1, card_thresh=50):
    '''Profile dataset.

    Prints, in order, the dataset overview, the warnings and the
    per-column report.

    Args:
        data (pd.DataFrame): The dataframe to profile.
        corr_thresh (float): Warn above this threshold (Default .9)
        missing_thresh (float): Warn above this threshold (Default .1)
        card_thresh (int): Warn above this threshold (Default 50)

    Example:
        >>> from henchman.diagnostics import profile
        >>> profile(df, missing_thresh=.3, card_thresh=10)
    '''
    overview(data)
    # The thresholds only affect the warnings section.
    warnings(data,
             corr_thresh=corr_thresh,
             missing_thresh=missing_thresh,
             card_thresh=card_thresh)
    column_report(data)