Exploratory Data Analysis 数据探索

Do EDA first. Do not immediately dig into modelling.

  • Get domain knowledge
    It helps you understand the problem more deeply.
  • Check if the data is intuitive
    And agrees with domain knowledge.
  • Understand how the data was generated
    This is crucial for setting up a proper validation scheme.

Data Overview

1
2
3
4
5
6
## Quick structural overview of the data.
## NOTE(review): `df` and `x` are not defined in this snippet; presumably `df`
## is a pandas DataFrame and `x` one of its columns (a Series) -- confirm.
## Data type of each column
df.dtypes
## Summary printout for the frame
df.info()
## Frequency of each distinct value in x
x.value_counts()
## Boolean mask marking the missing entries of x
x.isnull()
## First few rows
df.head()
## Dimensions of the frame
df.shape

Visualization Explained

Image Loading
Image Loading
Image Loading
Image Loading
Image Loading
Image Loading

Visualization

statistics

statistics

Image Loading

1
2
3
## Basic summary statistics.
## NOTE(review): `df` and `x` are not defined in this snippet; presumably a
## pandas DataFrame and one of its columns -- confirm.
## Descriptive statistics for the frame
df.describe()
## Mean of x
x.mean()
## Variance of x
x.var()

boxplot and histogram

Image Loading

1
2
3
4
5
6
7
8
9
10
11
12
def plotstats(df, col):
    """Plot a horizontal box plot stacked above a histogram for one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to plot.
    col : str
        Name of the column to plot.

    Returns
    -------
    list
        A one-element list holding ``col``.
    """
    import matplotlib.pyplot as plt

    ## Setup for plotting two charts one over the other
    fig, ax = plt.subplots(2, 1, figsize=(12, 8))
    ## First a box plot (rows with any missing value are dropped first)
    df.dropna().boxplot(col, ax=ax[0], vert=False, return_type='dict')
    ## Plot the histogram.
    ## Series.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    ## Missing values are dropped because hist() cannot derive a finite bin
    ## range from data that contains NaN.
    temp = df[col].dropna().to_numpy()
    ax[1].hist(temp, bins=30, alpha=0.7)
    ## Label the histogram axes explicitly instead of relying on pyplot's
    ## notion of the "current" axes.
    ax[1].set_ylabel('Number of Cars')
    ax[1].set_xlabel(col)
    return [col]

Bar Plot the Categorical Features

比例比绝对数目重要。
Image Loading

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
## Plot categorical variables as bar plots
def income_barplot(df):
    """Draw side-by-side bar plots of value counts, split by income class.

    Every column except the last one is considered; columns whose dtype is
    int64, int32 or float64 are skipped. For each remaining column one figure
    is created with two panels sharing the same y-range: value counts for
    rows where ``df['income'] == '<=50K'`` (left) and for rows where
    ``df['income'] == '>50K'`` (right).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'income'`` column. The last column is skipped
        (presumably because it is the label itself -- confirm with caller).

    Returns
    -------
    str
        The literal ``'Done'``.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    cols = df.columns.tolist()[:-1]
    for col in cols:
        if df[col].dtype not in [np.int64, np.int32, np.float64]:
            ## The .ix indexer was removed in pandas 1.0; .loc is the
            ## label-based replacement.
            temp1 = df.loc[df['income'] == '<=50K', col].value_counts()
            temp0 = df.loc[df['income'] == '>50K', col].value_counts()

            ## Common y-range so both panels are directly comparable
            ylim = [0, max(max(temp1), max(temp0))]
            fig = plt.figure(figsize=(12, 6))
            fig.clf()
            ax1 = fig.add_subplot(1, 2, 1)
            ax0 = fig.add_subplot(1, 2, 2)
            temp1.plot(kind='bar', ax=ax1, ylim=ylim)
            ax1.set_title('Values of ' + col + '\n for income <= 50K')
            temp0.plot(kind='bar', ax=ax0, ylim=ylim)
            ax0.set_title('Values of ' + col + '\n for income > 50K')
    return 'Done'

## Draw the bar plots. NOTE(review): `income` is not defined in this snippet;
## presumably the income DataFrame loaded earlier -- confirm.
income_barplot(income)

Box Plot the Numeric Features, Conditioned on the Label Value.

Image Loading

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
## Plot numeric variables as box plots, grouped by the income label
def income_boxplot(df):
    """Draw one box plot per numeric column, grouped by ``'income'``.

    Every column except the last one is considered; only columns whose dtype
    is int64, int32 or float64 are plotted, each on its own figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'income'`` column to group by. The last column is
        skipped (presumably because it is the label itself -- confirm).

    Returns
    -------
    str
        The literal ``'Done'``.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    cols = df.columns.tolist()[:-1]
    for col in cols:
        if df[col].dtype in [np.int64, np.int32, np.float64]:
            fig = plt.figure(figsize=(6, 6))
            fig.clf()
            ax = fig.gca()
            ## The grouping keyword of DataFrame.boxplot is `by`; the
            ## original `byk` was a typo and is not a valid argument.
            df.boxplot(column=[col], ax=ax, by=['income'])
    return 'Done'

## Draw the box plots. NOTE(review): `income` is not defined in this snippet;
## presumably the income DataFrame loaded earlier -- confirm.
income_boxplot(income)

Pair-Wise Scatter Plot

大致看看每列之间的关系(不过这种是针对Regression问题的)。用seaborn包的pairplot。
Image Loading

1
2
3
import seaborn as sns

## Columns to compare pair-wise.
## NOTE(review): `auto_price` is not defined in this snippet; presumably the
## automobile price DataFrame loaded earlier -- confirm.
num_cols = ['length', 'curb-weight', 'engine-size', 'horsepower', 'city-mpg', 'price', 'fuel-type']
## seaborn 0.9 renamed the `size` keyword of pairplot to `height`
## (height of each facet, in inches).
sns.pairplot(auto_price[num_cols], height=2)

Conditioned Histograms

一般是一个数据值以一个 categorical 为 condition 的 histogram。
Image Loading

1
2
3
4
5
6
7
8
9
10
11
12
13
## Histograms of each requested column, one panel per value of grid_col
def cond_hists(df, plot_cols, grid_col):
    """Plot conditioned histograms with a seaborn FacetGrid.

    For every name in ``plot_cols`` a new FacetGrid is built over ``df`` with
    one column of panels per distinct value of ``grid_col``, and a matplotlib
    histogram of that column is mapped onto each panel.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    plot_cols : iterable of str
        Columns to draw histograms for.
    grid_col : str
        Column whose values define the facet panels.

    Returns
    -------
    The ``grid_col`` argument, unchanged.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    ## One grid per plotted column
    for column_name in plot_cols:
        facets = sns.FacetGrid(df, col=grid_col)
        facets.map(plt.hist, column_name, alpha=.7)
    return grid_col

## Define columns for making a conditioned histogram
plot_cols = ['length', 'curb-weight', 'engine-size', 'city-mpg', 'price']
## One histogram grid per column above, faceted by 'drive-wheels'.
## NOTE(review): `auto_price` is not defined in this snippet -- confirm.
cond_hists(auto_price, plot_cols, 'drive-wheels')

Conditioned Box Plot

Image Loading

1
2
3
4
5
6
7
8
9
10
11
12
## Box plots of each requested column, grouped by another column
def auto_boxplot(df, plot_cols, by):
    """Draw one grouped box plot per column in ``plot_cols``.

    Each column gets its own 9x6 figure; the boxes are grouped by the values
    of the ``by`` column, and the axes receive a title and a y-label naming
    the plotted column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    plot_cols : iterable of str
        Columns to draw box plots for.
    by : str
        Column to group by (concatenated into the title, so it must be a
        string).

    Returns
    -------
    The ``by`` argument, unchanged.
    """
    import matplotlib.pyplot as plt

    for name in plot_cols:
        axes = plt.figure(figsize=(9, 6)).gca()
        df.boxplot(column=name, by=by, ax=axes)
        axes.set_title('Box plots of ' + name + ' by ' + by)
        axes.set_ylabel(name)
    return by

## Box plots grouped by 'drive-wheels'. NOTE(review): `auto_price` and
## `plot_cols2` are not defined in this snippet -- confirm where they come from.
auto_boxplot(auto_price, plot_cols2, 'drive-wheels')

Scatter Plot

通过使用颜色,可以在二维plot上看看三维信息
Image Loading

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
## Create scatter plot
def auto_scatter(df, plot_cols):
    """Scatter-plot ``'price'`` against each column, coloured by fuel type.

    For every name in ``plot_cols`` an 8x8 figure is created; rows with
    ``'fuel-type' == 'gas'`` are drawn in dark blue and rows with
    ``'fuel-type' == 'diesel'`` in red, on the same axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'fuel-type'`` and ``'price'`` columns.
    plot_cols : iterable of str
        Columns to use on the x-axis.

    Returns
    -------
    The ``plot_cols`` argument, unchanged.
    """
    import matplotlib.pyplot as plt

    ## The .ix indexer was removed in pandas 1.0; .loc is the replacement.
    ## The split does not depend on the plotted column, so do it once.
    gas = df.loc[df['fuel-type'] == 'gas']
    diesel = df.loc[df['fuel-type'] == 'diesel']

    for col in plot_cols:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.gca()
        ## Skip empty groups: there is nothing to draw for them
        if gas.shape[0] > 0:
            gas.plot(kind='scatter', x=col, y='price', ax=ax, color='DarkBlue')
        if diesel.shape[0] > 0:
            diesel.plot(kind='scatter', x=col, y='price', ax=ax, color='Red')
        ax.set_title('Scatter plot of price vs. ' + col)
    return plot_cols

## Define columns for making scatter plots
plot_cols = ['length', 'curb-weight', 'engine-size', 'city-mpg']
## One price-vs-column scatter plot per entry above.
## NOTE(review): `auto_price` is not defined in this snippet -- confirm.
auto_scatter(auto_price, plot_cols)

Conditioned Scatterplot

较难解读。
Image Loading

1
2
3
4
5
6
7
8
def cond_plot(cols):
    """Draw conditioned scatter plots of ``'price'`` against each column.

    For every name in ``cols`` a seaborn FacetGrid is built over the
    module-level ``auto_price`` frame, with panels laid out by
    ``'num-cylinders'`` (columns) and ``'body-style'`` (rows) and points
    coloured by ``'fuel-type'``. ``sns.regplot`` is mapped onto the grid with
    the regression fit disabled, so only the scatter points are drawn.

    Parameters
    ----------
    cols : iterable of str
        Columns of ``auto_price`` to use on the x-axis.

    Returns
    -------
    None
    """
    ## The original `import Ipython.html.widgets` was dropped: the package
    ## name is misspelled (it is `IPython`) and nothing here used it.
    import seaborn as sns

    for col in cols:
        ## FacetGrid's keyword is `margin_titles` (plural); `margin_title`
        ## is not an accepted argument.
        g = sns.FacetGrid(auto_price, col='num-cylinders', row='body-style',
                          hue='fuel-type', palette='Set2', margin_titles=True)
        g.map(sns.regplot, col, 'price', fit_reg=False)

## Draw the conditioned scatter plots. NOTE(review): `plot_cols3` is not
## defined in this snippet -- confirm where it comes from.
cond_plot(plot_cols3)

t-test

(对于两个来源非常相近的值,比如妈妈和女儿的身高,可以用t-test比较两者的mean是否有显著差异。)会用到statsmodels.stats.weightstats来计算two-sided t statistics。
Image Loading
Image Loading
Image Loading

Ref

[1] Coursera - How to Win a Data Competition
[2] Edx - Data Science Essentials