May 25, 2017
Exploratory Data Analysis of the data from the Sberbank Kaggle competition. This kernel is based on the A Very Extensive Exploratory Analysis in Python notebook.
Its purpose is to get familiar with the data, to prepare its preprocessing, and to decide which machine learning algorithms to use.
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable
import datetime
color = sns.color_palette()
%matplotlib inline
pd.options.mode.chained_assignment = None # default='warn'
pd.set_option('display.max_columns', 500)
train_df = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
train_df['price_doc_log'] = np.log1p(train_df['price_doc'])
macro_df = pd.read_csv("../input/macro.csv", parse_dates=['timestamp'])
# Attach the daily mean log price from the training set to the macro table by date,
# so that the macro correlations computed below compare values from the same day.
daily_price_log = train_df.groupby('timestamp')['price_doc_log'].mean()
macro_df['price_doc_log'] = macro_df['timestamp'].map(daily_price_log)
train_df.shape
train_df.head()
macro_df.shape
macro_df.head()
train_na = (train_df.isnull().sum() / len(train_df)) * 100
train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False)
## Proportion of missing values per variable
f, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=train_na.index, y=train_na)
_ = ax.set(title='Percent missing data by feature', ylabel='% missing')
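The worst offenders can also be listed numerically; for instance:
## Features with the highest proportion of missing values
train_na.head(10)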
{'null values': sum(train_df['price_doc'].isnull()), 'zero values': sum(train_df['price_doc'] == 0)}
train_df['price_doc'].describe()
train_df['price_doc_log'].describe()
## Distribution of the price
f, ax = plt.subplots(figsize=(12, 8))
#logi = np.logical_not((train_df['price_doc'].isnull()) | (train_df['price_doc'] == 0))
prices = train_df.loc[:, 'price_doc']
sns.distplot(a=prices, bins=100, kde=False)
ax.set(title='Distribution of price (log-log scale)', ylabel='counts')
ax.set_yscale('log')
ax.set_xscale('log')
## Distribution of the log price
f, ax = plt.subplots(figsize=(12, 8))
#logi = np.logical_not((train_df['price_doc'].isnull()) | (train_df['price_doc'] == 0))
log_prices = train_df.loc[:, 'price_doc_log']
sns.distplot(a=log_prices, bins=100, kde=True)
_ = ax.set(title='Distribution of log price', ylabel='counts')
#ax.set_yscale('log')
#ax.set_xscale('log')
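The log transform is used to reduce the heavy right skew of the prices; a quick check could compare the skewness of both versions:
## Skewness before and after the log transform
{'price_doc': train_df['price_doc'].skew(), 'price_doc_log': train_df['price_doc_log'].skew()}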
Studying house prices in relation to the intrinsic characteristics of the property.
## Studying the internal characteristics
internal_chars = ['full_sq', 'life_sq', 'floor', 'max_floor', 'build_year', 'num_room', 'kitch_sq', 'state', 'price_doc']
train_df[internal_chars].head()
## Proportion of missing values per variable
train_na = (train_df[internal_chars].isnull().sum() / len(train_df)) * 100
train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False)
f, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=train_na.index, y=train_na)
_ = ax.set(title='Percent missing data by feature', ylabel='% missing')
Data quality issues for each variable, and study of the distribution of each variable's values.
# state should take discrete values between 1 and 4.
train_df['state'].value_counts()
# There is a 33 in it that is clearly a data entry error.
# Let's just replace it with the mode.
train_df.loc[train_df['state'] == 33, 'state'] = train_df['state'].mode()[0]
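A quick check could confirm that only the expected values remain:
## state should now only contain values 1 to 4 (plus missing values)
train_df['state'].value_counts(dropna=False)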
f, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(x='state', y='price_doc_log', data=train_df, scale='count')
#sns.swarmplot(data=train_df[internal_chars], color="w", alpha=.2)
#ax.set_yscale('log')
ax.set_ylabel('Log price')
_ = plt.title("Distribution of log prices by state of the building")
#sns.violinplot(x="square meters", y="price_doc
# Study of the build year (some errors are easy to spot)
train_df['build_year'].value_counts()
# Some of the values make no sense.
# build_year has an erroneous value of 20052009. Since it's unclear which year it should be, let's replace it with 2007.
train_df.loc[train_df['build_year'] == 20052009, 'build_year'] = 2007
train_df.loc[train_df['build_year'] < 1500, 'build_year'] = np.nan
train_df.loc[train_df['build_year'] > 2018, 'build_year'] = np.nan
_ = train_df['build_year'].hist(bins=100)
## Temporal distribution of the building years
## There are probably a couple of other problematic values left
f, ax = plt.subplots(figsize=(12, 8))
logi = np.logical_not((train_df['build_year'].isnull()) | (train_df['build_year'] == 0))
nonnulbuildyears = train_df.loc[logi, 'build_year']
sns.distplot(a=nonnulbuildyears, bins=100, kde=False)
#sns.distplot(a=nonnulbuildyears, bins=100, kde=True)
ax.set(title='Temporal distribution of building years', ylabel='counts')
ax.set_yscale('log')
ax.set_xlim(1650, 2018)
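After the cleaning, the remaining range of building years could be summarised, for instance:
## Summary of the cleaned build_year values
train_df['build_year'].describe()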
area_chars = ['full_sq', 'life_sq', 'kitch_sq']
# Imputation for the outliers
#train_df[internal_chars].loc[train_df[internal_chars] == 0].head()
train_df.loc[train_df['full_sq'] > 2000, 'full_sq'] = train_df['full_sq'].mean()
train_df.loc[train_df['kitch_sq'] > train_df['full_sq'], 'kitch_sq'] = np.nan
train_df.loc[train_df['kitch_sq'] > train_df['life_sq'], 'kitch_sq'] = np.nan
train_df.loc[train_df['life_sq'] > train_df['full_sq'], 'life_sq'] = np.nan
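A quick count could show how many area values are missing after applying these consistency rules:
## Missing values in the area features after the imputation rules above
train_df[area_chars].isnull().sum()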
def plot_notnull_distributions(train_df, name_feats):
    # Plot the distribution of the non-null, non-zero values of each feature
    for name_fea in name_feats:
        logi = np.logical_not((train_df[name_fea].isnull()) | (train_df[name_fea] == 0))
        nonnullvals = train_df.loc[logi, name_fea]
        sns.distplot(a=nonnullvals, bins=100, kde=False, label=name_fea)
f, ax = plt.subplots(figsize=(12, 8))
plot_notnull_distributions(train_df, area_chars)
ax.set(title='Distribution of area', ylabel='counts')
ax.set_yscale('log')
ax.set_xlabel("Square meters")
_ = ax.legend()
f, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=np.log1p(train_df[area_chars]), scale='count')
#sns.swarmplot(data=train_df[internal_chars], color="w", alpha=.2)
#ax.set_yscale('log')
ax.set_ylabel('log1p(square meters)')
_ = plt.title("Distributions of square meters (log1p scale)")
#sns.violinplot(x="square meters", y="price_doc_log10", data=train_df.drop(ind), inner="box")
# sns.swarmplot(x="state", y="price_doc_log10", data=train_df.dropna(), color="w", alpha=.2);
#ax.set(title='Log10 of median price by state of home', xlabel='state', ylabel='log10(price)')
corrmat = train_df[area_chars].corr()
f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation='90')
_ = sns.heatmap(corrmat, square=True, linewidths=.5, annot=True)
counts_rooms = train_df['num_room'].value_counts()
## Counts of rooms
f, ax = plt.subplots(figsize=(12, 8))
#plt.xticks(rotation='90')
sns.barplot(x=counts_rooms.index, y=counts_rooms)
_ = ax.set(title='Counts of rooms', ylabel='counts')
train_df.loc[train_df['num_room'] == 0, 'num_room'] = np.nan
train_df.loc[(train_df['max_floor'] < train_df['floor']), ['max_floor', 'floor']].head()
## Impute max_floor with the floor value when the floor is higher than the recorded max_floor
logi = (train_df['max_floor'] < train_df['floor'])
train_df.loc[logi, 'max_floor'] = train_df.loc[logi, 'floor']
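A quick check could confirm that no transaction is left with a floor above the maximum floor:
## Should be 0 after the imputation above
(train_df['max_floor'] < train_df['floor']).sum()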
## Material counts
material_counts = train_df['material'].value_counts().sort_values(ascending=False)
material_counts.index = [str(int(e)) for e in material_counts.index]
material_counts['null'] = sum(train_df['material'].isnull())
## Plot of counts
f, ax = plt.subplots(figsize=(12, 3))
sns.barplot(x=material_counts, y=material_counts.index, orient='h',
palette=sns.color_palette("GnBu_d"))
_ = ax.set(title='Material counts', xlabel='counts')
for p in ax.patches:
    width = p.get_width()
    plt.text(50 + p.get_width(), p.get_y() + 0.55 * p.get_height(),
             '{}'.format(int(width)),
             ha='left', va='center')
_ = ax.set_xlim([0, max(material_counts)+1000])
materials = train_df['material'].fillna('null').apply(lambda x: str(int(x)) if x != 'null' else x)
materials = pd.concat([materials, train_df['price_doc_log']], axis=1)
f, ax = plt.subplots(figsize=(12, 8))
_ = sns.violinplot(x='material', y='price_doc_log', data=materials, scale='count')
## Product counts
product_counts = train_df['product_type'].value_counts().sort_values(ascending=False)
## Plot of counts
f, ax = plt.subplots(figsize=(12, 2))
sns.barplot(x=product_counts, y=product_counts.index, orient='h',
palette=sns.color_palette("GnBu_d"))
_ = ax.set(title='Product type counts', xlabel='counts')
for p in ax.patches:
    width = p.get_width()
    plt.text(-150 + p.get_width(), p.get_y() + 0.55 * p.get_height(),
             '{}'.format(int(width)),
             ha='right', va='center')
products = train_df['product_type'].fillna('null')
products = pd.concat([products, train_df['price_doc_log']], axis=1)
f, ax = plt.subplots(figsize=(12, 8))
_ = sns.violinplot(x='product_type', y='price_doc_log', data=products, scale='count')
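As a numeric complement to the violin plot, the median log price per product type could be computed directly:
## Median log price per product type
products.groupby('product_type')['price_doc_log'].median()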
corrmat = train_df[internal_chars].corr()
f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation='90')
_ = sns.heatmap(corrmat, square=True, linewidths=.5, annot=True)
corrmat = corrmat.iloc[:-1, :]
## Plot of the correlations with the price
# Create a diverging red-green colormap
cmap = sns.diverging_palette(0, 150, as_cmap=True)
#cmap = sns.color_palette("RdYlGn", 11, as_cmap=True)
# Normalize the correlation values to the interval [-1, 1]
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
# Map each feature to a color according to its correlation with the price
colors = {}
for cfea, cval in zip(corrmat.index, corrmat.iloc[:, -1]):
    if cfea == 'price_doc':
        continue
    colors.update({cfea: cmap(norm(cval))})
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(x=corrmat['price_doc'], y=corrmat.index, orient='h', palette=colors)
_ = ax.set(title='Correlations with price', xlabel='correlation value')
_ = ax.set_xlim(-1, 1)
# remove the legend, because we want to set a colorbar instead
#plt.gca().legend_.remove()
## create colorbar ##
divider = make_axes_locatable(plt.gca())
ax_cb = divider.new_horizontal(size="5%", pad=0.15)
fig.add_axes(ax_cb)
cb1 = matplotlib.colorbar.ColorbarBase(ax_cb, cmap=cmap, norm=norm,
orientation='vertical')
cb1.set_label('Correlation')
plt.show()
g = sns.PairGrid(train_df[internal_chars], diag_sharey=False)
#g = sns.pairplot(train_df[internal_chars], diag_kind="kde")
g.map_diag(sns.kdeplot, lw=3)
g.map_upper(plt.scatter)
g.map_lower(plt.scatter)
#g.map_lower(sns.kdeplot, cmap="Blues_d")
for ax in g.axes.flat:
    plt.setp(ax.get_xticklabels(), rotation=45)
## Region counts
region_counts = train_df['sub_area'].value_counts().sort_values(ascending=False)
## Plot of counts
f, ax = plt.subplots(figsize=(12, 22))
sns.barplot(x=region_counts, y=region_counts.index, orient='h')
_ = ax.set(title='Counts by region', xlabel='counts')
years = mdates.YearLocator() # every year
yearsFmt = mdates.DateFormatter('%Y')
ts_vc_train = train_df['timestamp'].value_counts()
## Plot
fig, ax = plt.subplots(figsize=(12, 6))
plt.bar(ts_vc_train.index, ts_vc_train)
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)
_ = ax.set(title='Number of transactions by day', ylabel='count')
# Smooth the data counts
idx = pd.date_range(ts_vc_train.index.min(), ts_vc_train.index.max())
ts_vc_train.index = pd.DatetimeIndex(ts_vc_train.index)
ts_vc_train = ts_vc_train.reindex(idx, fill_value=0)
ts_vc_train_smooth = ts_vc_train.rolling(window=7, win_type='triang', center=True).mean()
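The centered triangular window spreads each daily count over its neighbouring days with decreasing weights; a toy example (hypothetical values) could illustrate the effect:
## A single spike of 7 transactions gets spread over its neighbours (edges stay NaN without a full window)
toy = pd.Series([0, 0, 0, 7, 0, 0, 0, 0, 0, 0])
toy.rolling(window=7, win_type='triang', center=True).mean()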
## Treatment of data
years_u = np.unique(ts_vc_train_smooth.index.year)
logi = [ts_vc_train_smooth.index.year == y_u for y_u in years_u]
ts_vc_train_years = [ts_vc_train_smooth[l] for l in logi]
indices_years = [np.array(ts_vc_train_smooth.index[logi[i]] - datetime.datetime(years_u[i], 1, 1)) for i in range(len(years_u))]
# Reference year
days = pd.date_range(datetime.datetime(2004, 1, 1), datetime.datetime(2004, 12, 31))
days_diff = np.array(days - datetime.datetime(2004, 1, 1)).astype(int)
days_bool = [d in [1, 15] for d in days.day]
days_loc = [days_diff[i] for i in range(len(days_diff)) if days_bool[i]]
days_ticks = days[days_bool]
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for dates, data, year in zip(indices_years, ts_vc_train_years, years_u):
    plt.plot(dates, data, label=year)
_ = ax.set_xlim([0, int(np.hstack(indices_years).max())])
_ = plt.xticks(days_loc, days_ticks.strftime('%b-%d'), rotation=30)
_ = ax.set(title='Number of transactions per day', ylabel='count')
_ = ax.legend()
# Use the raw daily counts (a window of 1 applies no smoothing)
ts_vc_train_smooth = ts_vc_train.rolling(window=1, win_type='triang', center=True).mean()
## Treatment of data
years_u = np.unique(ts_vc_train_smooth.index.year)
logi = [ts_vc_train_smooth.index.year == y_u for y_u in years_u]
ts_vc_train_years = [ts_vc_train_smooth[l] for l in logi]
# Collapse by day of the week
for i in range(len(ts_vc_train_years)):
    days_w = ts_vc_train_years[i].index.weekday
    collapse2days = {}
    for day_i in range(7):
        collapse2days[day_i] = ts_vc_train_years[i][days_w == day_i].mean()
    ts_vc_train_years[i] = pd.Series(collapse2days)
# Reference year
days_loc = range(7)
days_ticks = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for data, year in zip(ts_vc_train_years, years_u):
    plt.plot(data, label=year)
_ = ax.set_xlim([0, 6])
_ = plt.xticks(days_loc, days_ticks, rotation=30)
_ = ax.set(title='Mean count of transactions per day', ylabel='count')
_ = ax.legend()
## Study of the prices by the building year
ind = train_df[(train_df['build_year'] <= 1691) | (train_df['build_year'] >= 2018)].index
by_df = train_df.drop(ind).sort_values(by=['build_year'])
by_price = by_df.groupby('build_year')[['build_year', 'price_doc']].mean()
f, ax = plt.subplots(figsize=(17, 6))
sns.regplot(x="build_year", y="price_doc", data=by_price, scatter=False, order=3, truncate=True)
plt.plot(by_price['build_year'], by_price['price_doc'], color='r')
ax.set(title='Mean price by year of build')
_ = ax.set_xlim(1860, 2018)
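The means for the earliest years rely on very few transactions, so they are noisy; the number of sales behind each yearly mean could be inspected like this:
## Number of transactions per build year (oldest years first)
by_df['build_year'].value_counts().sort_index().head(10)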
## Study of the prices by the transaction date
fig, ax = plt.subplots(figsize=(17, 6))
ts_df = train_df.groupby('timestamp')[['price_doc']].mean().reset_index()
ts_df['timestamp'] = ts_df['timestamp'].apply(lambda x: mdates.date2num(x))
sns.regplot(x="timestamp", y="price_doc", data=ts_df, scatter=False, order=3, truncate=True, ci=100)
plt.plot(ts_df['timestamp'], ts_df['price_doc'], color='r', )
sns.tsplot(data=ts_df, time='timestamp', value='price_doc', ax=ax, interpolate=True)
# assign locator and formatter for the xaxis ticks.
ax.xaxis.set_major_locator(mdates.AutoDateLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y.%m.%d'))
# put the labels at 45deg since they tend to be too long
fig.autofmt_xdate()
# Axis
_ = ax.set(title='Daily mean price over time')
_ = ax.set_xlim(ts_df['timestamp'].min(), ts_df['timestamp'].max())
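A coarser, e.g. monthly, aggregation could give a less noisy view of the same trend:
## Monthly mean price over time
monthly_price = train_df.set_index('timestamp')['price_doc'].resample('M').mean()
_ = monthly_price.plot(figsize=(17, 6), title='Monthly mean price')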
macro_na = (macro_df.isnull().sum() / len(macro_df)) * 100
macro_na = macro_na.drop(macro_na[macro_na == 0].index).sort_values(ascending=False)
## Proportion of missing values per variable
f, ax = plt.subplots(figsize=(18, 6))
plt.xticks(rotation='90')
sns.barplot(x=macro_na.index, y=macro_na)
_ = ax.set(title='Percent missing data by feature', ylabel='% missing')
corrmat = macro_df.corr()
#f, ax = plt.subplots(figsize=(10, 7))
#plt.xticks(rotation='90')
#_ = sns.heatmap(corrmat, square=True, linewidths=.5, annot=False)
corrmat = corrmat.iloc[:-1, :]
## Plot of the correlations with the log price
# Create a diverging red-green colormap
cmap = sns.diverging_palette(0, 150, as_cmap=True)
#cmap = sns.color_palette("RdYlGn", 11, as_cmap=True)
# Normalize the correlation values to the interval [-1, 1]
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
# Map each macro feature to a color according to its correlation with the log price
colors = {}
for cfea, cval in zip(corrmat.index, corrmat.iloc[:, -1]):
    if cfea == 'price_doc_log':
        continue
    colors.update({cfea: cmap(norm(cval))})
fig, ax = plt.subplots(figsize=(12, 22))
sns.barplot(x=corrmat['price_doc_log'], y=corrmat.index, orient='h', palette=colors)
_ = ax.set(title='Correlations with the log price', xlabel='correlation value')
_ = ax.set_xlim(-1, 1)
# remove the legend, because we want to set a colorbar instead
#plt.gca().legend_.remove()
## create colorbar ##
divider = make_axes_locatable(plt.gca())
ax_cb = divider.new_horizontal(size="5%", pad=0.15)
fig.add_axes(ax_cb)
cb1 = matplotlib.colorbar.ColorbarBase(ax_cb, cmap=cmap, norm=norm,
orientation='vertical')
cb1.set_label('Correlation')
plt.show()
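To complement the bar plot, the macro features with the strongest absolute correlation to the daily mean log price could be listed directly:
## Macro features most correlated (in absolute value) with the log price
corrmat['price_doc_log'].abs().sort_values(ascending=False).head(10)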