May 25, 2017
Exploratory Data Analysis of the data from the Sberbank Kaggle competition. This kernel is based on the A Very Extensive Exploratory Analysis in Python notebook.
Its purpose is to get familiar with the data, to prepare its preprocessing, and to decide which machine learning algorithms to use.
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable
import datetime
color = sns.color_palette()
%matplotlib inline
pd.options.mode.chained_assignment = None # default='warn'
pd.set_option('display.max_columns', 500)
train_df = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
train_df['price_doc_log'] = np.log1p(train_df['price_doc'])
macro_df = pd.read_csv("../input/macro.csv", parse_dates=['timestamp'])
# Attach the daily mean log price from the training set to the macro table by date,
# so that the macro correlations computed below compare values from the same day.
daily_price_log = train_df.groupby('timestamp')['price_doc_log'].mean()
macro_df['price_doc_log'] = macro_df['timestamp'].map(daily_price_log)
train_df.shape
train_df.head()
macro_df.shape
macro_df.head()
train_na = (train_df.isnull().sum() / len(train_df)) * 100
train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False)
## Proportion of missing values per variable
f, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=train_na.index, y=train_na)
_ = ax.set(title='Percent missing data by feature', ylabel='% missing')
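The worst offenders can also be listed numerically; for instance:
## Features with the highest proportion of missing values
train_na.head(10)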
{'null values': sum(train_df['price_doc'].isnull()), 'zero values': sum(train_df['price_doc'] == 0)}
train_df['price_doc'].describe()
train_df['price_doc_log'].describe()
## Distribution of the price
f, ax = plt.subplots(figsize=(12, 8))
#logi = np.logical_not((train_df['price_doc'].isnull()) | (train_df['price_doc'] == 0))
prices = train_df.loc[:, 'price_doc']
sns.distplot(a=prices, bins=100, kde=False)
ax.set(title='Distribution of price (log-log scale)', ylabel='counts')
ax.set_yscale('log')
ax.set_xscale('log')
## Distribution of the log price
f, ax = plt.subplots(figsize=(12, 8))
#logi = np.logical_not((train_df['price_doc'].isnull()) | (train_df['price_doc'] == 0))
log_prices = train_df.loc[:, 'price_doc_log']
sns.distplot(a=log_prices, bins=100, kde=True)
_ = ax.set(title='Distribution of log price', ylabel='counts')
#ax.set_yscale('log')
#ax.set_xscale('log')
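The log transform is used to reduce the heavy right skew of the prices; a quick check could compare the skewness of both versions:
## Skewness before and after the log transform
{'price_doc': train_df['price_doc'].skew(), 'price_doc_log': train_df['price_doc_log'].skew()}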
Studying house prices in relation to the intrinsic characteristics of the property.
## Studying the internal characteristics
internal_chars = ['full_sq', 'life_sq', 'floor', 'max_floor', 'build_year', 'num_room', 'kitch_sq', 'state', 'price_doc']
train_df[internal_chars].head()
## Proportion of missing values per variable
train_na = (train_df[internal_chars].isnull().sum() / len(train_df)) * 100
train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False)
f, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=train_na.index, y=train_na)
_ = ax.set(title='Percent missing data by feature', ylabel='% missing')
Data quality issues for each variable, and study of the distribution of each variable's values.
# state should take discrete values between 1 and 4.
train_df['state'].value_counts()
# There is a 33 in it that is clearly a data entry error.
# Let's just replace it with the mode.
train_df.loc[train_df['state'] == 33, 'state'] = train_df['state'].mode()[0]
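A quick check could confirm that only the expected values remain:
## state should now only contain values 1 to 4 (plus missing values)
train_df['state'].value_counts(dropna=False)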
f, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(x='state', y='price_doc_log', data=train_df, scale='count')
#sns.swarmplot(data=train_df[internal_chars], color="w", alpha=.2)
#ax.set_yscale('log')
ax.set_ylabel('Log price')
_ = plt.title("Distribution of log prices by state of the building")
#sns.violinplot(x="square meters", y="price_doc
# Study of the build year (some errors are easy to spot)
train_df['build_year'].value_counts()
# Some of the values make no sense.
# build_year has an erroneous value of 20052009. Since it's unclear which year it should be, let's replace it with 2007.
train_df.loc[train_df['build_year'] == 20052009, 'build_year'] = 2007
train_df.loc[train_df['build_year'] < 1500, 'build_year'] = np.nan
train_df.loc[train_df['build_year'] > 2018, 'build_year'] = np.nan
_ = train_df['build_year'].hist(bins=100)
## Temporal distribution of the building years
## There are probably a couple of other problematic values left
f, ax = plt.subplots(figsize=(12, 8))
logi = np.logical_not((train_df['build_year'].isnull()) | (train_df['build_year'] == 0))
nonnulbuildyears = train_df.loc[logi, 'build_year']
sns.distplot(a=nonnulbuildyears, bins=100, kde=False)
#sns.distplot(a=nonnulbuildyears, bins=100, kde=True)
ax.set(title='Temporal distribution of building years', ylabel='counts')
ax.set_yscale('log')
ax.set_xlim(1650, 2018)
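After the cleaning, the remaining range of building years could be summarised, for instance:
## Summary of the cleaned build_year values
train_df['build_year'].describe()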
area_chars = ['full_sq', 'life_sq', 'kitch_sq']
# Imputation for the outliers
#train_df[internal_chars].loc[train_df[internal_chars] == 0].head()
train_df.loc[train_df['full_sq'] > 2000, 'full_sq'] = train_df['full_sq'].mean()
train_df.loc[train_df['kitch_sq'] > train_df['full_sq'], 'kitch_sq'] = np.nan
train_df.loc[train_df['kitch_sq'] > train_df['life_sq'], 'kitch_sq'] = np.nan
train_df.loc[train_df['life_sq'] > train_df['full_sq'], 'life_sq'] = np.nan
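A quick count could show how many area values are missing after applying these consistency rules:
## Missing values in the area features after the imputation rules above
train_df[area_chars].isnull().sum()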
def plot_notnull_distributions(train_df, name_feats):
    # Plot the distribution of the non-null, non-zero values of each feature
    for name_fea in name_feats:
        logi = np.logical_not((train_df[name_fea].isnull()) | (train_df[name_fea] == 0))
        nonnullvals = train_df.loc[logi, name_fea]
        sns.distplot(a=nonnullvals, bins=100, kde=False, label=name_fea)
f, ax = plt.subplots(figsize=(12, 8))
plot_notnull_distributions(train_df, area_chars)
ax.set(title='Distribution of area', ylabel='counts')
ax.set_yscale('log')
ax.set_xlabel("Square meters")
_ = ax.legend()
f, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=np.log1p(train_df[area_chars]), scale='count')
#sns.swarmplot(data=train_df[internal_chars], color="w", alpha=.2)
#ax.set_yscale('log')
ax.set_ylabel('log1p(square meters)')
_ = plt.title("Distributions of square meters (log1p scale)")
#sns.violinplot(x="square meters", y="price_doc_log10", data=train_df.drop(ind), inner="box")
# sns.swarmplot(x="state", y="price_doc_log10", data=train_df.dropna(), color="w", alpha=.2);
#ax.set(title='Log10 of median price by state of home', xlabel='state', ylabel='log10(price)')
corrmat = train_df[area_chars].corr()
f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation='90')
_ = sns.heatmap(corrmat, square=True, linewidths=.5, annot=True)
counts_rooms = train_df['num_room'].value_counts()
## Counts of rooms
f, ax = plt.subplots(figsize=(12, 8))
#plt.xticks(rotation='90')
sns.barplot(x=counts_rooms.index, y=counts_rooms)
_ = ax.set(title='Counts of rooms', ylabel='counts')
train_df.loc[train_df['num_room'] == 0, 'num_room'] = np.nan
train_df.loc[(train_df['max_floor'] < train_df['floor']), ['max_floor', 'floor']].head()
## Impute max_floor with the floor value when the floor is higher than the recorded max_floor
logi = (train_df['max_floor'] < train_df['floor'])
train_df.loc[logi, 'max_floor'] = train_df.loc[logi, 'floor']
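A quick check could confirm that no transaction is left with a floor above the maximum floor:
## Should be 0 after the imputation above
(train_df['max_floor'] < train_df['floor']).sum()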
## Material counts
material_counts = train_df['material'].value_counts().sort_values(ascending=False)
material_counts.index = [str(int(e)) for e in material_counts.index]
material_counts['null'] = sum(train_df['material'].isnull())
## Plot of counts
f, ax = plt.subplots(figsize=(12, 3))
sns.barplot(x=material_counts, y=material_counts.index, orient='h',
palette=sns.color_palette("GnBu_d"))
_ = ax.set(title='Material counts', xlabel='counts')
for p in ax.patches:
    width = p.get_width()
    plt.text(50 + p.get_width(), p.get_y() + 0.55 * p.get_height(),
             '{}'.format(int(width)),
             ha='left', va='center')
_ = ax.set_xlim([0, max(material_counts)+1000])
materials = train_df['material'].fillna('null').apply(lambda x: str(int(x)) if x != 'null' else x)
materials = pd.concat([materials, train_df['price_doc_log']], axis=1)
f, ax = plt.subplots(figsize=(12, 8))
_ = sns.violinplot(x='material', y='price_doc_log', data=materials, scale='count')
## Product counts
product_counts = train_df['product_type'].value_counts().sort_values(ascending=False)
## Plot of counts
f, ax = plt.subplots(figsize=(12, 2))
sns.barplot(x=product_counts, y=product_counts.index, orient='h',
palette=sns.color_palette("GnBu_d"))
_ = ax.set(title='Product type counts', xlabel='counts')
for p in ax.patches:
    width = p.get_width()
    plt.text(-150 + p.get_width(), p.get_y() + 0.55 * p.get_height(),
             '{}'.format(int(width)),
             ha='right', va='center')
products = train_df['product_type'].fillna('null')
products = pd.concat([products, train_df['price_doc_log']], axis=1)
f, ax = plt.subplots(figsize=(12, 8))
_ = sns.violinplot(x='product_type', y='price_doc_log', data=products, scale='count')
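As a numeric complement to the violin plot, the median log price per product type could be computed directly:
## Median log price per product type
products.groupby('product_type')['price_doc_log'].median()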
corrmat = train_df[internal_chars].corr()
f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation='90')
_ = sns.heatmap(corrmat, square=True, linewidths=.5, annot=True)
corrmat = corrmat.iloc[:-1, :]
## Plot of the correlations with the price
# Create a diverging red-green colormap
cmap = sns.diverging_palette(0, 150, as_cmap=True)
#cmap = sns.color_palette("RdYlGn", 11, as_cmap=True)
# Normalize the correlation values to the interval [-1, 1]
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
# Map each feature to a color according to its correlation with the price
colors = {}
for cfea, cval in zip(corrmat.index, corrmat.iloc[:, -1]):
    if cfea == 'price_doc':
        continue
    colors.update({cfea: cmap(norm(cval))})
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(x=corrmat['price_doc'], y=corrmat.index, orient='h', palette=colors)
_ = ax.set(title='Correlations with price', xlabel='correlation value')
_ = ax.set_xlim(-1, 1)
# remove the legend, because we want to set a colorbar instead
#plt.gca().legend_.remove()
## create colorbar ##
divider = make_axes_locatable(plt.gca())
ax_cb = divider.new_horizontal(size="5%", pad=0.15)
fig.add_axes(ax_cb)
cb1 = matplotlib.colorbar.ColorbarBase(ax_cb, cmap=cmap, norm=norm,
orientation='vertical')
cb1.set_label('Correlation')
plt.show()
g = sns.PairGrid(train_df[internal_chars], diag_sharey=False)
#g = sns.pairplot(train_df[internal_chars], diag_kind="kde")
g.map_diag(sns.kdeplot, lw=3)
g.map_upper(plt.scatter)
g.map_lower(plt.scatter)
#g.map_lower(sns.kdeplot, cmap="Blues_d")
for ax in g.axes.flat:
    plt.setp(ax.get_xticklabels(), rotation=45)
## Region counts
region_counts = train_df['sub_area'].value_counts().sort_values(ascending=False)
## Plot of counts
f, ax = plt.subplots(figsize=(12, 22))
sns.barplot(x=region_counts, y=region_counts.index, orient='h')
_ = ax.set(title='Counts by region', xlabel='counts')
years = mdates.YearLocator() # every year
yearsFmt = mdates.DateFormatter('%Y')
ts_vc_train = train_df['timestamp'].value_counts()
## Plot
fig, ax = plt.subplots(figsize=(12, 6))
plt.bar(ts_vc_train.index, ts_vc_train)
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)
_ = ax.set(title='Number of transactions by day', ylabel='count')
# Smooth the data counts
idx = pd.date_range(ts_vc_train.index.min(), ts_vc_train.index.max())
ts_vc_train.index = pd.DatetimeIndex(ts_vc_train.index)
ts_vc_train = ts_vc_train.reindex(idx, fill_value=0)
ts_vc_train_smooth = ts_vc_train.rolling(window=7, win_type='triang', center=True).mean()
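The centered triangular window spreads each daily count over its neighbouring days with decreasing weights; a toy example (hypothetical values) could illustrate the effect:
## A single spike of 7 transactions gets spread over its neighbours (edges stay NaN without a full window)
toy = pd.Series([0, 0, 0, 7, 0, 0, 0, 0, 0, 0])
toy.rolling(window=7, win_type='triang', center=True).mean()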
## Treatment of data
years_u = np.unique(ts_vc_train_smooth.index.year)
logi = [ts_vc_train_smooth.index.year == y_u for y_u in years_u]
ts_vc_train_years = [ts_vc_train_smooth[l] for l in logi]
indices_years = [np.array(ts_vc_train_smooth.index[logi[i]] - datetime.datetime(years_u[i], 1, 1)) for i in range(len(years_u))]
# Reference year
days = pd.date_range(datetime.datetime(2004, 1, 1), datetime.datetime(2004, 12, 31))
days_diff = np.array(days - datetime.datetime(2004, 1, 1)).astype(int)
days_bool = [d in [1, 15] for d in days.day]
days_loc = [days_diff[i] for i in range(len(days_diff)) if days_bool[i]]
days_ticks = days[days_bool]
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for dates, data, year in zip(indices_years, ts_vc_train_years, years_u):
    plt.plot(dates, data, label=year)
_ = ax.set_xlim([0, int(np.hstack(indices_years).max())])
_ = plt.xticks(days_loc, days_ticks.strftime('%b-%d'), rotation=30)
_ = ax.set(title='Number of transactions per day', ylabel='count')
_ = ax.legend()
# Use the raw daily counts (a window of 1 applies no smoothing)
ts_vc_train_smooth = ts_vc_train.rolling(window=1, win_type='triang', center=True).mean()
## Treatment of data
years_u = np.unique(ts_vc_train_smooth.index.year)
logi = [ts_vc_train_smooth.index.year == y_u for y_u in years_u]
ts_vc_train_years = [ts_vc_train_smooth[l] for l in logi]
# Collapse by day of the week
for i in range(len(ts_vc_train_years)):
    days_w = ts_vc_train_years[i].index.weekday
    collapse2days = {}
    for day_i in range(7):
        collapse2days[day_i] = ts_vc_train_years[i][days_w == day_i].mean()
    ts_vc_train_years[i] = pd.Series(collapse2days)
# Reference year
days_loc = range(7)
days_ticks = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for data, year in zip(ts_vc_train_years, years_u):
    plt.plot(data, label=year)
_ = ax.set_xlim([0, 6])
_ = plt.xticks(days_loc, days_ticks, rotation=30)
_ = ax.set(title='Mean count of transactions per day', ylabel='count')
_ = ax.legend()
## Study of the prices by the building year
ind = train_df[(train_df['build_year'] <= 1691) | (train_df['build_year'] >= 2018)].index
by_df = train_df.drop(ind).sort_values(by=['build_year'])
by_price = by_df.groupby('build_year')[['build_year', 'price_doc']].mean()
f, ax = plt.subplots(figsize=(17, 6))
sns.regplot(x="build_year", y="price_doc", data=by_price, scatter=False, order=3, truncate=True)
plt.plot(by_price['build_year'], by_price['price_doc'], color='r')
ax.set(title='Mean price by year of build')
_ = ax.set_xlim(1860, 2018)
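The means for the earliest years rely on very few transactions, so they are noisy; the number of sales behind each yearly mean could be inspected like this:
## Number of transactions per build year (oldest years first)
by_df['build_year'].value_counts().sort_index().head(10)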
## Study of the prices by the transaction date
fig, ax = plt.subplots(figsize=(17, 6))
ts_df = train_df.groupby('timestamp')[['price_doc']].mean().reset_index()
ts_df['timestamp'] = ts_df['timestamp'].apply(lambda x: mdates.date2num(x))
sns.regplot(x="timestamp", y="price_doc", data=ts_df, scatter=False, order=3, truncate=True, ci=100)
plt.plot(ts_df['timestamp'], ts_df['price_doc'], color='r', )
sns.tsplot(data=ts_df, time='timestamp', value='price_doc', ax=ax, interpolate=True)
# assign locator and formatter for the xaxis ticks.
ax.xaxis.set_major_locator(mdates.AutoDateLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y.%m.%d'))
# put the labels at 45deg since they tend to be too long
fig.autofmt_xdate()
# Axis
_ = ax.set(title='Daily mean price over time')
_ = ax.set_xlim(ts_df['timestamp'].min(), ts_df['timestamp'].max())
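A coarser, e.g. monthly, aggregation could give a less noisy view of the same trend:
## Monthly mean price over time
monthly_price = train_df.set_index('timestamp')['price_doc'].resample('M').mean()
_ = monthly_price.plot(figsize=(17, 6), title='Monthly mean price')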
macro_na = (macro_df.isnull().sum() / len(macro_df)) * 100
macro_na = macro_na.drop(macro_na[macro_na == 0].index).sort_values(ascending=False)
## Proportion of missing values per variable
f, ax = plt.subplots(figsize=(18, 6))
plt.xticks(rotation='90')
sns.barplot(x=macro_na.index, y=macro_na)
_ = ax.set(title='Percent missing data by feature', ylabel='% missing')
corrmat = macro_df.corr()
#f, ax = plt.subplots(figsize=(10, 7))
#plt.xticks(rotation='90')
#_ = sns.heatmap(corrmat, square=True, linewidths=.5, annot=False)
corrmat = corrmat.iloc[:-1, :]
## Plot of the correlations with the log price
# Create a diverging red-green colormap
cmap = sns.diverging_palette(0, 150, as_cmap=True)
#cmap = sns.color_palette("RdYlGn", 11, as_cmap=True)
# Normalize the correlation values to the interval [-1, 1]
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
# Map each macro feature to a color according to its correlation with the log price
colors = {}
for cfea, cval in zip(corrmat.index, corrmat.iloc[:, -1]):
    if cfea == 'price_doc_log':
        continue
    colors.update({cfea: cmap(norm(cval))})
fig, ax = plt.subplots(figsize=(12, 22))
sns.barplot(x=corrmat['price_doc_log'], y=corrmat.index, orient='h', palette=colors)
_ = ax.set(title='Correlations with the log price', xlabel='correlation value')
_ = ax.set_xlim(-1, 1)
# remove the legend, because we want to set a colorbar instead
#plt.gca().legend_.remove()
## create colorbar ##
divider = make_axes_locatable(plt.gca())
ax_cb = divider.new_horizontal(size="5%", pad=0.15)
fig.add_axes(ax_cb)
cb1 = matplotlib.colorbar.ColorbarBase(ax_cb, cmap=cmap, norm=norm,
orientation='vertical')
cb1.set_label('Correlation')
plt.show()
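To complement the bar plot, the macro features with the strongest absolute correlation to the daily mean log price could be listed directly:
## Macro features most correlated (in absolute value) with the log price
corrmat['price_doc_log'].abs().sort_values(ascending=False).head(10)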