Exploratory Data Analysis

2017 May, 25

Exploratory Data Analysis of the Data from the Sberbank Kaggle Competition. This kernel is based in the A Very Extensive Exploratory Analysis in Python notebook.

Its pupose is to familiarize with the data and prepare to preprocess the data and to decide the machine learning algorithms to use.

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from mpl_toolkits.axes_grid1 import make_axes_locatable
import datetime

color = sns.color_palette()

%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500)

Training Data

In [2]:
train_df = pd.read_csv("../input/train.csv", parse_dates=['timestamp'])
train_df['price_doc_log'] = np.log1p(train_df['price_doc'])
macro_df = pd.read_csv("../input/macro.csv", parse_dates=['timestamp'])
macro_df['price_doc_log'] = np.log1p(train_df['price_doc'])
In [3]:
train_df.shape
Out[3]:
(30471, 293)
In [4]:
train_df.head()
Out[4]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km catering_km ecology green_part_500 prom_part_500 office_count_500 office_sqm_500 trc_count_500 trc_sqm_500 cafe_count_500 cafe_sum_500_min_price_avg cafe_sum_500_max_price_avg cafe_avg_price_500 cafe_count_500_na_price cafe_count_500_price_500 cafe_count_500_price_1000 cafe_count_500_price_1500 cafe_count_500_price_2500 cafe_count_500_price_4000 cafe_count_500_price_high big_church_count_500 church_count_500 mosque_count_500 leisure_count_500 sport_count_500 market_count_500 green_part_1000 prom_part_1000 office_count_1000 office_sqm_1000 trc_count_1000 trc_sqm_1000 cafe_count_1000 cafe_sum_1000_min_price_avg cafe_sum_1000_max_price_avg cafe_avg_price_1000 cafe_count_1000_na_price cafe_count_1000_price_500 cafe_count_1000_price_1000 cafe_count_1000_price_1500 cafe_count_1000_price_2500 cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc price_doc_log
0 1 2011-08-20 43 27.0 4.0 NaN NaN NaN NaN NaN NaN Investment Bibirevo 6.407578e+06 155572 0.189727 0.000070 9576 5001.0 5 10309 11065.0 5 0 240.0 1 0 7 3 no 0 16 1 no no no no no no no no 86206 40477 45729 21154 11007 10147 98207 52277 45930 36211 10580 25631 9576 4899 4677 10309 5463 4846 23603 12286 11317 17508 9425 8083 18654 9709 8945 211.0 25.0 0.0 0.0 0.0 2.0 184.0 0.0 0.0 0.0 211.0 0.0 0.0 0.0 206.0 5.0 1 2.590241 1.131260 13.575119 1.131260 0.145700 0.177975 2.158587 0.600973 1.080934 23.683460 1.804127 3.633334 5.419893 65.038716 1.0 5.419893 6.905893 1 0.274985 3.299822 0.992631 no 1.422391 10.918587 13.100618 13.675657 15.156211 1.422391 1 no 3.830951 5 1.305159 no 14.231961 101 24.292406 1 18.152338 5.718519 1.210027 1.062513 5.814135 4.308127 10.814172 1.676258 0.485841 3.065047 1.107594 8.148591 3.516513 2.392353 4.248036 0.974743 6.715026 0.884350 0.648488 0.637189 0.947962 0.177975 0.625783 0.628187 3.932040 14.053047 7.389498 7.023705 0.516838 good 0.00 0.00 0 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 0 0 1 0 7.36 0.00 1 30500 3 55600 19 527.78 888.89 708.33 1 10 4 3 1 0 0 1 2 0 0 6 1 14.27 6.92 3 39554 9 171420 34 566.67 969.70 768.18 1 14 11 6 2 0 0 1 2 0 0 7 1 11.77 15.97 9 188854 19 1244891 36 614.29 1042.86 828.57 1 15 11 6 2 1 0 1 2 0 0 10 1 11.98 13.55 12 251554 23 1419204 68 639.68 1079.37 859.52 5 21 22 16 3 1 0 2 4 0 0 21 1 13.09 13.31 29 807385 52 4036616 152 708.57 1185.71 947.14 12 39 48 40 9 4 0 13 22 1 0 52 4 5850000 15.581952
1 2 2011-08-23 34 19.0 3.0 NaN NaN NaN NaN NaN NaN Investment Nagatinskij Zaton 9.589337e+06 115352 0.372602 0.049637 6880 3119.0 5 7759 6237.0 8 0 229.0 1 0 6 1 yes 1 3 0 no no no no no no no no 76284 34200 42084 15727 7925 7802 70194 35622 34572 29431 9266 20165 6880 3466 3414 7759 3909 3850 17700 8998 8702 15164 7571 7593 13729 6929 6800 245.0 83.0 1.0 0.0 67.0 4.0 90.0 0.0 0.0 0.0 244.0 1.0 1.0 143.0 84.0 15.0 2 0.936700 0.647337 7.620630 0.635053 0.147754 0.273345 0.550690 0.065321 0.966479 1.317476 4.655004 8.648587 3.411993 40.943917 2.0 3.641773 4.679745 2 0.065263 0.783160 0.698081 no 9.503405 3.103996 6.444333 8.132640 8.698054 2.887377 2 no 3.103996 4 0.694536 no 9.242586 32 5.706113 2 9.034642 3.489954 2.724295 1.246149 3.419574 0.725560 6.910568 3.424716 0.668364 2.000154 8.972823 6.127073 1.161579 2.543747 12.649879 1.477723 1.852560 0.686252 0.519311 0.688796 1.072315 0.273345 0.967821 0.471447 4.841544 6.829889 0.709260 2.358840 0.230287 excellent 25.14 0.00 0 0 0 0 5 860.00 1500.00 1180.00 0 1 3 0 0 1 0 0 1 0 0 0 0 26.66 0.07 2 86600 5 94065 13 615.38 1076.92 846.15 0 5 6 1 0 1 0 1 2 0 4 2 0 21.53 7.71 3 102910 7 127065 17 694.12 1205.88 950.00 0 6 7 1 2 1 0 1 5 0 4 9 0 22.37 19.25 4 165510 8 179065 21 695.24 1190.48 942.86 0 7 8 3 2 1 0 1 5 0 4 11 0 18.07 27.32 12 821986 14 491565 30 631.03 1086.21 858.62 1 11 11 4 2 1 0 1 7 0 6 19 1 10.26 27.47 66 2690465 40 2034942 177 673.81 1148.81 911.31 9 49 65 36 15 3 0 15 29 1 10 66 14 6000000 15.607270
2 3 2011-08-27 43 29.0 2.0 NaN NaN NaN NaN NaN NaN Investment Tekstil'shhiki 4.808270e+06 101708 0.112560 0.118537 5879 1463.0 4 6207 5580.0 7 0 1183.0 1 0 5 1 no 0 0 1 no no no yes no no no no 101982 46076 55906 13028 6835 6193 63388 31813 31575 25292 7609 17683 5879 3095 2784 6207 3269 2938 14884 7821 7063 19401 9045 10356 11252 5916 5336 330.0 59.0 0.0 0.0 206.0 4.0 60.0 0.0 1.0 0.0 330.0 1.0 0.0 246.0 63.0 20.0 3 2.120999 1.637996 17.351515 1.445960 0.049102 0.158072 0.374848 0.453172 0.939275 4.912660 3.381083 11.996480 1.277658 15.331896 3.0 1.277658 1.701420 3 0.328756 3.945073 0.468265 no 5.604800 2.927487 6.963403 8.054252 9.067885 0.647250 3 no 2.927487 4 0.700691 no 9.540544 5 6.710302 3 5.777394 7.506612 0.772216 1.602183 3.682455 3.562188 5.752368 1.375443 0.733101 1.239304 1.978517 0.767569 1.952771 0.621357 7.682303 0.097144 0.841254 1.510089 1.486533 1.543049 0.391957 0.158072 3.178751 0.755946 7.922152 4.273200 3.156423 4.958214 0.190462 poor 1.67 0.00 0 0 0 0 3 666.67 1166.67 916.67 0 0 2 1 0 0 0 0 0 0 0 0 0 4.99 0.29 0 0 0 0 9 642.86 1142.86 892.86 2 0 5 2 0 0 0 0 1 0 0 5 3 9.92 6.73 0 0 1 2600 14 516.67 916.67 716.67 2 4 6 2 0 0 0 0 4 0 0 6 5 12.99 12.75 4 100200 7 52550 24 563.64 977.27 770.45 2 8 9 4 1 0 0 0 4 0 0 8 5 12.14 26.46 8 110856 7 52550 41 697.44 1192.31 944.87 2 9 17 9 3 1 0 0 11 0 0 20 6 13.69 21.58 43 1478160 35 1572990 122 702.68 1196.43 949.55 10 29 45 25 10 3 0 11 27 0 4 67 10 5700000 15.555977
3 4 2011-09-01 89 50.0 9.0 NaN NaN NaN NaN NaN NaN Investment Mitino 1.258354e+07 178473 0.194703 0.069753 13087 6839.0 9 13670 17063.0 10 0 NaN 1 0 17 6 no 0 11 4 no no no no no no no no 21155 9828 11327 28563 14680 13883 120381 60040 60341 29529 9083 20446 13087 6645 6442 13670 7126 6544 32063 16513 15550 3292 1450 1842 24934 12782 12152 458.0 9.0 51.0 12.0 124.0 50.0 201.0 0.0 9.0 2.0 459.0 13.0 24.0 40.0 130.0 252.0 4 1.489049 0.984537 11.565624 0.963802 0.179441 0.236455 0.078090 0.106125 0.451173 15.623710 2.017080 14.317640 4.291432 51.497190 4.0 3.816045 5.271136 4 0.131597 1.579164 1.200336 no 2.677824 14.606501 17.457198 18.309433 19.487005 2.677824 1 no 2.780449 17 1.999265 no 17.478380 83 6.734618 1 27.667863 9.522538 6.348716 1.767612 11.178333 0.583025 27.892717 0.811275 0.623484 1.950317 6.483172 7.385521 4.923843 3.549558 8.789894 2.163735 10.903161 0.622272 0.599914 0.934273 0.892674 0.236455 1.031777 1.561505 15.300449 16.990677 16.041521 5.029696 0.465820 good 17.36 0.57 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 2 0 0 0 0 0 0 0 0 0 19.25 10.35 1 11000 6 80780 12 658.33 1083.33 870.83 0 3 4 5 0 0 0 0 0 0 0 3 1 28.38 6.57 2 11000 7 89492 23 673.91 1130.43 902.17 0 5 9 8 1 0 0 1 0 0 0 9 2 32.29 5.73 2 11000 7 89492 25 660.00 1120.00 890.00 0 5 11 8 1 0 0 1 1 0 0 13 2 20.79 3.57 4 167000 12 205756 32 718.75 1218.75 968.75 0 5 14 10 3 0 0 1 2 0 0 18 3 14.18 3.89 8 244166 22 942180 61 931.58 1552.63 1242.11 4 7 21 15 11 2 1 4 4 0 0 26 3 13100000 16.388123
4 5 2011-09-05 77 77.0 4.0 NaN NaN NaN NaN NaN NaN Investment Basmannoe 8.398461e+06 108171 0.015234 0.037316 5706 3240.0 7 6748 7770.0 9 0 562.0 4 2 25 2 no 0 10 93 no no no yes yes no no no 28179 13522 14657 13368 7159 6209 68043 34236 33807 26760 8563 18197 5706 2982 2724 6748 3664 3084 15237 8113 7124 5164 2583 2581 11631 6223 5408 746.0 48.0 0.0 0.0 643.0 16.0 35.0 0.0 3.0 1.0 746.0 371.0 114.0 146.0 62.0 53.0 5 1.257186 0.876620 8.266305 0.688859 0.247901 0.376838 0.258289 0.236214 0.392871 10.683540 2.936581 11.903910 0.853960 10.247521 5.0 1.595898 2.156284 113 0.071480 0.857764 0.820294 no 11.616653 1.721834 0.046810 0.787593 2.578671 1.721834 4 no 3.133531 10 0.084113 yes 1.595898 113 1.423428 4 6.515857 8.671016 1.638318 3.632640 4.587917 2.609420 9.155057 1.969738 0.220288 2.544696 3.975401 3.610754 0.307915 1.864637 3.779781 1.121703 0.991683 0.892668 0.429052 0.077901 0.810801 0.376838 0.378756 0.121681 2.584370 1.112486 1.800125 1.339652 0.026102 excellent 3.56 4.44 15 293699 1 45000 48 702.22 1166.67 934.44 3 17 10 11 7 0 0 1 4 0 2 3 0 3.34 8.29 46 420952 3 158200 153 763.45 1272.41 1017.93 8 39 45 39 19 2 1 7 12 0 6 7 0 4.12 4.83 93 1195735 9 445900 272 766.80 1272.73 1019.76 19 70 74 72 30 6 1 18 30 0 10 14 2 4.53 5.02 149 1625130 17 564843 483 765.93 1269.23 1017.58 28 130 129 131 50 14 1 35 61 0 17 21 3 5.06 8.62 305 3420907 60 2296870 1068 853.03 1410.45 1131.74 63 266 267 262 149 57 4 70 121 1 40 77 5 8.38 10.92 689 8404624 114 3503058 2283 853.88 1411.45 1132.66 143 566 578 552 319 108 17 135 236 2 91 195 14 16331452 16.608603
In [5]:
macro_df.shape
Out[5]:
(2484, 101)
In [6]:
macro_df.head()
Out[6]:
timestamp oil_urals gdp_quart gdp_quart_growth cpi ppi gdp_deflator balance_trade balance_trade_growth usdrub eurrub brent net_capital_export gdp_annual gdp_annual_growth average_provision_of_build_contract average_provision_of_build_contract_moscow rts micex micex_rgbi_tr micex_cbi_tr deposits_value deposits_growth deposits_rate mortgage_value mortgage_growth mortgage_rate grp grp_growth income_per_cap real_dispos_income_per_cap_growth salary salary_growth fixed_basket retail_trade_turnover retail_trade_turnover_per_cap retail_trade_turnover_growth labor_force unemployment employment invest_fixed_capital_per_cap invest_fixed_assets profitable_enterpr_share unprofitable_enterpr_share share_own_revenues overdue_wages_per_cap fin_res_per_cap marriages_per_1000_cap divorce_rate construction_value invest_fixed_assets_phys pop_natural_increase pop_migration pop_total_inc childbirth mortality housing_fund_sqm lodging_sqm_per_cap water_pipes_share baths_share sewerage_share gas_share hot_water_share electric_stove_share heating_share old_house_share average_life_exp infant_mortarity_per_1000_cap perinatal_mort_per_1000_cap incidence_population rent_price_4+room_bus rent_price_3room_bus rent_price_2room_bus rent_price_1room_bus rent_price_3room_eco rent_price_2room_eco rent_price_1room_eco load_of_teachers_preschool_per_teacher child_on_acc_pre_school load_of_teachers_school_per_teacher students_state_oneshift modern_education_share old_education_build_share provision_doctors provision_nurse load_on_doctors power_clinics hospital_beds_available_per_cap hospital_bed_occupancy_per_year provision_retail_space_sqm provision_retail_space_modern_sqm turnover_catering_per_cap theaters_viewers_per_1000_cap seats_theather_rfmin_per_100000_cap museum_visitis_per_100_cap bandwidth_sports population_reg_sports_share students_reg_sports_share apartment_build apartment_fund_sqm price_doc_log
0 2010-01-01 76.1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 38807.2 -0.078209 5.0 NaN NaN NaN NaN NaN 7484970 NaN NaN 142968 NaN 13.72 8375.8638 NaN 30789.2 0.038 38410.5 NaN 11443.63 2882.4169 251.484 106.8 6436.244 0.018 0.69 63932.0 732.760604 0.716 0.284 0.962915 12865.0 233.351529 8.0 3.9 553874.9 95.8 -0.3 14.1 13.8 10.7 11.0 216.0 18.715883 99.9 99.8 99.6 44.4 95.7 55.0 99.9 0.3 74.2 6.1 5.87 696.6 NaN NaN NaN NaN NaN NaN NaN 721.477765 45,713 1356.112607 NaN NaN NaN 18.0 99.4 7872.848285 162.9 NaN NaN NaN 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN 15.581952
1 2010-01-02 76.1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 38807.2 -0.078209 5.0 NaN NaN NaN NaN 175.15 7484970 NaN NaN 142968 NaN 13.72 8375.8638 NaN 30789.2 0.038 38410.5 NaN 11443.63 2882.4169 251.484 106.8 6436.244 0.018 0.69 63932.0 732.760604 0.716 0.284 0.962915 12865.0 233.351529 8.0 3.9 553874.9 95.8 -0.3 14.1 13.8 10.7 11.0 216.0 18.715883 99.9 99.8 99.6 44.4 95.7 55.0 99.9 0.3 74.2 6.1 5.87 696.6 NaN NaN NaN NaN NaN NaN NaN 721.477765 45,713 1356.112607 NaN NaN NaN 18.0 99.4 7872.848285 162.9 NaN NaN NaN 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN 15.607270
2 2010-01-03 76.1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 38807.2 -0.078209 5.0 NaN NaN NaN NaN 178.66 7484970 NaN NaN 142968 NaN 13.72 8375.8638 NaN 30789.2 0.038 38410.5 NaN 11443.63 2882.4169 251.484 106.8 6436.244 0.018 0.69 63932.0 732.760604 0.716 0.284 0.962915 12865.0 233.351529 8.0 3.9 553874.9 95.8 -0.3 14.1 13.8 10.7 11.0 216.0 18.715883 99.9 99.8 99.6 44.4 95.7 55.0 99.9 0.3 74.2 6.1 5.87 696.6 NaN NaN NaN NaN NaN NaN NaN 721.477765 45,713 1356.112607 NaN NaN NaN 18.0 99.4 7872.848285 162.9 NaN NaN NaN 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN 15.555977
3 2010-01-04 76.1 NaN NaN NaN NaN NaN NaN NaN 29.905 43.4054 80.12 NaN 38807.2 -0.078209 5.0 NaN NaN NaN NaN 183.44 7484970 NaN NaN 142968 NaN 13.72 8375.8638 NaN 30789.2 0.038 38410.5 NaN 11443.63 2882.4169 251.484 106.8 6436.244 0.018 0.69 63932.0 732.760604 0.716 0.284 0.962915 12865.0 233.351529 8.0 3.9 553874.9 95.8 -0.3 14.1 13.8 10.7 11.0 216.0 18.715883 99.9 99.8 99.6 44.4 95.7 55.0 99.9 0.3 74.2 6.1 5.87 696.6 NaN NaN NaN NaN NaN NaN NaN 721.477765 45,713 1356.112607 NaN NaN NaN 18.0 99.4 7872.848285 162.9 NaN NaN NaN 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN 16.388123
4 2010-01-05 76.1 NaN NaN NaN NaN NaN NaN NaN 29.836 42.9600 80.59 NaN 38807.2 -0.078209 5.0 NaN NaN NaN NaN 183.44 7484970 NaN NaN 142968 NaN 13.72 8375.8638 NaN 30789.2 0.038 38410.5 NaN 11443.63 2882.4169 251.484 106.8 6436.244 0.018 0.69 63932.0 732.760604 0.716 0.284 0.962915 12865.0 233.351529 8.0 3.9 553874.9 95.8 -0.3 14.1 13.8 10.7 11.0 216.0 18.715883 99.9 99.8 99.6 44.4 95.7 55.0 99.9 0.3 74.2 6.1 5.87 696.6 NaN NaN NaN NaN NaN NaN NaN 721.477765 45,713 1356.112607 NaN NaN NaN 18.0 99.4 7872.848285 162.9 NaN NaN NaN 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN 16.608603

Missing Data

In [7]:
train_na = (train_df.isnull().sum() / len(train_df)) * 100
train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False)
## Proportion of missing values per variable
f, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=train_na.index, y=train_na)
_ = ax.set(title='Percent missing data by feature', ylabel='% missing')

Price

In [8]:
{'null values': sum(train_df['price_doc'].isnull()), 'zero values': sum(train_df['price_doc'] == 0)}
Out[8]:
{'null values': 0, 'zero values': 0}
In [9]:
train_df['price_doc'].describe()
Out[9]:
count    3.047100e+04
mean     7.123035e+06
std      4.780111e+06
min      1.000000e+05
25%      4.740002e+06
50%      6.274411e+06
75%      8.300000e+06
max      1.111111e+08
Name: price_doc, dtype: float64
In [10]:
train_df['price_doc_log'].describe()
Out[10]:
count    30471.000000
mean        15.609483
std          0.604574
min         11.512935
25%         15.371548
50%         15.651990
75%         15.931766
max         18.526041
Name: price_doc_log, dtype: float64
In [11]:
## Distribution of the price
f, ax = plt.subplots(figsize=(12, 8))
#logi = np.logical_not((train_df['price_doc'].isnull()) | (train_df['price_doc'] == 0))
nonnulbuildyears = train_df.loc[:, 'price_doc']
sns.distplot(a=nonnulbuildyears, bins=100, kde=False)
ax.set(title='Distribution of log price', ylabel='counts')
ax.set_yscale('log')
ax.set_xscale('log')
In [12]:
## Distribution of the price
f, ax = plt.subplots(figsize=(12, 8))
#logi = np.logical_not((train_df['price_doc'].isnull()) | (train_df['price_doc'] == 0))
nonnulbuildyears = train_df.loc[:, 'price_doc_log']
#sns.distplot(a=nonnulbuildyears, bins=100, kde=False)
sns.distplot(a=nonnulbuildyears, bins=100, kde=True)
_ = ax.set(title='Distribution of log price', ylabel='counts')
#ax.set_yscale('log')
#ax.set_xscale('log')

Housing internal characteristics

Studying the prices of the house considering the instrinsec characteristics of the property.

In [13]:
## Studying the internal characteristics
internal_chars = ['full_sq', 'life_sq', 'floor', 'max_floor', 'build_year', 'num_room', 'kitch_sq', 'state', 'price_doc']

train_df[internal_chars].head()
Out[13]:
full_sq life_sq floor max_floor build_year num_room kitch_sq state price_doc
0 43 27.0 4.0 NaN NaN NaN NaN NaN 5850000
1 34 19.0 3.0 NaN NaN NaN NaN NaN 6000000
2 43 29.0 2.0 NaN NaN NaN NaN NaN 5700000
3 89 50.0 9.0 NaN NaN NaN NaN NaN 13100000
4 77 77.0 4.0 NaN NaN NaN NaN NaN 16331452
In [14]:
## Proportion of missing values per variable
train_na = (train_df[internal_chars].isnull().sum() / len(train_df)) * 100
train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False)
f, ax = plt.subplots(figsize=(12, 8))
plt.xticks(rotation='90')
sns.barplot(x=train_na.index, y=train_na)
_ = ax.set(title='Percent missing data by feature', ylabel='% missing')

Univariant study

Data quality issues for each variable and study of the distribution of each variable values.

State

In [15]:
# state should be discrete valued between 1 and 4.
train_df['state'].value_counts()
Out[15]:
2.0     5844
3.0     5790
1.0     4855
4.0      422
33.0       1
Name: state, dtype: int64
In [16]:
# There is a 33 in it that is cleary a data entry error
# Lets just replace it with the mode.
train_df.loc[train_df['state'] == 33, 'state'] = train_df['state'].mode()
In [17]:
f, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(x='state', y='price_doc_log', data=train_df, scale='count')
#sns.swarmplot(data=train_df[internal_chars], color="w", alpha=.2)
#ax.set_yscale('log')
ax.set_ylabel('Logaritmic prices')
_ = plt.title("Distributions of sq prices per states of the building")
#sns.violinplot(x="square meters", y="price_doc

Build year

In [18]:
# Study of the build year (easy to identify some errors)
train_df['build_year'].value_counts()
Out[18]:
2014.0        919
2015.0        824
0.0           530
2013.0        464
1970.0        418
1969.0        407
1968.0        389
1967.0        384
1965.0        378
2016.0        375
1.0           368
1972.0        360
1974.0        357
1971.0        352
1966.0        348
1960.0        344
1962.0        338
1973.0        333
1963.0        325
1964.0        315
1975.0        309
1961.0        297
1976.0        263
1977.0        260
2006.0        242
1979.0        236
1978.0        235
2008.0        234
2012.0        233
1980.0        226
             ... 
1912.0          5
1947.0          4
1949.0          3
1914.0          3
1924.0          3
1941.0          2
3.0             2
1943.0          2
1860.0          2
1896.0          2
1946.0          2
1907.0          2
1900.0          2
1895.0          1
1691.0          1
20052009.0      1
1911.0          1
2018.0          1
215.0           1
1886.0          1
1905.0          1
4965.0          1
1906.0          1
1920.0          1
1925.0          1
1948.0          1
1904.0          1
71.0            1
20.0            1
1876.0          1
Name: build_year, dtype: int64
In [19]:
# There are some of them that has non logical values
# build_year has an erronus value 20052009. Since its unclear which it should be, let's replace with 2007
train_df.loc[train_df['build_year'] == 20052009, 'build_year'] = 2007
train_df.loc[train_df['build_year'] < 1500, 'build_year'] = np.nan
train_df.loc[train_df['build_year'] > 2018, 'build_year'] = np.nan
In [20]:
_ = train_df['build_year'].hist(bins=100)
In [21]:
## Proportion of missing values per variable
## Probably there are two other problematic values
f, ax = plt.subplots(figsize=(12, 8))
logi = np.logical_not((train_df['build_year'].isnull()) | (train_df['build_year'] == 0))
nonnulbuildyears = train_df.loc[logi, 'build_year']
sns.distplot(a=nonnulbuildyears, bins=100, kde=False)
#sns.distplot(a=nonnulbuildyears, bins=100, kde=True)
ax.set(title='Temporal distribution of building years', ylabel='counts')
ax.set_yscale('log')
ax.set_xlim(1650, 2018)
Out[21]:
(1650, 2018)

Flat areas distribution

In [22]:
area_chars = ['full_sq', 'life_sq', 'kitch_sq']

# Imputation for the outliers
#train_df[internal_chars].loc[train_df[internal_chars] == 0].head()
train_df.loc[train_df['full_sq'] > 2000, 'full_sq'] = train_df['full_sq'].mean()
train_df.loc[train_df['kitch_sq'] > train_df['full_sq'], 'kitch_sq'] = np.nan
train_df.loc[train_df['kitch_sq'] > train_df['life_sq'], 'kitch_sq'] = np.nan
train_df.loc[train_df['life_sq'] > train_df['full_sq'], 'life_sq'] = np.nan

def plot_notnull_distributions(train_df, name_feats):
    for name_fea in name_feats:
        logi = np.logical_not((train_df[name_fea].isnull()) | (train_df[name_fea] == 0))
        nonnullvals = train_df.loc[logi, name_fea]
        sns.distplot(a=nonnullvals, bins=100, kde=False, label=name_fea)

f, ax = plt.subplots(figsize=(12, 8))
plot_notnull_distributions(train_df, area_chars)
ax.set(title='Distribution of area', ylabel='counts')
ax.set_yscale('log')
ax.set_xlabel("Square meters")
_ = ax.legend()
In [23]:
f, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=np.log1p(train_df[area_chars]), scale='count')
#sns.swarmplot(data=train_df[internal_chars], color="w", alpha=.2)
#ax.set_yscale('log')
ax.set_ylabel('Logaritmic sqm')
_ = plt.title("Distributions of sq meters")
#sns.violinplot(x="square meters", y="price_doc_log10", data=train_df.drop(ind), inner="box")
# sns.swarmplot(x="state", y="price_doc_log10", data=train_df.dropna(), color="w", alpha=.2);
#ax.set(title='Log10 of median price by state of home', xlabel='state', ylabel='log10(price)')
In [24]:
corrmat = train_df[area_chars].corr()
f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation='90')
_ = sns.heatmap(corrmat, square=True, linewidths=.5, annot=True)

Num. rooms

In [25]:
counts_rooms = train_df['num_room'].value_counts()
## Counts of rooms
f, ax = plt.subplots(figsize=(12, 8))
#plt.xticks(rotation='90')
sns.barplot(x=counts_rooms.index, y=counts_rooms)
_ = ax.set(title='Counts of rooms', ylabel='counts')


train_df.loc[train_df['num_room'] == 0, 'num_room'] = np.nan

Floors

In [26]:
train_df.loc[(train_df['max_floor'] < train_df['floor']), ['max_floor', 'floor']].head()
Out[26]:
max_floor floor
8216 0.0 13.0
8268 1.0 3.0
8499 0.0 2.0
8531 0.0 7.0
8912 0.0 5.0
In [27]:
## Imputation of the maximum floor for the ones higher than the max floor
logi = (train_df['max_floor'] < train_df['floor'])
train_df.loc[logi, 'max_floor'] = train_df.loc[logi, 'floor']

Material

In [28]:
## Material counts
material_counts = train_df['material'].value_counts().sort_values(ascending=False)
material_counts.index = [str(int(e)) for e in material_counts.index]
material_counts['null'] = sum(train_df['material'].isnull())

## Plot of counts
f, ax = plt.subplots(figsize=(12, 3))
sns.barplot(x=material_counts, y=material_counts.index, orient='h',
            palette=sns.color_palette("GnBu_d"))
_ = ax.set(title='Material counts', xlabel='counts')

for p in ax.patches:
    width = p.get_width()
    plt.text(50+p.get_width(), p.get_y()+0.55*p.get_height(),
             '{}'.format(int(width)),
             ha='left', va='center')
_ = ax.set_xlim([0, max(material_counts)+1000])
In [29]:
materials = train_df['material'].fillna('null').apply(lambda x: str(int(x)) if x != 'null' else x)
materials = pd.concat([materials, train_df['price_doc_log']], axis=1)

f, ax = plt.subplots(figsize=(12, 8))
_ = sns.violinplot(x='material', y='price_doc_log', data=materials, scale='count')

Product type

In [30]:
## Product counts
product_counts = train_df['product_type'].value_counts().sort_values(ascending=False)

## Plot of counts
f, ax = plt.subplots(figsize=(12, 2))
sns.barplot(x=product_counts, y=product_counts.index, orient='h',
            palette=sns.color_palette("GnBu_d"))
_ = ax.set(title='Product type counts', xlabel='counts')

for p in ax.patches:
    width = p.get_width()
    plt.text(-150+p.get_width(), p.get_y()+0.55*p.get_height(),
             '{}'.format(int(width)),
             ha='right', va='center')
In [31]:
products = train_df['product_type'].fillna('null')
products = pd.concat([products, train_df['price_doc_log']], axis=1)

f, ax = plt.subplots(figsize=(12, 8))
_ = sns.violinplot(x='product_type', y='price_doc_log', data=products, scale='count')

Pair variables study

In [32]:
corrmat = train_df[internal_chars].corr()
f, ax = plt.subplots(figsize=(10, 7))
plt.xticks(rotation='90')
_ = sns.heatmap(corrmat, square=True, linewidths=.5, annot=True)
In [33]:
corrmat = corrmat.iloc[:-1, :]

## Plot of counts
#Create a matplotlib colormap from the sns seagreen color palette
cmap = sns.diverging_palette(0, 150, as_cmap=True)
#cmap = sns.color_palette("RdYlGn", 11, as_cmap=True)
# Normalize to the range of possible values from df["c"]
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
# create a color dictionary (value in c : color from colormap) 
colors = {}
for cfea, cval in zip(corrmat.index, corrmat.iloc[:, -1]):
    if cfea == 'price_doc':
        continue
    colors.update({cfea : cmap(norm(cval))})

fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(x=corrmat['price_doc'], y=corrmat.index, orient='h', palette=colors)
_ = ax.set(title='Correlations with price', xlabel='correlation value')
_ = ax.set_xlim(-1, 1)

# remove the legend, because we want to set a colorbar instead
#plt.gca().legend_.remove()

## create colorbar ##
divider = make_axes_locatable(plt.gca())
ax_cb = divider.new_horizontal(size="5%", pad=0.15)
fig.add_axes(ax_cb)
cb1 = matplotlib.colorbar.ColorbarBase(ax_cb, cmap=cmap, norm=norm,
                                       orientation='vertical')
cb1.set_label('Correlation')
plt.show()
In [34]:
f, ax = plt.subplots()
g = sns.PairGrid(train_df[internal_chars], diag_sharey=False)
#g = sns.pairplot(train_df[internal_chars], diag_kind="kde")
g.map_diag(sns.kdeplot, lw=3)
g.map_upper(plt.scatter)
g.map_lower(plt.scatter)
#g.map_lower(sns.kdeplot, cmap="Blues_d")
for ax in g.axes.flat:  
    plt.setp(ax.get_xticklabels(), rotation=45)
/opt/conda/lib/python3.6/site-packages/matplotlib/axes/_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "
/opt/conda/lib/python3.6/site-packages/statsmodels/nonparametric/kde.py:454: RuntimeWarning: invalid value encountered in greater
  X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns.
/opt/conda/lib/python3.6/site-packages/statsmodels/nonparametric/kde.py:454: RuntimeWarning: invalid value encountered in less
  X = X[np.logical_and(X>clip[0], X<clip[1])] # won't work for two columns.

Region

In [35]:
## Region counts
region_counts = train_df['sub_area'].value_counts().sort_values(ascending=False)

## Plot of counts
f, ax = plt.subplots(figsize=(12, 22))
sns.barplot(x=region_counts, y=region_counts.index, orient='h')
_ = ax.set(title='Counts by region', xlabel='counts')

Temporal

In [36]:
years = mdates.YearLocator()   # every year
yearsFmt = mdates.DateFormatter('%Y')
ts_vc_train = train_df['timestamp'].value_counts()

## Plot
fig, ax = plt.subplots(figsize=(12, 6))
plt.bar(left=ts_vc_train.index, height=ts_vc_train)
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)
_ = ax.set(title='Number of transactions by day', ylabel='count')
In [37]:
# Smooth the data counts
idx = pd.date_range(ts_vc_train.index.min(), ts_vc_train.index.max())
ts_vc_train.index = pd.DatetimeIndex(ts_vc_train.index)
ts_vc_train = ts_vc_train.reindex(idx, fill_value=0)
ts_vc_train_smooth = ts_vc_train.rolling(window=7, win_type='triang', center=True).mean()
In [45]:
## Treatment of data
years_u = np.unique(ts_vc_train_smooth.index.year)
logi = [ts_vc_train_smooth.index.year == y_u for y_u in years_u]
ts_vc_train_years = [ts_vc_train_smooth[l] for l in logi]
indices_years = [np.array(ts_vc_train_smooth.index[logi[i]] - datetime.datetime(years_u[i], 1, 1)) for i in range(len(years_u))]

# Reference year
days = pd.date_range(datetime.datetime(2004, 1, 1), datetime.datetime(2004, 12, 31))
days_diff = np.array(days - datetime.datetime(2004, 1, 1)).astype(int)
days_bool = [d in [1, 15] for d in days.day]
days_loc = [days_diff[i] for i in range(len(days_diff)) if days_bool[i]]
days_ticks = days[days_bool]

## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for dates, data, year in zip(indices_years, ts_vc_train_years, years_u):
    plt.plot(dates, data, label=year)
_ = ax.set_xlim([0, int(np.hstack(indices_years).max())])
_ = plt.xticks(days_loc, days_ticks.strftime('%b-%d'), rotation=30)

_ = ax.set(title='Number of transactions per day', ylabel='count')
_ = ax.legend()
In [57]:
# Smooth the data counts
ts_vc_train_smooth = ts_vc_train.rolling(window=1, win_type='triang', center=True).mean()

## Treatment of data
years_u = np.unique(ts_vc_train_smooth.index.year)
logi = [ts_vc_train_smooth.index.year == y_u for y_u in years_u]
ts_vc_train_years = [ts_vc_train_smooth[l] for l in logi]
# Collapse by day of the week
for i in range(len(ts_vc_train_years)):
    days_w = ts_vc_train_years[i].index.weekday
    collapse2days = {}
    for day_i in range(7):
        collapse2days[day_i] = ts_vc_train_years[i][days_w == day_i].mean()
    ts_vc_train_years[i] = pd.Series(collapse2days)

# Reference year
days_loc = range(7)
days_ticks = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for data, year in zip(ts_vc_train_years, years_u):
    plt.plot(data, label=year)
_ = ax.set_xlim([0, 6])
_ = plt.xticks(days_loc, days_ticks, rotation=30)

_ = ax.set(title='Mean count of transactions per day', ylabel='count')
_ = ax.legend()
In [40]:
# Smooth the data counts
idx = pd.date_range(ts_vc_train.index.min(), ts_vc_train.index.max())
ts_vc_train.index = pd.DatetimeIndex(ts_vc_train.index)
ts_vc_train = ts_vc_train.reindex(idx, fill_value=0)
ts_vc_train = ts_vc_train.rolling(window=7, win_type='triang', center=True).mean()
ts_vc_train

## Treatment of data
years_u = np.unique(ts_vc_train.index.year)
logi = [ts_vc_train.index.year == y_u for y_u in years_u]
ts_vc_train_years = [ts_vc_train[l] for l in logi]
indices_years = [np.array(ts_vc_train.index[logi[i]] - datetime.datetime(years_u[i], 1, 1)) for i in range(len(years_u))]

# Reference year
days = pd.date_range(datetime.datetime(2004, 1, 1), datetime.datetime(2004, 12, 31))
days_diff = np.array(days - datetime.datetime(2004, 1, 1)).astype(int)
days_bool = [d in [1, 15] for d in days.day]
days_loc = [days_diff[i] for i in range(len(days_diff)) if days_bool[i]]
days_ticks = days[days_bool]

## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for dates, data, year in zip(indices_years, ts_vc_train_years, years_u):
    plt.plot(dates, data, label=year)
_ = ax.set_xlim([0, int(np.hstack(indices_years).max())])
_ = plt.xticks(days_loc, days_ticks.strftime('%b-%d'), rotation=30)

_ = ax.set(title='Number of transactions per day', ylabel='count')
_ = ax.legend()
In [41]:
## Study of the prices by the building year
ind = train_df[(train_df['build_year'] <= 1691) | (train_df['build_year'] >= 2018)].index
by_df = train_df.drop(ind).sort_values(by=['build_year'])
by_price = by_df.groupby('build_year')[['build_year', 'price_doc']].mean()

f, ax = plt.subplots(figsize=(17, 6))
sns.regplot(x="build_year", y="price_doc", data=by_price, scatter=False, order=3, truncate=True)
plt.plot(by_price['build_year'], by_price['price_doc'], color='r')
ax.set(title='Mean price by year of build')
_ = ax.set_xlim(1860, 2018)
In [42]:
## Study of the prices by the transaction date
fig, ax = plt.subplots(figsize=(17, 6))
ts_df = train_df.groupby('timestamp')[['price_doc']].mean().reset_index(level=1)
ts_df['timestamp'] = pd.to_datetime(ts_df['timestamp'], format='%Y-%b-%d').apply(lambda x: mdates.date2num(x))

sns.regplot(x="timestamp", y="price_doc", data=ts_df, scatter=False, order=3, truncate=True, ci=100)
plt.plot(ts_df['timestamp'], ts_df['price_doc'], color='r', )
sns.tsplot(data=ts_df, time='timestamp', value='price_doc', ax=ax, interpolate=True)

# assign locator and formatter for the xaxis ticks.
ax.xaxis.set_major_locator(mdates.AutoDateLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y.%m.%d'))

# put the labels at 45deg since they tend to be too long
fig.autofmt_xdate()

# Axis
_ = ax.set(title='Daily median price over time')
_ = ax.set_xlim(ts_df['timestamp'].min(), ts_df['timestamp'].max())
In [ ]:
 

Macro data

Missing data

In [43]:
macro_na = (macro_df.isnull().sum() / len(macro_df)) * 100
macro_na = macro_na.drop(macro_na[macro_na == 0].index).sort_values(ascending=False)
## Proportion of missing values per variable
f, ax = plt.subplots(figsize=(18, 6))
plt.xticks(rotation='90')
sns.barplot(x=macro_na.index, y=macro_na)
_ = ax.set(title='Percent missing data by feature', ylabel='% missing')
In [58]:
corrmat = macro_df.corr()
#f, ax = plt.subplots(figsize=(10, 7))
#plt.xticks(rotation='90')
#_ = sns.heatmap(corrmat, square=True, linewidths=.5, annot=False)
corrmat = corrmat.iloc[:-1, :]

## Plot of counts
#Create a matplotlib colormap from the sns seagreen color palette
cmap = sns.diverging_palette(0, 150, as_cmap=True)
#cmap = sns.color_palette("RdYlGn", 11, as_cmap=True)
# Normalize to the range of possible values from df["c"]
norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
# create a color dictionary (value in c : color from colormap) 
colors = {}
for cfea, cval in zip(corrmat.index, corrmat.iloc[:, -1]):
    if cfea == 'price_doc_log':
        continue
    colors.update({cfea : cmap(norm(cval))})

fig, ax = plt.subplots(figsize=(12, 22))
sns.barplot(x=corrmat['price_doc_log'], y=corrmat.index, orient='h', palette=colors)
_ = ax.set(title='Correlations with logprice', xlabel='correlation value')
_ = ax.set_xlim(-1, 1)

# remove the legend, because we want to set a colorbar instead
#plt.gca().legend_.remove()

## create colorbar ##
divider = make_axes_locatable(plt.gca())
ax_cb = divider.new_horizontal(size="5%", pad=0.15)
fig.add_axes(ax_cb)
cb1 = matplotlib.colorbar.ColorbarBase(ax_cb, cmap=cmap, norm=norm,
                                       orientation='vertical')
cb1.set_label('Correlation')
plt.show()