Data Scientist Challenge

In this coding challenge, you will analyze the real Blood Glucose variability and the iPhone-detected motion activities of somobody. He's not a diabetic, but you never know. You will develop some Python functions and modules, and some very basic statistical analysis. The main point of this notebook is to show some basic coding skills, and basic elementary statistical analysis.

Input data

You have two files available:

  • blood-glucose.csv is a CSV file with each line reporting a timestamp and a blood glucose level (expressed in mg/dL), measured every 15 minutes. Please note that some values are missing.
  • motion.tsv is a TSV file with each line reporting a timestamp and 5 boolean values. Each of these boolean values refer to a type of activity, respectively 'stationary', 'walking', 'running', 'automotive' and 'cycling'. These must be interpreted in this way: when the user starts to walk, a line with 1 in the walking position is reported; when the user stops walking, a line with 0 in that position is reported. The whole period in between should be interpreted as walking. The same goes for the other activities.

Consider that it is possible to have multiple activities at the same time (for example, stationary and automotive: think of being on a bus), and it's also possible to have no activities recognized at a given time (meaning we don't know).

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from ts_tools import get_ts_range, get_times_by_interval, get_floor_range, get_ceil_range
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
In [2]:
## Functions definition
import datetime
import numpy as np


def get_ts_range(init_dt, end_dt, ts_range=900):
    time_ranges = []
    i = 0
    while (init_dt+i*datetime.timedelta(seconds=ts_range)) <= end_dt:
        time_ranges.append(init_dt+i*datetime.timedelta(seconds=ts_range))
        i += 1
    return time_ranges


def get_times_by_interval(dt_ini, dt_end):
    # Compute floor and ceil interval limits
    init = datetime.datetime(dt_ini.year, dt_ini.month, dt_ini.day, dt_ini.hour, dt_ini.minute - (dt_ini.minute % 15), 0)
    endt = datetime.datetime(dt_end.year, dt_end.month, dt_end.day, dt_end.hour, dt_end.minute - (dt_end.minute % 15), 0)
    endt = endt + datetime.timedelta(seconds=900)
    
    # Assign times
    interval_ranges = get_ts_range(init, endt, 900)
    times = np.zeros(len(interval_ranges))
    times[0] = (dt_ini - interval_ranges[0]).total_seconds()
    times[-1] = (interval_ranges[-1] - dt_end).total_seconds()
    times[1:-1] = 900
    
    return interval_ranges, times
In [3]:
blood_glucose = pd.read_csv('input/blood-glucose.csv', header=None, names=['times', 'blood_glucose'])
blood_glucose = blood_glucose[~ pd.isnull(blood_glucose['blood_glucose'])]
blood_glucose['times'] = blood_glucose['times'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S+00:00'))
In [4]:
blood_glucose.head()
Out[4]:
times blood_glucose
42 2017-05-23 10:30:00 76.0
43 2017-05-23 10:45:00 73.0
44 2017-05-23 11:00:00 80.0
45 2017-05-23 11:15:00 92.0
46 2017-05-23 11:30:00 100.0
In [5]:
motion = pd.read_csv('input/motion.tsv', sep='\t', names=['times', 'stationary', 'walking', 'running', 'automotive', 'cycling'], header=None)
motion['times'] = motion['times'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f+00:00'))
In [6]:
motion.tail()
Out[6]:
times stationary walking running automotive cycling
13117 2017-06-02 23:52:16.418372 0 0 0 0 0
13118 2017-06-02 23:54:18.267440 1 0 0 0 0
13119 2017-06-02 23:54:21.136673 0 0 0 0 0
13120 2017-06-02 23:57:19.966934 0 1 0 0 0
13121 2017-06-02 23:57:25.666652 0 0 0 0 0
In [7]:
# Get time ranges of the whole data
init_dt = datetime.datetime(2017, 5, 23, 0, 0)
end_dt = datetime.datetime(2017, 6, 3, 0, 0)
time_ranges = get_ts_range(init_dt, end_dt, 900)
In [8]:
times = list(motion['times'])
times_intervals = np.zeros((len(time_ranges), 5))
col_activities = ['stationary', 'walking', 'running', 'automotive', 'cycling']

for i_col, col in enumerate(col_activities):
    idxs = np.where(np.diff(motion[col].as_matrix()) == -1)[0]
    # For each interval of activity
    for i in idxs:
        dt_ini = times[i]
        dt_end = times[i+1]
        # Fill times activity
        interval_ranges, times_intervals_range = get_times_by_interval(dt_ini, dt_end)
        
        i_init = time_ranges.index(interval_ranges[0])
        for i_i in range(len(times_intervals_range)):
            times_intervals[i_init + i_i, i_col] += times_intervals_range[i_i]
In [9]:
times_activities_48h = time_ranges[192:]
times_intervals_48h = np.zeros((len(times_intervals)-192, 5))
std_intervals_48h = np.zeros((len(times_intervals)-192, 5))

for i in range(len(times_intervals)-192):
    times_intervals_48h[i] = times_intervals[i:i+192].sum(0)
    std_intervals_48h[i] = times_intervals[i:i+192].std(0)
In [10]:
# Compute and interpolate blood glucose
init_time = get_floor_range(blood_glucose.times.iloc[0], 15)
endt_time = get_ceil_range(blood_glucose.times.iloc[-1], 15)
time_ranges_blood = get_ts_range(init_time, endt_time, 900)

blood_glucose_interp = np.zeros(len(time_ranges_blood))
blood_times = list(blood_glucose.times)
for i_time in range(len(blood_times)):
    try:
        idx = time_ranges_blood.index(blood_times[i_time])
        blood_glucose_interp[idx] = blood_glucose.blood_glucose.iloc[i_time]
    except:
        pass

idxs = np.where(blood_glucose_interp)[0]
# TOImprove: better interpolation
blood_glucose_interp = np.interp(range(len(blood_glucose_interp)), idxs, blood_glucose_interp[idxs])
In [11]:
# Compute the std of the blood glucose level on the last 48h
blood_glucose_std_48h = np.zeros(len(blood_glucose_interp)-192)
for i in range(len(blood_glucose_interp)-192):
    blood_glucose_std_48h[i] = blood_glucose_interp[i:i+192].std(0)
time_ranges_blood_48h = time_ranges_blood[192:]
In [12]:
# Compute std blood glucose
walking_time_48h = np.zeros(len(time_ranges_blood_48h))
for i_time in range(len(times_intervals_48h)):
    try:
        idx = time_ranges_blood_48h.index(times_48h[i_time])
        walking_time_48h[idx] = times_intervals_48h[i_time, 1]
    except:
        pass
    
times_48h = time_ranges_blood_48h
In [13]:
walking_time_at_interval = np.zeros(len(times_48h))
for i_time in range(len(walking_time_at_interval)):
    try:
        idx = times_48h.index(time_ranges[i_time])
        walking_time_at_interval[idx] = times_intervals[i_time, 1]
    except:
        pass

Plots of computed values

In [14]:
## Plot
fig, ax = plt.subplots(figsize=(18, 10))

for i_col in range(5):
    plt.plot(times_activities_48h, times_intervals_48h[:, i_col], label=col_activities[i_col])

_ = plt.legend()
In [15]:
## Plot
fig, ax = plt.subplots(figsize=(18, 10))

for i_col in range(5):
    plt.plot(times_activities_48h, times_intervals_48h[:, i_col], label=col_activities[i_col])

ax.set_yscale('log')
_ = plt.legend()
In [16]:
## Plot
fig, ax = plt.subplots(figsize=(18, 10))

for i_col in range(5):
    plt.plot(times_activities_48h, std_intervals_48h[:, i_col], label=col_activities[i_col])

_ = plt.legend()
In [17]:
## Plot
fig, ax = plt.subplots(figsize=(18, 10))

for i_col in range(5):
    plt.plot(times_activities_48h, std_intervals_48h[:, i_col], label=col_activities[i_col])

ax.set_yscale('log')
_ = plt.legend()
In [18]:
## Plot
fig, ax = plt.subplots(figsize=(18, 10))

_ = plt.plot(times_48h, walking_time_at_interval)

Correlation

In [19]:
np.correlate(walking_time_48h, blood_glucose_std_48h)[0]
Out[19]:
0.0
In [20]:
## Plot
fig, ax = plt.subplots(figsize=(18, 10))


diff_walking = np.diff(walking_time_48h)
diff_blood_glucose = np.diff(blood_glucose_std_48h)

plt.plot(times_48h[:-1], diff_walking/np.std(diff_walking), label="Walking time in the last 48h")
plt.plot(times_48h[:-1], diff_blood_glucose/np.std(diff_blood_glucose), label="std of blood glucose levels of the last 48h")

_ = plt.legend()
/opt/conda/lib/python3.6/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in true_divide
In [21]:
## Plot
fig, ax = plt.subplots(figsize=(18, 10))

_ = plt.plot(walking_time_48h, blood_glucose_std_48h, '.')
In [22]:
## Plot
fig, ax = plt.subplots(figsize=(18, 10))

over_logi = blood_glucose_std_48h > 12.5
under_logi = np.logical_not(over_logi)

_ = plt.plot(walking_time_48h[under_logi], blood_glucose_std_48h[under_logi], '.')
_ = plt.plot(walking_time_48h[over_logi], blood_glucose_std_48h[over_logi], '.')

_ = plt.plot(np.linspace(walking_time_48h.min(), walking_time_48h.max(), 100), 12.5*np.ones(100), linestyle='dashed')
In [23]:
## Plot of correlation of walking in that time
fig, ax = plt.subplots(figsize=(18, 10))

walking_time_at_interval

over_logi = blood_glucose_std_48h > 12.5
under_logi = np.logical_not(over_logi)

_ = plt.plot(walking_time_at_interval[under_logi], blood_glucose_std_48h[under_logi], '.')
_ = plt.plot(walking_time_at_interval[over_logi], blood_glucose_std_48h[over_logi], '.')

_ = plt.plot(np.linspace(walking_time_at_interval.min(), walking_time_at_interval.max(), 100), 12.5*np.ones(100), linestyle='dashed')

Prediction

In [24]:
## Dummy model
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True)
param_grid = { 
    'penalty': ['l2'],
    'C': [0.1, 1., 10.]
}

CV_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
CV_model.fit(walking_time_48h.reshape(-1, 1), blood_glucose_std_48h>14.5)
Out[24]:
GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l2'], 'C': [0.1, 1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
In [25]:
CV_model.best_score_
Out[25]:
0.89281210592686
In [26]:
CV_model.predict_proba([[100000]])
Out[26]:
array([[ 0.86895632,  0.13104368]])
In [27]:
CV_model.predict_proba([[350000]])
Out[27]:
array([[ 0.86895632,  0.13104368]])
In [28]:
## Store trained model
joblib.dump(CV_model, 'model_blood_glucose.pkl')
Out[28]:
['model_blood_glucose.pkl']
In [29]:
### Prediction using walking times in each moment
# Dummy model
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True)
param_grid = { 
    'penalty': ['l2'],
    'C': [0.1, 1., 10.]
}

CV_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
CV_model.fit(walking_time_at_interval.reshape(-1, 1), blood_glucose_std_48h>14.5)
Out[29]:
GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l2'], 'C': [0.1, 1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
In [30]:
CV_model.best_score_
Out[30]:
0.89281210592686
In [31]:
CV_model.predict_proba([[8000]])
Out[31]:
array([[ 0.91783258,  0.08216742]])
In [32]:
CV_model.predict_proba([[0]])
Out[32]:
array([[ 0.86444132,  0.13555868]])
In [33]:
## Store trained model
joblib.dump(CV_model, 'model_interval_blood_glucose.pkl') 
Out[33]:
['model_interval_blood_glucose.pkl']