In this coding challenge, you will analyze the real Blood Glucose variability and the iPhone-detected motion activities of somobody. He's not a diabetic, but you never know. You will develop some Python functions and modules, and some very basic statistical analysis. The main point of this notebook is to show some basic coding skills, and basic elementary statistical analysis.
You have two files available:
blood-glucose.csv
is a CSV file with each line reporting a timestamp and a blood glucose level
(expressed in mg/dL), measured every 15 minutes. Please note that some values are missing.motion.tsv
is a TSV file with each line reporting a timestamp and 5 boolean values. Each of these
boolean values refer to a type of activity, respectively 'stationary', 'walking', 'running', 'automotive' and
'cycling'. These must be interpreted in this way: when the user starts to walk, a line with 1 in the
walking position is reported; when the user stops walking, a line with 0 in that position is reported.
The whole period in between should be interpreted as walking. The same goes for the other activities.Consider that it is possible to have multiple activities at the same time (for example, stationary and automotive: think of being on a bus), and it's also possible to have no activities recognized at a given time (meaning we don't know).
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from ts_tools import get_ts_range, get_times_by_interval, get_floor_range, get_ceil_range
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
## Functions definition
import datetime
import numpy as np
def get_ts_range(init_dt, end_dt, ts_range=900):
time_ranges = []
i = 0
while (init_dt+i*datetime.timedelta(seconds=ts_range)) <= end_dt:
time_ranges.append(init_dt+i*datetime.timedelta(seconds=ts_range))
i += 1
return time_ranges
def get_times_by_interval(dt_ini, dt_end):
# Compute floor and ceil interval limits
init = datetime.datetime(dt_ini.year, dt_ini.month, dt_ini.day, dt_ini.hour, dt_ini.minute - (dt_ini.minute % 15), 0)
endt = datetime.datetime(dt_end.year, dt_end.month, dt_end.day, dt_end.hour, dt_end.minute - (dt_end.minute % 15), 0)
endt = endt + datetime.timedelta(seconds=900)
# Assign times
interval_ranges = get_ts_range(init, endt, 900)
times = np.zeros(len(interval_ranges))
times[0] = (dt_ini - interval_ranges[0]).total_seconds()
times[-1] = (interval_ranges[-1] - dt_end).total_seconds()
times[1:-1] = 900
return interval_ranges, times
blood_glucose = pd.read_csv('input/blood-glucose.csv', header=None, names=['times', 'blood_glucose'])
blood_glucose = blood_glucose[~ pd.isnull(blood_glucose['blood_glucose'])]
blood_glucose['times'] = blood_glucose['times'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S+00:00'))
blood_glucose.head()
motion = pd.read_csv('input/motion.tsv', sep='\t', names=['times', 'stationary', 'walking', 'running', 'automotive', 'cycling'], header=None)
motion['times'] = motion['times'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f+00:00'))
motion.tail()
# Get time ranges of the whole data
init_dt = datetime.datetime(2017, 5, 23, 0, 0)
end_dt = datetime.datetime(2017, 6, 3, 0, 0)
time_ranges = get_ts_range(init_dt, end_dt, 900)
times = list(motion['times'])
times_intervals = np.zeros((len(time_ranges), 5))
col_activities = ['stationary', 'walking', 'running', 'automotive', 'cycling']
for i_col, col in enumerate(col_activities):
idxs = np.where(np.diff(motion[col].as_matrix()) == -1)[0]
# For each interval of activity
for i in idxs:
dt_ini = times[i]
dt_end = times[i+1]
# Fill times activity
interval_ranges, times_intervals_range = get_times_by_interval(dt_ini, dt_end)
i_init = time_ranges.index(interval_ranges[0])
for i_i in range(len(times_intervals_range)):
times_intervals[i_init + i_i, i_col] += times_intervals_range[i_i]
times_activities_48h = time_ranges[192:]
times_intervals_48h = np.zeros((len(times_intervals)-192, 5))
std_intervals_48h = np.zeros((len(times_intervals)-192, 5))
for i in range(len(times_intervals)-192):
times_intervals_48h[i] = times_intervals[i:i+192].sum(0)
std_intervals_48h[i] = times_intervals[i:i+192].std(0)
# Compute and interpolate blood glucose
init_time = get_floor_range(blood_glucose.times.iloc[0], 15)
endt_time = get_ceil_range(blood_glucose.times.iloc[-1], 15)
time_ranges_blood = get_ts_range(init_time, endt_time, 900)
blood_glucose_interp = np.zeros(len(time_ranges_blood))
blood_times = list(blood_glucose.times)
for i_time in range(len(blood_times)):
try:
idx = time_ranges_blood.index(blood_times[i_time])
blood_glucose_interp[idx] = blood_glucose.blood_glucose.iloc[i_time]
except:
pass
idxs = np.where(blood_glucose_interp)[0]
# TOImprove: better interpolation
blood_glucose_interp = np.interp(range(len(blood_glucose_interp)), idxs, blood_glucose_interp[idxs])
# Compute the std of the blood glucose level on the last 48h
blood_glucose_std_48h = np.zeros(len(blood_glucose_interp)-192)
for i in range(len(blood_glucose_interp)-192):
blood_glucose_std_48h[i] = blood_glucose_interp[i:i+192].std(0)
time_ranges_blood_48h = time_ranges_blood[192:]
# Compute std blood glucose
walking_time_48h = np.zeros(len(time_ranges_blood_48h))
for i_time in range(len(times_intervals_48h)):
try:
idx = time_ranges_blood_48h.index(times_48h[i_time])
walking_time_48h[idx] = times_intervals_48h[i_time, 1]
except:
pass
times_48h = time_ranges_blood_48h
walking_time_at_interval = np.zeros(len(times_48h))
for i_time in range(len(walking_time_at_interval)):
try:
idx = times_48h.index(time_ranges[i_time])
walking_time_at_interval[idx] = times_intervals[i_time, 1]
except:
pass
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for i_col in range(5):
plt.plot(times_activities_48h, times_intervals_48h[:, i_col], label=col_activities[i_col])
_ = plt.legend()
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for i_col in range(5):
plt.plot(times_activities_48h, times_intervals_48h[:, i_col], label=col_activities[i_col])
ax.set_yscale('log')
_ = plt.legend()
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for i_col in range(5):
plt.plot(times_activities_48h, std_intervals_48h[:, i_col], label=col_activities[i_col])
_ = plt.legend()
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
for i_col in range(5):
plt.plot(times_activities_48h, std_intervals_48h[:, i_col], label=col_activities[i_col])
ax.set_yscale('log')
_ = plt.legend()
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
_ = plt.plot(times_48h, walking_time_at_interval)
np.correlate(walking_time_48h, blood_glucose_std_48h)[0]
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
diff_walking = np.diff(walking_time_48h)
diff_blood_glucose = np.diff(blood_glucose_std_48h)
plt.plot(times_48h[:-1], diff_walking/np.std(diff_walking), label="Walking time in the last 48h")
plt.plot(times_48h[:-1], diff_blood_glucose/np.std(diff_blood_glucose), label="std of blood glucose levels of the last 48h")
_ = plt.legend()
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
_ = plt.plot(walking_time_48h, blood_glucose_std_48h, '.')
## Plot
fig, ax = plt.subplots(figsize=(18, 10))
over_logi = blood_glucose_std_48h > 12.5
under_logi = np.logical_not(over_logi)
_ = plt.plot(walking_time_48h[under_logi], blood_glucose_std_48h[under_logi], '.')
_ = plt.plot(walking_time_48h[over_logi], blood_glucose_std_48h[over_logi], '.')
_ = plt.plot(np.linspace(walking_time_48h.min(), walking_time_48h.max(), 100), 12.5*np.ones(100), linestyle='dashed')
## Plot of correlation of walking in that time
fig, ax = plt.subplots(figsize=(18, 10))
walking_time_at_interval
over_logi = blood_glucose_std_48h > 12.5
under_logi = np.logical_not(over_logi)
_ = plt.plot(walking_time_at_interval[under_logi], blood_glucose_std_48h[under_logi], '.')
_ = plt.plot(walking_time_at_interval[over_logi], blood_glucose_std_48h[over_logi], '.')
_ = plt.plot(np.linspace(walking_time_at_interval.min(), walking_time_at_interval.max(), 100), 12.5*np.ones(100), linestyle='dashed')
## Dummy model
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True)
param_grid = {
'penalty': ['l2'],
'C': [0.1, 1., 10.]
}
CV_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
CV_model.fit(walking_time_48h.reshape(-1, 1), blood_glucose_std_48h>14.5)
CV_model.best_score_
CV_model.predict_proba([[100000]])
CV_model.predict_proba([[350000]])
## Store trained model
joblib.dump(CV_model, 'model_blood_glucose.pkl')
### Prediction using walking times in each moment
# Dummy model
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True)
param_grid = {
'penalty': ['l2'],
'C': [0.1, 1., 10.]
}
CV_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
CV_model.fit(walking_time_at_interval.reshape(-1, 1), blood_glucose_std_48h>14.5)
CV_model.best_score_
CV_model.predict_proba([[8000]])
CV_model.predict_proba([[0]])
## Store trained model
joblib.dump(CV_model, 'model_interval_blood_glucose.pkl')