Import Data¶

In [61]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_digits
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one,
# so cells such as `X.head()` followed by `y.head()` render both results.
InteractiveShell.ast_node_interactivity = 'all'

Import Diabetes¶

In [62]:
# Load the scikit-learn diabetes dataset into a dict-like container.
diabetes = load_diabetes()
# Inspect which fields the container exposes (data, target, feature_names, ...).
diabetes.keys()
Out[62]:
dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])
In [63]:
# Feature matrix: one named column per entry in diabetes.feature_names.
X = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
# Target kept as a single-column frame so it can be combined with X by index.
y = pd.DataFrame({"disease_progression": diabetes.target})

# Preview the first rows of both frames.
X.head()
y.head()
Out[63]:
age sex bmi bp s1 s2 s3 s4 s5 s6
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019907 -0.017646
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068332 -0.092204
2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 -0.002592 0.002861 -0.025930
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022688 -0.009362
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031988 -0.046641
Out[63]:
disease_progression
0 151.0
1 75.0
2 141.0
3 206.0
4 135.0
In [64]:
# Combine features and target into one frame by aligning on the shared row index
# (inner join, i.e. only index labels present in both frames are kept).
data = X.join(y, how="inner")
data.head()
Out[64]:
age sex bmi bp s1 s2 s3 s4 s5 s6 disease_progression
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019907 -0.017646 151.0
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068332 -0.092204 75.0
2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 -0.002592 0.002861 -0.025930 141.0
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022688 -0.009362 206.0
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031988 -0.046641 135.0
In [65]:
import matplotlib.pyplot as plt

# Scatter one feature against the target as a first visual check.
# The explicit Figure/Axes interface is used so labels and title are attached
# to this figure rather than to pyplot's implicit "current" axes.
fig, ax = plt.subplots()

# Return values are bound to names on purpose: with
# InteractiveShell.ast_node_interactivity = 'all' every bare expression is echoed,
# which would otherwise print artist reprs (e.g. <PathCollection ...>) as cell output.
points = ax.scatter(data["age"], data["disease_progression"], color="b")
labels = ax.set(
    xlabel="age",
    ylabel="disease_progression",
    title="Disease progression vs. age",
)

plt.show()
Out[65]:
<matplotlib.collections.PathCollection at 0x126246eb3a0>
In [66]:
# List the column labels of the merged frame (features followed by the target);
# these are the names used in the regression formula.
data.columns
Out[66]:
Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6',
       'disease_progression'],
      dtype='object')

Import OLS¶

In [67]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

Multiple Linear Regression Model¶

In [68]:
# Build the R-style formula: response on the left of "~", predictors on the right.
dependent_var = "disease_progression"  # response variable: the disease-progression target
independent_var = "age + sex + bmi + bp + s1 + s2 + s3 + s4 + s5 + s6"
formula = f"{dependent_var} ~ {independent_var}"

# Specify the ordinary-least-squares model on the merged frame and fit it.
model = ols(formula=formula, data=data).fit()

# Report, each under its own heading: the ANOVA table, the fitted
# coefficients, and the full regression summary.
print("Anova:")
print(anova_lm(model))
print("Params:")
print(model.params)
print("Summary:")
print(model.summary())
Anova:
             df        sum_sq        mean_sq           F        PR(>F)
age         1.0  9.252734e+04   92527.342829   31.550422  3.490058e-08
sex         1.0  2.933667e+02     293.366704    0.100034  7.519424e-01
bmi         1.0  8.269553e+05  826955.273690  281.979218  4.744010e-49
bp          1.0  1.293118e+05  129311.781305   44.093358  9.447833e-11
s1          1.0  1.791171e+03    1791.170780    0.610762  4.349301e-01
s2          1.0  5.057628e+03    5057.628085    1.724575  1.898040e-01
s3          1.0  2.373292e+05  237329.237611   80.925674  7.509047e-18
s4          1.0  1.821160e+03    1821.160324    0.620988  4.311135e-01
s5          1.0  5.885588e+04   58855.884508   20.068965  9.581554e-06
s6          1.0  3.080493e+03    3080.492964    1.050401  3.059895e-01
Residual  431.0  1.263986e+06    2932.681637         NaN           NaN
Params:
Intercept    152.133484
age          -10.009866
sex         -239.815644
bmi          519.845920
bp           324.384646
s1          -792.175639
s2           476.739021
s3           101.043268
s4           177.063238
s5           751.273700
s6            67.626692
dtype: float64
Summary:
                             OLS Regression Results                            
===============================================================================
Dep. Variable:     disease_progression   R-squared:                       0.518
Model:                             OLS   Adj. R-squared:                  0.507
Method:                  Least Squares   F-statistic:                     46.27
Date:                 Thu, 23 Feb 2023   Prob (F-statistic):           3.83e-62
Time:                         21:28:09   Log-Likelihood:                -2386.0
No. Observations:                  442   AIC:                             4794.
Df Residuals:                      431   BIC:                             4839.
Df Model:                           10                                         
Covariance Type:             nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    152.1335      2.576     59.061      0.000     147.071     157.196
age          -10.0099     59.749     -0.168      0.867    -127.446     107.426
sex         -239.8156     61.222     -3.917      0.000    -360.147    -119.484
bmi          519.8459     66.533      7.813      0.000     389.076     650.616
bp           324.3846     65.422      4.958      0.000     195.799     452.970
s1          -792.1756    416.680     -1.901      0.058   -1611.153      26.802
s2           476.7390    339.030      1.406      0.160    -189.620    1143.098
s3           101.0433    212.531      0.475      0.635    -316.684     518.770
s4           177.0632    161.476      1.097      0.273    -140.315     494.441
s5           751.2737    171.900      4.370      0.000     413.407    1089.140
s6            67.6267     65.984      1.025      0.306     -62.064     197.318
==============================================================================
Omnibus:                        1.506   Durbin-Watson:                   2.029
Prob(Omnibus):                  0.471   Jarque-Bera (JB):                1.404
Skew:                           0.017   Prob(JB):                        0.496
Kurtosis:                       2.726   Cond. No.                         227.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.