import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_digits
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
diabetes = load_diabetes()
diabetes.keys()
dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.DataFrame(diabetes.target, columns=["disease_progression"])
X.head()
y.head()
age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019907 | -0.017646 |
1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068332 | -0.092204 |
2 | 0.085299 | 0.050680 | 0.044451 | -0.005670 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002861 | -0.025930 |
3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022688 | -0.009362 |
4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 |
disease_progression | |
---|---|
0 | 151.0 |
1 | 75.0 |
2 | 141.0 |
3 | 206.0 |
4 | 135.0 |
data = pd.merge(X, y, left_index=True, right_index=True)
data.head()
age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | disease_progression | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019907 | -0.017646 | 151.0 |
1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068332 | -0.092204 | 75.0 |
2 | 0.085299 | 0.050680 | 0.044451 | -0.005670 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002861 | -0.025930 | 141.0 |
3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022688 | -0.009362 | 206.0 |
4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 | 135.0 |
import matplotlib.pyplot as plt
plt.scatter(data["age"], data["disease_progression"], color="b")
plt.show()
<matplotlib.collections.PathCollection at 0x126246eb3a0>
data.columns
Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6', 'disease_progression'], dtype='object')
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
# Create formula
dependent_var = "disease_progression" # we are predicting 'mpg'
independent_var = "age + sex + bmi + bp + s1 + s2 + s3 + s4 + s5 + s6"
formula = dependent_var + " ~ " + independent_var
# Create model & train
model = ols(formula, data).fit()
# Results
print("Anova:")
print(f"{anova_lm(model)}")
print("Params:")
print(f"{model.params}")
print("Summary:")
print(f"{model.summary()}")
Anova: df sum_sq mean_sq F PR(>F) age 1.0 9.252734e+04 92527.342829 31.550422 3.490058e-08 sex 1.0 2.933667e+02 293.366704 0.100034 7.519424e-01 bmi 1.0 8.269553e+05 826955.273690 281.979218 4.744010e-49 bp 1.0 1.293118e+05 129311.781305 44.093358 9.447833e-11 s1 1.0 1.791171e+03 1791.170780 0.610762 4.349301e-01 s2 1.0 5.057628e+03 5057.628085 1.724575 1.898040e-01 s3 1.0 2.373292e+05 237329.237611 80.925674 7.509047e-18 s4 1.0 1.821160e+03 1821.160324 0.620988 4.311135e-01 s5 1.0 5.885588e+04 58855.884508 20.068965 9.581554e-06 s6 1.0 3.080493e+03 3080.492964 1.050401 3.059895e-01 Residual 431.0 1.263986e+06 2932.681637 NaN NaN Params: Intercept 152.133484 age -10.009866 sex -239.815644 bmi 519.845920 bp 324.384646 s1 -792.175639 s2 476.739021 s3 101.043268 s4 177.063238 s5 751.273700 s6 67.626692 dtype: float64 Summary: OLS Regression Results =============================================================================== Dep. Variable: disease_progression R-squared: 0.518 Model: OLS Adj. R-squared: 0.507 Method: Least Squares F-statistic: 46.27 Date: Thu, 23 Feb 2023 Prob (F-statistic): 3.83e-62 Time: 21:28:09 Log-Likelihood: -2386.0 No. Observations: 442 AIC: 4794. Df Residuals: 431 BIC: 4839. Df Model: 10 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 152.1335 2.576 59.061 0.000 147.071 157.196 age -10.0099 59.749 -0.168 0.867 -127.446 107.426 sex -239.8156 61.222 -3.917 0.000 -360.147 -119.484 bmi 519.8459 66.533 7.813 0.000 389.076 650.616 bp 324.3846 65.422 4.958 0.000 195.799 452.970 s1 -792.1756 416.680 -1.901 0.058 -1611.153 26.802 s2 476.7390 339.030 1.406 0.160 -189.620 1143.098 s3 101.0433 212.531 0.475 0.635 -316.684 518.770 s4 177.0632 161.476 1.097 0.273 -140.315 494.441 s5 751.2737 171.900 4.370 0.000 413.407 1089.140 s6 67.6267 65.984 1.025 0.306 -62.064 197.318 ============================================================================== Omnibus: 1.506 Durbin-Watson: 2.029 Prob(Omnibus): 0.471 Jarque-Bera (JB): 1.404 Skew: 0.017 Prob(JB): 0.496 Kurtosis: 2.726 Cond. No. 227. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.