import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_digits
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to echo the result of every top-level expression in a cell,
# not only the last one; the bare expressions below rely on this.
InteractiveShell.ast_node_interactivity = "all"

# Load the Iris dataset that ships with scikit-learn.
iris = load_iris()
# Tip: `type(iris)` and `iris.DESCR` are useful for exploring the loaded object.
iris.keys()

# Feature table: one column per name in iris.feature_names.
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Labels as a one-column table named "class"
# (0 - Setosa, 1 - versicolor, 2 - virginica).
y = pd.DataFrame(data=iris.target, columns=["class"])

# Preview the first rows of the features, then of the labels.
X.head()
y.head()
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)
# Shapes are printed in the order: X_train, X_test, y_train, y_test.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

# Inspect the held-out features, then the shape and the first rows of the
# matching labels.
display(X_test.head())
display(y_test.shape)
display(y_test.head())

# Report the container types of the test labels and the test features.
print(f"The type of the y_test: {type(y_test)}")
print(f"The type of the X_test: {type(X_test)}")
from sklearn.tree import DecisionTreeClassifier

# Fixed random_state so the fitted tree is reproducible between runs.
model = DecisionTreeClassifier(random_state=5)
# Train the model on the training features and their labels.
model.fit(X_train, y_train)
# Predict labels for the held-out test features.
y_test_predicted = model.predict(X_test)

# Draw the fitted tree on a large canvas.
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plt.figure(figsize=(20, 12))
modeltree = plot_tree(
    model,
    # Pass plain Python lists: some scikit-learn releases validate these
    # parameters strictly and reject a pandas Index or a NumPy array.
    feature_names=list(X.columns),
    # Class names make each node show its majority class by name.
    class_names=[str(name) for name in iris.target_names],
    filled=True,
    rounded=True,
    fontsize=14,
)
# Print the distinct labels (in the Iris data, there are only 3 distinct labels)
print(f"Distinct labels in the Iris dataset: {iris.target_names}\n")
# .tolist() turns the NumPy scalars into plain Python ints so the list prints
# the same way regardless of the NumPy version.
print(f"Predicted labels:\t {y_test_predicted.tolist()}")
# y_test is a one-column DataFrame: iterating it (as list(y_test) does) yields
# the column name, not the values, so select the "class" column explicitly.
print(f"Actual labels:\t\t {y_test['class'].tolist()}")
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Rows are the actual classes, columns are the predicted classes.
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=y_test_predicted)
# Output recorded from an earlier run:
#          Setosa  Versicolor  Virginica
# array([[14,  0,  0],
#        [ 0, 17,  1],
#        [ 0,  0, 12]])
# 1st row: 14 flowers are actually Setosa, and all of them are predicted as Setosa.
# 2nd row: 17 of the 18 Versicolor flowers are labelled correctly; 1 is
#          misclassified as Virginica.
# 3rd row: all 12 Virginica flowers are classified correctly.
# NOTE(review): the recorded counts may not match the current split - re-run
# and compare against the matrix displayed below.
display(confusionMatrix)
from sklearn import metrics
import matplotlib.pyplot as plt

# Plot the confusion matrix directly from the true and predicted labels.
disp = metrics.ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_test_predicted,
)
disp.figure_.suptitle("Confusion Matrix")
print(f"Confusion matrix:\n{disp.confusion_matrix}")
plt.show()

# Accuracy on the test set. This is a bare expression: its value is only
# shown when IPython echoes expression results (see the
# ast_node_interactivity setting near the top of the file).
accuracy_score(y_true=y_test, y_pred=y_test_predicted)
# Text summary of the per-class metrics.
print(classification_report(y_true=y_test, y_pred=y_test_predicted))