MID-TERM PART B¶

Paula Ramirez 8963215

In this notebook, I will use predictive models such as Gaussian Naive Bayes, Multinomial Naive Bayes, and Logistic Regression to classify data and evaluate their performance.

In [1]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

Gaussian and Multinomial Classifiers for the Wine Dataset¶

Build a model that can best predict and classify wines into 3 categories.

The models considered are:

  • Gaussian Naive Bayes
  • Multinomial Naive Bayes

To evaluate model performance, I will measure accuracy on a held-out test set, using a train-test split strategy.

1. Data Collection¶

Load and preprocess the data from sklearn.

Data Set Characteristics:

Number of Instances: 178 (59, 71, and 48 across the three classes)

Number of Attributes: 13 numeric, predictive attributes and the class

Attribute Information:

  • Alcohol
  • Malic acid
  • Ash
  • Alcalinity of ash
  • Magnesium
  • Total phenols
  • Flavanoids
  • Nonflavanoid phenols
  • Proanthocyanins
  • Color intensity
  • Hue
  • OD280/OD315 of diluted wines
  • Proline
  • class: wine classification (target)
    • class_0
    • class_1
    • class_2
In [2]:
# Splitting data into X (independent) and y (dependent) variables.
X, y = datasets.load_wine(return_X_y=True, as_frame=True)

# Combine X and y for EDA
df_wines = pd.concat([X, y.rename("target")], axis=1)

# Printing first observations
df_wines.head().round(3)
Out[2]:
alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline target
0 14.23 1.71 2.43 15.6 127.0 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065.0 0
1 13.20 1.78 2.14 11.2 100.0 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050.0 0
2 13.16 2.36 2.67 18.6 101.0 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185.0 0
3 14.37 1.95 2.50 16.8 113.0 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480.0 0
4 13.24 2.59 2.87 21.0 118.0 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735.0 0
In [3]:
df_wines.shape
Out[3]:
(178, 14)
In [4]:
df_wines.dtypes
Out[4]:
alcohol                         float64
malic_acid                      float64
ash                             float64
alcalinity_of_ash               float64
magnesium                       float64
total_phenols                   float64
flavanoids                      float64
nonflavanoid_phenols            float64
proanthocyanins                 float64
color_intensity                 float64
hue                             float64
od280/od315_of_diluted_wines    float64
proline                         float64
target                            int64
dtype: object
In [5]:
df_wines.describe()
Out[5]:
alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue od280/od315_of_diluted_wines proline target
count 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000 178.000000
mean 13.000618 2.336348 2.366517 19.494944 99.741573 2.295112 2.029270 0.361854 1.590899 5.058090 0.957449 2.611685 746.893258 0.938202
std 0.811827 1.117146 0.274344 3.339564 14.282484 0.625851 0.998859 0.124453 0.572359 2.318286 0.228572 0.709990 314.907474 0.775035
min 11.030000 0.740000 1.360000 10.600000 70.000000 0.980000 0.340000 0.130000 0.410000 1.280000 0.480000 1.270000 278.000000 0.000000
25% 12.362500 1.602500 2.210000 17.200000 88.000000 1.742500 1.205000 0.270000 1.250000 3.220000 0.782500 1.937500 500.500000 0.000000
50% 13.050000 1.865000 2.360000 19.500000 98.000000 2.355000 2.135000 0.340000 1.555000 4.690000 0.965000 2.780000 673.500000 1.000000
75% 13.677500 3.082500 2.557500 21.500000 107.000000 2.800000 2.875000 0.437500 1.950000 6.200000 1.120000 3.170000 985.000000 2.000000
max 14.830000 5.800000 3.230000 30.000000 162.000000 3.880000 5.080000 0.660000 3.580000 13.000000 1.710000 4.000000 1680.000000 2.000000

FINDINGS¶

The dataset has 178 observations and 14 columns, where the class column is the target variable (13 predictive features plus the target).

The target variable is categorical with three classes: 0, 1, and 2. The mean of about 0.94 suggests the classes are not perfectly balanced, with class 1 being the most frequent.

The features are not scaled.
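
As a quick check of the class-frequency claim above, the per-class counts can be printed from the df_wines frame (a minimal sketch):

# Count how many wines fall in each class (0, 1, 2)
class_counts = df_wines["target"].value_counts().sort_index()
print(class_counts)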

2. Split test and train¶

Using the 80%-20% approach.

In [6]:
# Split test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Validating the number of rows in each split.

In [7]:
print(f"Rows in the train set: {len(X_train)}")
print(f"Rows in the test set: {len(X_test)}")
Rows in the train set: 142
Rows in the test set: 36

3. Multinomial Naive Bayes¶

This algorithm is based on Bayes’ Theorem and is well suited to classification problems like this one: classifying the wines into three categories.

In [8]:
#*-----------Transform to non-negative numbers------------*
# MultinomialNB requires non-negative features, so take absolute values
X_train_abs = np.abs(X_train)
X_test_abs = np.abs(X_test)

#*-----------Training the Multinomial Naive Bayes Model------------*
mul_model = MultinomialNB()
mul_model.fit(X_train_abs, y_train)

#*---------------Prediction in the test dataset-------------------*
y_pred = mul_model.predict(X_test_abs)

#*--------------------Evaluating Accuracy-------------------------*
accuracy = accuracy_score(y_test, y_pred)
print(f"Multinomial Accuracy: {accuracy * 100:.2f}%\n")
Multinomial Accuracy: 88.89%

The Multinomial model requires non-negative values, so the data was converted accordingly. Multinomial Naive Bayes is designed for discrete (count-like) data, so its performance on the continuous Wine features may be limited. Even so, the accuracy for this model was 88.89%, which is acceptable.
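
Because MultinomialNB is really intended for count-like features, one possible workaround (not part of the analysis above, just a hedged sketch) is to discretize the continuous measurements into integer bins with sklearn's KBinsDiscretizer before fitting; the number of bins here is an arbitrary illustrative choice.

from sklearn.preprocessing import KBinsDiscretizer

# Discretize each continuous feature into 10 ordinal integer bins
# (assumption: 10 uniform bins is an arbitrary illustrative choice)
binner = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="uniform")
X_train_binned = binner.fit_transform(X_train)
X_test_binned = binner.transform(X_test)

mul_binned = MultinomialNB()
mul_binned.fit(X_train_binned, y_train)
print(f"MultinomialNB on binned features: {mul_binned.score(X_test_binned, y_test) * 100:.2f}%")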

4. Gaussian Naive Bayes¶

This algorithm is also well suited to classification. It assumes each feature follows a normal distribution within each class, which matches the continuous chemical measurements used to classify the wines.

In [9]:
#*-----------Training the Gaussian Naive Bayes Model------------*
gau_model = GaussianNB()
gau_model.fit(X_train, y_train)

#*---------------Prediction in the test dataset-------------------*
y_pred_gauss = gau_model.predict(X_test)

#*--------------------Evaluating Accuracy-------------------------*
accuracy_gauss = accuracy_score(y_test, y_pred_gauss)
print(f"Gaussian Accuracy: {accuracy_gauss * 100:.2f}%")
Gaussian Accuracy: 100.00%

The Gaussian model performs well because, as noted above, the Wine dataset contains continuous features. Since these features fit the Gaussian assumption reasonably well, I obtained 100% accuracy on this test split, indicating that the model generalized well, especially when compared to the lower performance of the Multinomial model.
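
To informally eyeball the normality assumption GaussianNB relies on, one feature can be plotted per class (a minimal sketch; "alcohol" is just an example choice):

# Per-class histogram of one feature to informally check the normality assumption
for label in sorted(df_wines["target"].unique()):
    subset = df_wines.loc[df_wines["target"] == label, "alcohol"]
    plt.hist(subset, bins=15, alpha=0.5, label=f"class_{label}")
plt.title("Alcohol distribution per wine class")
plt.xlabel("alcohol")
plt.ylabel("count")
plt.legend()
plt.show()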

5. Predicting some observations¶

I will take a few samples from the test dataset and compare the results between the two models.

In [10]:
# Creating a sample with 5 observations
samples = X_test.iloc[[15, 1, 2, 4, 28]]
labels = y_test.iloc[[15, 1, 2, 4, 28]]

pred_gau = gau_model.predict(samples)
pred_mul = mul_model.predict(np.abs(samples)) 

comparison_df = pd.DataFrame({
    "Sample Index": samples.index,
    "Real Label": labels.values,
    "GaussianNB Prediction": pred_gau,
    "MultinomialNB Prediction": pred_mul
})

comparison_df
Out[10]:
Sample Index Real Label GaussianNB Prediction MultinomialNB Prediction
0 111 1 1 1
1 45 0 0 0
2 140 2 2 2
3 67 1 1 1
4 90 1 1 1

6. Conclusion¶

Gaussian reached an accuracy of 100%, while Multinomial got 89%.

From the following confusion matrices, we can see that the GaussianNB predictions are always aligned with the real labels, confirming that the Gaussian model is more suitable for continuous data like the features in the Wine dataset.

On the other hand, MultinomialNB made some classification errors, particularly for class 2, where it misclassified instances as classes 0 and 1.

In [11]:
#  Gaussian Matrix
disp_gau = ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred_gauss))
disp_gau.plot()
plt.title("Confusion Matrix - GaussianNB")
plt.show()

# Multinomial matrix
disp_mul = ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred))
disp_mul.plot()
plt.title("Confusion Matrix - MultinomialNB")
plt.show()
[Figure: confusion matrix plots for GaussianNB and MultinomialNB]
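
For a per-class breakdown that complements the confusion matrices, a short classification_report sketch can be run on the predictions already computed above:

from sklearn.metrics import classification_report

# Per-class precision and recall for both models
print("GaussianNB report:\n", classification_report(y_test, y_pred_gauss))
print("MultinomialNB report:\n", classification_report(y_test, y_pred))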

Logistic Regression with Cross-Validation for the Flower Dataset¶

Now, in this section I will build a model to predict the species of iris flowers in the Flower (Iris) dataset.

The model considered is:

  • Logistic Regression

To evaluate the model's performance, I apply cross-validation using cross_val_score from sklearn.

1. Data Collection¶

Load and preprocess the data using sklearn.datasets.

Data Set Characteristics:

Number of Instances: 150 (50 in each of the three classes)

Number of Attributes: 4 numeric, predictive attributes and one class label

Attribute Information:

  • Sepal length (cm)
  • Sepal width (cm)
  • Petal length (cm)
  • Petal width (cm)
  • Class: Iris flower species (target)
    • setosa
    • versicolor
    • virginica
In [12]:
# Splitting data in X(independent) and Y(dependent) variables.
X, y = datasets.load_iris(return_X_y=True, as_frame=True)

#Combine X and Y to do EDA
df_flowers= pd.concat([X,y.rename("target")], axis=1)

# Printing first observations
df_flowers.head().round(3)
Out[12]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
In [13]:
df_flowers.shape
Out[13]:
(150, 5)
In [14]:
df_flowers.dtypes
Out[14]:
sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target                 int64
dtype: object
In [15]:
df_flowers.describe()
Out[15]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
count 150.000000 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000 1.199333 1.000000
std 0.828066 0.435866 1.765298 0.762238 0.819232
min 4.300000 2.000000 1.000000 0.100000 0.000000
25% 5.100000 2.800000 1.600000 0.300000 0.000000
50% 5.800000 3.000000 4.350000 1.300000 1.000000
75% 6.400000 3.300000 5.100000 1.800000 2.000000
max 7.900000 4.400000 6.900000 2.500000 2.000000

FINDINGS¶

The dataset has 150 observations and 5 columns, where the target column represents the iris flower species (4 predictive features plus the target).

The target variable is categorical, encoded as three integer classes: 0, 1, and 2. The mean of 1.0 and median of 1.0 are consistent with the three species being equally represented (50 observations each), so there is no class imbalance here.

The features are not scaled.

2. Logistic Regression¶

This algorithm is based on the logistic function. In this case, it will be used to classify iris flowers into three categories based on their features.

In [16]:
# Defining the model.
lr_model = LogisticRegression(max_iter=200) 
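
To make the "logistic function" mentioned above concrete, here is a minimal numpy sketch of the sigmoid used in the binary case; for the three iris classes, sklearn generalizes this with a softmax over per-class scores.

# The logistic (sigmoid) function maps any real-valued score into (0, 1).
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

print(sigmoid(np.array([-2.0, 0.0, 2.0])))  # roughly [0.12, 0.50, 0.88]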

3. Applying Cross Val Score¶

With this approach, the dataset is divided into 4 parts (folds). The model is trained and evaluated 4 times, each time using a different fold as the test set. This strategy helps assess whether the model generalizes well to unseen data.

In [17]:
# Evaluating the model with cross val score

scores = cross_val_score(lr_model, X, y, cv=4)

# Showing the results
print("Cross-validation scores for each fold:", scores)
print("Average accuracy:", np.mean(scores))
Cross-validation scores for each fold: [0.97368421 0.97368421 0.94594595 1.        ]
Average accuracy: 0.9733285917496444

By using the cross_val_score strategy with cv=4, the dataset is divided into 4 parts. The model is trained and evaluated 4 times, each time using a different fold as the test set. This allows us to measure the accuracy as an evaluation metric for each fold.
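
Under the hood, when cv is an integer and the estimator is a classifier, cross_val_score uses stratified folds. The following is a rough manual equivalent (a sketch, not the library's exact internal code), using the lr_model, X, and y defined above:

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Rough manual equivalent of cross_val_score(lr_model, X, y, cv=4) for a classifier
manual_scores = []
for train_idx, test_idx in StratifiedKFold(n_splits=4).split(X, y):
    fold_model = clone(lr_model)  # fresh, unfitted copy per fold
    fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])
    manual_scores.append(fold_model.score(X.iloc[test_idx], y.iloc[test_idx]))
print(manual_scores)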

In [18]:
# Plotting to show accuracy per fold
plt.figure(figsize=(4, 4))
plt.bar(range(1, 5), scores, color='skyblue')
plt.ylim(0.8, 1.05)
plt.xticks(range(1, 5), [f'Fold {i}' for i in range(1, 5)], size=8)
plt.yticks(size=8)
plt.title('Accuracy per Fold (Logistic Regression with CV=4)')
plt.xlabel('Fold', size=10)
plt.ylabel('Accuracy', size=10)
plt.tight_layout()
plt.show()
[Figure: bar chart of accuracy per fold for Logistic Regression with cv=4]

4. Conclusion¶

The logistic regression model showed high accuracy across all folds during cross-validation, which means it performs reliably on the Iris dataset. The low variation in the accuracy scores further shows its stability.
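
To back up the "low variation" claim with a number, the spread of the fold scores can be printed directly (a minimal sketch using the scores array from above):

# Standard deviation of the fold accuracies quantifies the "low variation" claim
print(f"Standard deviation of fold accuracies: {scores.std():.4f}")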


Paula Ramirez Student ID 8963215