Let's get to the code.

We use the Glass dataset, a classification dataset: based on the content of the various chemical elements present in a glass sample, we predict what purpose the glass serves.

#Data Preprocessing Part
#Data Visualisation, Data Analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("glass.csv")
print("\nThe Number of Rows and Columns in Dataset : ",df.shape)
print(df)
print("\nThe Number of Rows in Dataset : ",df.shape[0])
#print(df)

#Displaying the Presence of Null Data using Heat Map
print("Heat Map to display the NULL values present in the Dataset\n")
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')
plt.show()
#sns.pairplot(df, hue='Type')  #optional: pairwise feature plots coloured by glass type

#Collecting the column names of the dataset
nameList = list(df.columns)

print("The Columns in dataset and their datatypes\n",df.dtypes)

print("\nThe Number of Rows in Dataset : ",df.shape[0])

#To check the Null Data Present in the Dataset
print("\nChecking the Presence of Null Data")
print(df.isnull())

#To Check total Number of Null Data Points in Each Columns
print("\nTotal Number of Null Data Values in Each Column \n",df.isnull().sum())

#Removing all the Null values present in the Dataset
# making new data frame with dropped NA values
df = df.dropna(axis = 0, how ='any')

#Describing the dataset: statistical summary
print(df.describe())

#Showing the number of samples of each glass type
sns.countplot(x='Type', data=df)
plt.show()

#Outlier detection and removal for every feature column
#Rows with values more than 3 standard deviations from the column mean are dropped
factor = 3
for i in df.columns.drop('Type'):  #skip the class label column
    upper_lim = df[i].mean() + df[i].std() * factor
    lower_lim = df[i].mean() - df[i].std() * factor
    df = df[(df[i] < upper_lim) & (df[i] > lower_lim)]
    plt.figure(figsize=(12, 7))
    sns.boxplot(x='Type', y=i, data=df, palette='winter')
    plt.show()

print("\nThe Number of Rows in Dataset : ",df.shape[0])

Type of glass, by number:

1: building windows, float processed
2: building windows, non-float processed
3: vehicle windows, float processed
4: vehicle windows, non-float processed (not present in this dataset)
5: containers
6: tableware
7: headlamps
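For readable output, the numeric codes can be mapped to names. A minimal sketch assuming the mapping above; the GLASS_TYPES dictionary is our illustrative helper, not part of the original script.

#Hypothetical helper: map the numeric Type codes to readable names
GLASS_TYPES = {
    1: "building windows, float processed",
    2: "building windows, non-float processed",
    3: "vehicle windows, float processed",
    4: "vehicle windows, non-float processed",
    5: "containers",
    6: "tableware",
    7: "headlamps",
}
print(df['Type'].map(GLASS_TYPES).value_counts())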

#Listing the unique glass types present in the dataset
typeList = df.Type.unique()
print("The Following are the Types of Glasses Present in the Dataset")
print(typeList)

The following are the types of glass present in the dataset: [1 2 3 5 6 7] (note that type 4 does not occur in the data).

#For the KNN algorithm
#Using Euclidean distance
#Using standard scaling for the features
#Note: the outliers were removed above, even though they may carry signal for predicting the glass type
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

#Keep a reference to the cleaned data frame so each experiment below starts from the same data
new_df = df
X_train, X_test, y_train, y_test = train_test_split(df.drop('Type',axis=1), df['Type'], test_size=0.30,random_state=101)

#Feature scaling
scaling_X_Values = StandardScaler()
X_train = scaling_X_Values.fit_transform(X_train)
X_test = scaling_X_Values.transform(X_test)

#Getting the initial value of K: a common heuristic is the square root of the training-set size, rounded to an odd integer
import math
print(math.sqrt(len(y_train)))
kvalueInitial = int(math.sqrt(len(y_train)))
print("The K value should be odd, therefore it should be", kvalueInitial)


Distance_Accuracy = []
Distance_Name = ["Euclidean", "Minkowski", "Manhattan"]
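The three metrics differ only in how they measure the distance between two feature vectors. Below is a minimal NumPy illustration of the formulas (our own sketch, not part of the pipeline). Note that scikit-learn's 'minkowski' metric defaults to p=2, which is exactly the Euclidean distance.

a, b = np.array([1.0, 2.0, 3.0]), np.array([4.0, 6.0, 3.0])

euclidean = np.sqrt(np.sum((a - b) ** 2))          #sqrt(sum_i (a_i - b_i)^2)
manhattan = np.sum(np.abs(a - b))                  #sum_i |a_i - b_i|
p = 3
minkowski = np.sum(np.abs(a - b) ** p) ** (1 / p)  #(sum_i |a_i - b_i|^p)^(1/p)

print(euclidean, manhattan, minkowski)  #5.0, 7.0, ~4.5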

11.269427669584644

The K value should be odd, therefore it should be 11.
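Here int(sqrt(n)) happens to land on an odd number, but it can just as easily be even; a small guard (our addition, not in the original script) makes the heuristic robust.

#Sketch: round the heuristic K down to the nearest odd integer
k = int(math.sqrt(len(y_train)))
if k % 2 == 0:
    k -= 1  #an odd K avoids ties when two classes split the neighbourhood evenly
kvalueInitial = k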

#Defining the KNN model
#The value 11 was chosen above as the odd integer part of the square root of the training-set size
knn_model = KNeighborsClassifier(n_neighbors=kvalueInitial, metric='euclidean')
#Fitting the Model
knn_model.fit(X_train, y_train)
#Predicting the Test Set Result for the Euclidean Distance
y_pred = knn_model.predict(X_test)
#Model evaluation (note: for single-label multiclass data, the micro-averaged F1 score equals plain accuracy, so the two scores below will match)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
confusionMatrix = confusion_matrix(y_test, y_pred)
print ("\n\tThe Confusion Matrix\n",confusionMatrix)
print("\n\tThe F1 Score : ",f1_score(y_test, y_pred, average='micro'))
print("\n\tThe Accuracy Score is : ",accuracy_score(y_test, y_pred))

#Deciding which value of K to choose:
#first by checking the accuracy of the model,
#then by checking the error rate

#Trying values of K from 1 to 12 neighbours
accuracy = []
for i in range(1,13):
    knn_model = KNeighborsClassifier(n_neighbors=i,metric='euclidean')
    knn_model.fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))

plt.figure(figsize=(10,6))
plt.plot(range(1,13),accuracy,color='green', linestyle='dashed', marker='o',markerfacecolor='red', markersize=10)
plt.title('Accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()

#Error Rate Calculation
errorRate = []
for i in range(1,13):
    knn_model = KNeighborsClassifier(n_neighbors=i, metric='euclidean')
    knn_model.fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    errorRate.append(np.mean(y_pred!=y_test))
    #print(errorRate)

plt.figure(figsize=(10,6))
plt.plot(range(1,13),errorRate,color='black', linestyle='dashed', marker='*',
         markerfacecolor='red', markersize=10)
plt.title('Error vs. K-Value')
plt.xlabel('K-Value')
plt.ylabel('Error')
plt.show()
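For a single split the two plots carry the same information, since the error rate is the complement of the accuracy; a one-line sanity check:

#np.mean(y_pred != y_test) is exactly 1 - accuracy on the same predictions
assert abs(np.mean(y_pred != y_test) - (1 - accuracy_score(y_test, y_pred))) < 1e-12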

#From the accuracy list, find the maximum accuracy;
#its index position plus 1 gives the value of K for which the highest accuracy was observed
ll = accuracy.index(max(accuracy))
print("\n\tThe Highest Accuracy Observed with Euclidean Distance is : ",max(accuracy),", for K = ",ll+1)
#Distance_Accuracy.append(max(accuracy)) #For Euclidean Distance
#From the error list, find the minimum error;
#its index position plus 1 gives the value of K for which the lowest error was observed
ll = errorRate.index(min(errorRate))
print("\n\tThe Lowest Error Observed with Euclidean Distance is : ",min(errorRate),", for K = ",ll+1)

print("\n\tThe Appropriate Value of K will be ",ll+1)

kvalue = ll+1

df = new_df
#Using Euclidean distance with the best value of K found above
X_train, X_test, y_train, y_test = train_test_split(df.drop('Type',axis=1), df['Type'], test_size=0.30,random_state=101)
#Feature scaling
scaling_X_Values = StandardScaler()
X_train = scaling_X_Values.fit_transform(X_train)
X_test = scaling_X_Values.transform(X_test)
#Defining the Model KNN
knn_model = KNeighborsClassifier(n_neighbors=kvalue, metric='euclidean')
# Fit Model
knn_model.fit(X_train, y_train)
# Predict the test set results
y_pred = knn_model.predict(X_test)
confusionMatrix = confusion_matrix(y_test, y_pred)
print ("\n\tThe Confusion Matrix\n",confusionMatrix)
print("\n\tThe F1 Score : ",f1_score(y_test, y_pred,average='micro'))
print("\n\tThe Accuracy Score is : ",accuracy_score(y_test, y_pred))
Distance_Accuracy.append(accuracy_score(y_test, y_pred))
print(classification_report(y_test,y_pred))

df = new_df
#Using Minkowski distance with the same K (scikit-learn's 'minkowski' defaults to p=2, i.e. the Euclidean distance)
X_train, X_test, y_train, y_test = train_test_split(df.drop('Type',axis=1), df['Type'], test_size=0.30,random_state=101)
#Feature scaling
scaling_X_Values = StandardScaler()
X_train = scaling_X_Values.fit_transform(X_train)
X_test = scaling_X_Values.transform(X_test)
#Defining the Model KNN
knn_model = KNeighborsClassifier(n_neighbors=kvalue, metric='minkowski')
# Fit Model
knn_model.fit(X_train, y_train)
# Predict the test set results
y_pred = knn_model.predict(X_test)
confusionMatrix = confusion_matrix(y_test, y_pred)
print ("\n\tThe Confusion Matrix\n",confusionMatrix)
print("\n\tThe F1 Score : ",f1_score(y_test, y_pred,average='micro'))
print("\n\tThe Accuracy Score is : ",accuracy_score(y_test, y_pred))
Distance_Accuracy.append(accuracy_score(y_test, y_pred))

df = new_df
#Using Manhattan distance with the best value of K found for the Euclidean distance
X_train, X_test, y_train, y_test = train_test_split(df.drop('Type',axis=1), df['Type'], test_size=0.30,random_state=101)
#Feature scaling
scaling_X_Values = StandardScaler()
X_train = scaling_X_Values.fit_transform(X_train)
X_test = scaling_X_Values.transform(X_test)
#Defining the Model KNN using Manhattan Distance
knn_model = KNeighborsClassifier(n_neighbors=kvalue,metric='manhattan')
# Fit Model
knn_model.fit(X_train, y_train)
# Predict the test set results
y_pred = knn_model.predict(X_test)

confusionMatrix = confusion_matrix(y_test, y_pred)
print ("\n\tThe Confusion Matrix\n",confusionMatrix)
print("\n\tThe F1 Score : ",f1_score(y_test, y_pred,average='micro'))
print("\n\tThe Accuracy Score is : ",accuracy_score(y_test, y_pred))
Distance_Accuracy.append(accuracy_score(y_test, y_pred))

print(Distance_Name)
print(Distance_Accuracy)
print("\n\tDistance Method \t Accuracy")
for i in range(3):
    print("\n\t",Distance_Name[i] ,"\t", Distance_Accuracy[i])

In conclusion, the Euclidean and Minkowski distances are more accurate than the Manhattan distance for this dataset. This is unsurprising: with the default p=2, the Minkowski metric is exactly the Euclidean distance, so the first two models are in fact identical.
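A single 70/30 split can be noisy, so the comparison could also be run under cross-validation. Below is a minimal sketch under that assumption; the 5-fold setting and the use of a pipeline are our choices, not part of the original experiment.

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

#Scale inside each fold to avoid leaking test statistics into the scaler
for name, metric in zip(Distance_Name, ["euclidean", "minkowski", "manhattan"]):
    model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=kvalue, metric=metric))
    scores = cross_val_score(model, df.drop('Type', axis=1), df['Type'], cv=5)
    print(name, scores.mean())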