Давайте код
Используется набор данных Glass («Стекло») — набор данных для задачи классификации. По содержанию различных элементов в стекле определяется, для какой цели это стекло следует использовать.
# Data preprocessing, visualisation and a first look at the dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from the current working directory.
df = pd.read_csv("glass.csv")

# The frame is not modified in this cell, so the row count and the null mask
# are computed once and reused by the prints below.
n_rows = df.shape[0]
null_mask = df.isnull()

print("\nThe Number of Rows and Columns in Dataset : ", df.shape)
print(df)
print("\nThe Number of Rows in Dataset : ", n_rows)

# Heat map of the boolean null mask: one cell per (row, column) entry.
print("Heat Map to display the NULL values present in the Dataset\n")
sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis')

# Column labels of the dataset, in file order.
nameList = list(df.columns)

print("The Columns in dataset and their datatypes\n", df.dtypes)
print("\nThe Number of Rows in Dataset : ", n_rows)

# Per-entry null flags.
print("\nChecking the Presence of Null Data")
print(null_mask)

# Number of null entries in each column.
print("\nTotal Number of Null Data Values in Each Column \n", null_mask.sum())
# Count the null entries in every column and print the per-column totals.
null_counts_per_column = df.isnull().sum()
print("\nTotal Number of Null Data Values in Each Column \n", null_counts_per_column)
# Remove every row that holds at least one null value; `df` is rebound to the
# cleaned frame.
rows_without_nulls = df.dropna(axis='index', how='any')
df = rows_without_nulls

# Summary statistics of the cleaned dataset (shown as the cell's output).
df.describe()
# Bar chart with the number of samples for each value of the 'Type' column.
sns.countplot(data=df, x='Type')
# Outlier detection and removal, one feature column at a time.
#
# A row survives a column's filter only when its value lies strictly inside
# mean +/- factor * std of that column. The statistics are taken from the rows
# that survived the previous columns, so the filters are applied sequentially.
#
# The target column 'Type' is a class label, not a measurement, so it is
# excluded: a mean/std rule on label codes has no meaning, and a box plot of
# 'Type' against itself carries no information.
factor = 3
feature_columns = [column for column in df if column != 'Type']

for feature in feature_columns:
    center = df[feature].mean()
    spread = df[feature].std() * factor
    upper_lim = center + spread
    lower_lim = center - spread
    df = df[(df[feature] < upper_lim) & (df[feature] > lower_lim)]

    # Distribution of this feature per glass type after filtering.
    plt.figure(figsize=(12, 7))
    sns.boxplot(x='Type', y=feature, data=df, palette='winter')

print("\nThe Number of Rows in Dataset : ", df.shape[0])
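Тип стекла (по номеру класса):
1 — оконное стекло для зданий, флоат-обработка (building_windows_float_processed)
2 — оконное стекло для зданий, без флоат-обработки (building_windows_non_float_processed)
3 — оконное стекло для транспортных средств, флоат-обработка (vehicle_windows_float_processed)
4 — оконное стекло для транспортных средств, без флоат-обработки (vehicle_windows_non_float_processed)
5 — тара (containers)
6 — посуда (tableware)
7 — фары (headlamps)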
# Collect the distinct values of the 'Type' column and print them.
typeList = df['Type'].unique()
print("The Following are the Types of Glasses Present in the Dataset")
print(typeList)
Ниже приведены типы стекла, представленные в наборе данных: [1 2 3 5 6 7]
# KNN setup: train/test split, standard scaling and an initial guess for K.
# `df` is used exactly as the preceding cells left it.
import math

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Keep a handle on the prepared frame so later cells can restore `df` from it.
# This is an alias, not a copy.
new_df = df

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Type', axis=1),
    df['Type'],
    test_size=0.30,
    random_state=101,
)

# Feature scaling: fit on the training part only, then apply to both parts.
scaling_X_Values = StandardScaler()
X_train = scaling_X_Values.fit_transform(X_train)
X_test = scaling_X_Values.transform(X_test)

# Initial K: square root of the training-set size, truncated to an integer.
sqrt_of_train_size = math.sqrt(len(y_train))
kvalueInitial = int(sqrt_of_train_size)
# The message below promises an odd K; truncation alone does not guarantee
# that, so an even value is moved up to the next odd number.
if kvalueInitial % 2 == 0:
    kvalueInitial += 1

print(sqrt_of_train_size)
print("The K value should be Odd therefore it should be", kvalueInitial)

# Accuracy per distance metric, filled in by the evaluation cells, and the
# display names in the order those cells run.
Distance_Accuracy = []
Distance_Name = ["Euclidean ","Minkowski ","Manhattan"]
11.269427669584644
Значение K должно быть нечётным, поэтому оно должно быть равно 11.
# Baseline KNN model with Euclidean distance.
# K is `kvalueInitial`, the value derived from the square root of the
# training-set size.
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

knn_model = KNeighborsClassifier(metric='euclidean', n_neighbors=kvalueInitial)

# Fit on the scaled training data, then predict the scaled test data.
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

# Model evaluation on the test set.
confusionMatrix = confusion_matrix(y_test, y_pred)
micro_f1 = f1_score(y_test, y_pred, average='micro')
test_accuracy = accuracy_score(y_test, y_pred)

print ("\n\tThe Confusion Matrix\n",confusionMatrix)
print("\n\tThe F1 Score : ", micro_f1)
print("\n\tThe Accuracy Score is : ", test_accuracy)
# Choosing K: sweep K over 1..12 (the upper bound 13 is exclusive) with
# Euclidean distance and record both the test accuracy and the test error
# rate for every K.
#
# Each model is fitted and evaluated once; both metrics are taken from the
# same predictions instead of training every model a second time.
k_values = range(1, 13)

accuracy = []
errorRate = []
for i in k_values:
    knn_model = KNeighborsClassifier(n_neighbors=i, metric='euclidean')
    knn_model.fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    # Fraction of test samples whose predicted label differs from the truth.
    errorRate.append(np.mean(y_pred != y_test))

# Accuracy as a function of K.
plt.figure(figsize=(10,6))
plt.plot(k_values, accuracy, color='green', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()

# Error rate as a function of K.
plt.figure(figsize=(10,6))
plt.plot(k_values, errorRate, color='black', linestyle='dashed', marker='*',
         markerfacecolor='red', markersize=10)
plt.title('Error vs. K-Value')
plt.xlabel('K-Value')
plt.ylabel('Error')
plt.show()
# Best K by accuracy: the lists start at K = 1, so the list position of the
# first maximum plus one is the corresponding K.
best_accuracy = max(accuracy)
ll = accuracy.index(best_accuracy)
print("\n\tThe Highest Accuracy Observed with Euclidean Distance is : ", best_accuracy, ", for K = ", ll + 1)

# Best K by error rate: same indexing rule, applied to the first minimum.
lowest_error = min(errorRate)
ll = errorRate.index(lowest_error)
print("\n\tThe Lowest Error Observed with Euclidean Distance is : ", lowest_error, ", for K = ", ll + 1)

# The K with the lowest error is the one used by the following evaluations.
kvalue = ll + 1
print("\n\tThe Appropriate Value of K will be ", kvalue)
# Evaluation with Euclidean distance, using the K selected from the sweep.
df = new_df

# Same split parameters and seed as before, so the partition is reproduced.
features = df.drop('Type', axis=1)
labels = df['Type']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.30, random_state=101
)

# Feature scaling: fit on the training part, apply to both parts.
scaling_X_Values = StandardScaler()
X_train = scaling_X_Values.fit_transform(X_train)
X_test = scaling_X_Values.transform(X_test)

# Define and fit the model, then predict the test set.
knn_model = KNeighborsClassifier(metric='euclidean', n_neighbors=kvalue)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

confusionMatrix = confusion_matrix(y_test, y_pred)
euclidean_accuracy = accuracy_score(y_test, y_pred)

print ("\n\tThe Confusion Matrix\n",confusionMatrix)
print("\n\tThe F1 Score : ", f1_score(y_test, y_pred, average='micro'))
print("\n\tThe Accuracy Score is : ", euclidean_accuracy)

# Record the accuracy for the final comparison of distance metrics.
Distance_Accuracy.append(euclidean_accuracy)

print(classification_report(y_test, y_pred))
# Evaluation with Minkowski distance, using the K selected from the
# Euclidean sweep.
df = new_df

# Same split parameters and seed as before, so the partition is reproduced.
features = df.drop('Type', axis=1)
labels = df['Type']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.30, random_state=101
)

# Feature scaling: fit on the training part, apply to both parts.
scaling_X_Values = StandardScaler()
X_train = scaling_X_Values.fit_transform(X_train)
X_test = scaling_X_Values.transform(X_test)

# NOTE(review): the power parameter is written out at its default, p=2.
# With p=2 the Minkowski metric is the Euclidean metric, so this run is
# expected to reproduce the Euclidean result; choose another p for a
# genuinely different comparison.
knn_model = KNeighborsClassifier(metric='minkowski', p=2, n_neighbors=kvalue)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

confusionMatrix = confusion_matrix(y_test, y_pred)
minkowski_accuracy = accuracy_score(y_test, y_pred)

print ("\n\tThe Confusion Matrix\n",confusionMatrix)
print("\n\tThe F1 Score : ", f1_score(y_test, y_pred, average='micro'))
print("\n\tThe Accuracy Score is : ", minkowski_accuracy)

# Record the accuracy for the final comparison of distance metrics.
Distance_Accuracy.append(minkowski_accuracy)
# Evaluation with Manhattan distance, using the K selected from the
# Euclidean sweep.
df = new_df

# Same split parameters and seed as before, so the partition is reproduced.
features = df.drop('Type', axis=1)
labels = df['Type']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.30, random_state=101
)

# Feature scaling: fit on the training part, apply to both parts.
scaling_X_Values = StandardScaler()
X_train = scaling_X_Values.fit_transform(X_train)
X_test = scaling_X_Values.transform(X_test)

# Define and fit the model, then predict the test set.
knn_model = KNeighborsClassifier(metric='manhattan', n_neighbors=kvalue)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

confusionMatrix = confusion_matrix(y_test, y_pred)
manhattan_accuracy = accuracy_score(y_test, y_pred)

print ("\n\tThe Confusion Matrix\n",confusionMatrix)
print("\n\tThe F1 Score : ", f1_score(y_test, y_pred, average='micro'))
print("\n\tThe Accuracy Score is : ", manhattan_accuracy)

# Record the accuracy for the final comparison of distance metrics.
Distance_Accuracy.append(manhattan_accuracy)
# Final comparison: print the raw lists, then one row per distance metric
# with the accuracy recorded for it.
print(Distance_Name)
print(Distance_Accuracy)

print("\n\tDistance Method \t Accuracy")
# Pair each metric name with its accuracy directly instead of indexing both
# lists with a hard-coded count of three.
for metric_name, metric_accuracy in zip(Distance_Name, Distance_Accuracy):
    print("\n\t", metric_name, "\t", metric_accuracy)
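В заключение: для этого набора данных евклидово расстояние и расстояние Минковского дают более высокую точность, чем манхэттенское. При этом расстояние Минковского с параметром по умолчанию p=2 совпадает с евклидовым, поэтому результаты этих двух вариантов одинаковы.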