Инжиниринг данных и моделирование с использованием PyTorch

Процесс обработки данных

Примечание. Здесь используются демонстрационные (синтетические, ненастоящие) данные. Цель — показать пример кода и логику работы.

Код

  • Напишем код на PyTorch для табличного набора данных
  • Импорт библиотек
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
print(torch.__version__)
import pandas as pd
import seaborn as sn
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Load the demo dataset and preview the first rows (notebook-style display).
df = pd.read_csv('Sample_IssueDataset.csv')
df.head()
# One-hot encode the categorical columns. dtype=float keeps every column
# numeric: pandas >= 2.0 emits bool dummies by default, and a frame mixing
# bool and numeric columns converts to an object array that torch.tensor()
# cannot consume.
df1 = pd.get_dummies(df, dtype=float)
df1.head()
# Target: the second column of the encoded frame.
# NOTE(review): presumably this is 'EmployeeLeft' - confirm against the CSV.
y = df1.iloc[:,1]
# Features: the first 18 encoded columns with the target column removed.
X = df1.iloc[:,:18]
X = X.drop(columns=['EmployeeLeft'])
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Tensor copies of each split.
X_train_torch = torch.tensor(X_train.values)
X_test_torch = torch.tensor(X_test.values)
y_train_torch = torch.tensor(y_train.values)
y_test_torch = torch.tensor(y_test.values)
# Prefer the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))
class NeuralNetwork(nn.Module):
    """Small fully connected network applied to flattened inputs.

    The input is flattened from dimension 1 onward, passed through
    ``Linear -> ReLU -> Linear`` and returned as raw logits (no final
    activation is applied).

    :param in_features: Number of values per sample after flattening.
        The default keeps the previously hard-coded ``9380*17``.
    :param hidden_features: Width of the hidden layer (default 10).
    :param out_features: Number of output logits (default 10).
    """

    def __init__(self, in_features=9380 * 17, hidden_features=10, out_features=10):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            # The second layer consumes the hidden activations, so its input
            # width must equal the first layer's output width.
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Return the logits for a batch ``x`` of shape ``(batch, ...)``."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
# Build the network on the selected device and dump its structure.
model = NeuralNetwork().to(device)
print(model)
print("Model structure: ", model, "\n\n")
# For every parameter tensor show its name, its size and its first two rows.
for layer_name, weights in model.named_parameters():
    print(f"Layer: {layer_name} | Size: {weights.size()} | Values : {weights[:2]} \n")
import torch

# Training split as tensors.
train_features = torch.tensor(X_train.to_numpy())
train_labels = torch.tensor(y_train.to_numpy())

# Held-out split as tensors.
validation_features = torch.tensor(X_test.to_numpy())
validation_labels = torch.tensor(y_test.to_numpy())

# Input width is taken from the training frame.
n_features = X_train.shape[1]

# Binary classifier: one hidden layer of 18 units and a sigmoid output, so the
# model emits probabilities, which is what BCELoss expects.
# This rebinds `model`, replacing the network created earlier.
model = torch.nn.Sequential(
    torch.nn.Linear(n_features, 18),
    torch.nn.ReLU(),
    torch.nn.Linear(18, 1),
    torch.nn.Sigmoid(),
)
criterion = torch.nn.BCELoss()

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.0001,
    weight_decay=0.001,
)
# Notebook-style shape checks (no effect when run as a script).
X_train_torch.shape
y_train_torch.shape
# Mini-batch training with per-epoch loss tracking on both splits.
n_batches = 2
# torch.chunk splits along dim 0 and tolerates a row count that is not a
# multiple of n_batches (the last chunk is simply smaller), whereas a fixed
# reshape raises in that case.
train_features_batched = torch.chunk(train_features, n_batches)
train_labels_batched = torch.chunk(train_labels, n_batches)
n_epochs = 100
loss_list = []
validate_loss_list = []

for epoch in range(n_epochs):
    # One optimizer step per mini-batch.
    for batch_features, batch_labels in zip(train_features_batched,
                                            train_labels_batched):
        optimizer.zero_grad()

        outputs = model(batch_features.float())

        loss = criterion(outputs.flatten().float(),
                         batch_labels.float())

        loss.backward()

        optimizer.step()

    # Monitoring only: no gradients are needed for the epoch-level losses.
    with torch.no_grad():
        outputs = model(train_features.float())

        validation_outputs = model(validation_features.float())

        loss = criterion(outputs.flatten().float(),
                         train_labels.float())

        validate_loss = criterion(validation_outputs.flatten().float(),
                                  validation_labels.float())

    # Store plain floats so the histories can be plotted directly.
    loss_list.append(loss.item())

    validate_loss_list.append(validate_loss.item())

print('Finished Training')
 
# Plot the per-epoch training and validation loss histories.
import matplotlib.pyplot as plt
plt.plot(loss_list, linewidth=3)
plt.plot(validate_loss_list, linewidth=3)
plt.legend(("Training Loss", "Validation Loss"))
plt.xlabel("Epoch")
plt.ylabel("BCE Loss")
# Run the model on the second validation row and print its output.
y_pred = model(validation_features[1].flatten().float())
print(y_pred)
import matplotlib.pyplot as plt
import numpy as np
# Notebook-style inspection: the bare expressions below only display a value
# in an interactive session and have no effect when run as a script.
type(validation_features.flatten().float())
# Switch to inference mode before predicting on the first validation row.
model.eval()
y_pred = model(validation_features[0].flatten().float())
len(y_pred)
len(validation_features[0].flatten().float())
# Five additional full-batch optimisation steps on the training split.
model.train()
epochs = 5
errors = []  # training loss recorded before each step
for epoch in range(epochs):
    optimizer.zero_grad()
    # Forward pass over the whole training split.
    y_pred = model(train_features.float())
    # Binary cross-entropy against the training labels.
    loss = criterion(y_pred.squeeze(), train_labels.float())
    errors.append(loss.item())
    print(f'Epoch {epoch}: train loss: {loss.item()}')
    # Backpropagate and update the weights.
    loss.backward()
    optimizer.step()
# Long full-batch training run (500 epochs).
# The optimizer must only ever see the training split: the validation split is
# scored later as the "Test loss", so fitting on it would leak that data and
# make the reported metric meaningless.
model.train()
epochs = 500
errors = []  # training loss recorded before each step
for epoch in range(epochs):
    optimizer.zero_grad()
    # Forward pass
    y_pred = model(train_features.float())
    # Compute Loss; squeeze(-1) drops only the trailing output dimension so a
    # single-row split still yields a 1-D prediction vector.
    loss = criterion(y_pred.squeeze(-1), train_labels.float())
    errors.append(loss.item())
    print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
    # Backward pass
    loss.backward()
    optimizer.step()
import matplotlib.pyplot as plt
import numpy as np
def plotcharts(errors, labels=None, predictions=None):
    """Plot the loss history next to real-vs-predicted values.

    :param errors: Sequence of loss values, one per epoch.
    :param labels: Tensor of ground-truth values for the right-hand panel.
        Defaults to the module-level ``train_labels``.
    :param predictions: Tensor of model outputs for the same rows as
        ``labels``. Defaults to the module-level ``y_pred``.
    """
    # Fall back to the globals the function has always used, so existing
    # ``plotcharts(errors)`` calls behave exactly as before.
    if labels is None:
        labels = train_labels
    if predictions is None:
        predictions = y_pred
    errors = np.array(errors)
    plt.figure(figsize=(12, 5))
    # Left panel: loss per epoch.
    graf02 = plt.subplot(1, 2, 1) # nrows, ncols, index
    graf02.set_title('Errors')
    plt.plot(errors, '-')
    plt.xlabel('Epochs')
    # Right panel: ground truth versus model output.
    graf03 = plt.subplot(1, 2, 2)
    graf03.set_title('Tests')
    a = plt.plot(labels.numpy(), 'yo', label='Real')
    plt.setp(a, markersize=10)
    # detach() because predictions may still be attached to the autograd graph.
    a = plt.plot(predictions.detach().numpy(), 'b+', label='Predicted')
    plt.setp(a, markersize=10)
    plt.legend(loc=7)
    plt.show()
# Plot the recorded loss history, then score the held-out split.
plotcharts(errors)
# Inference mode for the evaluation pass.
model.eval()
y_pred = model(validation_features.float())
after_train = criterion(y_pred.squeeze(), validation_labels.float())
print('Test loss after Training', after_train.item())
import matplotlib.pyplot as plt
import numpy as np
def plotcharts(errors, labels=None, predictions=None):
    """Plot the loss history next to real-vs-predicted values.

    :param errors: Sequence of loss values, one per epoch.
    :param labels: Tensor of ground-truth values for the right-hand panel.
        Defaults to the module-level ``train_labels``.
    :param predictions: Tensor of model outputs for the same rows as
        ``labels``. Defaults to the module-level ``y_pred``.
    """
    # Fall back to the globals the function has always used, so existing
    # ``plotcharts(errors)`` calls behave exactly as before.
    if labels is None:
        labels = train_labels
    if predictions is None:
        predictions = y_pred
    errors = np.array(errors)
    plt.figure(figsize=(12, 5))
    # Left panel: loss per epoch.
    graf02 = plt.subplot(1, 2, 1) # nrows, ncols, index
    graf02.set_title('Errors')
    plt.plot(errors, '-')
    plt.xlabel('Epochs')
    # Right panel: ground truth versus model output.
    graf03 = plt.subplot(1, 2, 2)
    graf03.set_title('Tests')
    a = plt.plot(labels.numpy(), 'yo', label='Real')
    plt.setp(a, markersize=10)
    # detach() because predictions may still be attached to the autograd graph.
    a = plt.plot(predictions.detach().numpy(), 'b+', label='Predicted')
    plt.setp(a, markersize=10)
    plt.legend(loc=7)
    plt.show()
# y_pred now holds predictions for the validation split, so compare it with
# the validation labels rather than the (differently sized) training labels.
plotcharts(errors, validation_labels, y_pred)
# The model already ends in a Sigmoid layer, so y_pred is a probability;
# applying torch.sigmoid again would squash every value into (0.5, 0.74).
probs = y_pred.detach()
print(probs)
  • Теперь перейдём к реализации RAI (Responsible AI)
class WrappedPytorchModel(object):
    """A class for wrapping a PyTorch model in the scikit-learn specification.

    Inputs may be a ``pandas.DataFrame``, a NumPy array, a tensor or a nested
    list. A 1-D input is treated as a single row.
    """

    def __init__(self, model):
        """Initialize the wrapper and switch the model to evaluation mode.

        :param model: The PyTorch model to wrap.
        """
        self._model = model
        # Set eval automatically for user for batchnorm and dropout layers
        self._model.eval()

    @staticmethod
    def _to_batch(dataset):
        """Return ``(tensor, was_single_row)`` for the given input.

        The tensor is float32 and at least 2-D; a 1-D input gets a leading
        batch dimension so that it is scored as one row.
        """
        if isinstance(dataset, pd.DataFrame):
            dataset = dataset.values
        tensor = torch.as_tensor(dataset, dtype=torch.float32)
        was_single_row = tensor.dim() == 1
        if was_single_row:
            tensor = tensor.unsqueeze(0)
        return tensor, was_single_row

    def predict(self, dataset):
        """Predict the output using the wrapped PyTorch model.

        :param dataset: The dataset to predict on.
        :return: The raw model output as a NumPy array; a 1-D input yields a
            2-D array with one row.
        """
        batch, was_single_row = self._to_batch(dataset)
        with torch.no_grad():
            # cpu() keeps the conversion valid for models living on another device.
            result = self._model(batch).cpu().numpy()
        # Reshape to 2D if output is 1D and input has one row
        if was_single_row:
            result = result.reshape(1, -1)
        return result

    def predict_classes(self, dataset):
        """Return the largest model output of every row.

        Despite the name this returns the maximum *value* along dimension 1
        (``torch.max(...)[0]``), not the index of the winning class. For a
        single-output model that is the output itself, which callers then
        threshold.

        :param dataset: The dataset to predict on.
        :return: NumPy array with one value per row; a 1-D input yields an
            array of shape ``(1, 1)``.
        """
        batch, was_single_row = self._to_batch(dataset)
        with torch.no_grad():
            # The batch dimension added by _to_batch guarantees dimension 1
            # exists, so a single 1-D row no longer raises here.
            result = torch.max(self._model(batch), 1)[0].cpu().numpy()
        # Reshape to 2D if output is 1D and input has one row
        if was_single_row:
            result = result.reshape(1, -1)
        return result

    def predict_proba(self, dataset):
        """Predict the output probability using the wrapped PyTorch model.

        This simply delegates to :meth:`predict`.

        :param dataset: The dataset to predict_proba on.
        """
        return self.predict(dataset)
class WrappedClassificationModel(object):
    """Classifier facade over a wrapped model and an evaluation function."""

    def __init__(self, model, eval_function):
        """Store the wrapped model and the function used for probabilities.

        :param model: Object exposing ``predict_classes(dataset)``.
        :param eval_function: Callable returning probabilities for a dataset.
        """
        self._model = model
        self._eval_function = eval_function

    def predict(self, dataset):
        """Return hard 0/1 labels for every row of the dataset.

        The flattened output of the wrapped model's ``predict_classes`` is
        thresholded: values strictly above 0.5 become 1, everything else 0.

        :param dataset: The dataset to predict on.
        :return: A list of ints (0 or 1).
        """
        scores = self._model.predict_classes(dataset).flatten()
        return [int(score > 0.5) for score in scores]

    def predict_proba(self, dataset):
        """Predict the output probability using the wrapped model.

        :param dataset: The dataset to predict_proba on.
        :return: The evaluation function's result; a DataFrame result is
            unwrapped to its underlying values.
        """
        outputs = self._eval_function(dataset)
        return outputs.values if isinstance(outputs, pd.DataFrame) else outputs
from interpret_community.common.model_wrapper import _eval_model
from interpret_community.common.model_wrapper import wrap_model
from interpret_community.dataset.dataset_wrapper import DatasetWrapper
# Obtain the evaluation function (and detected ML domain) for the wrapped
# network, scoring it on the validation features as a classification task.
eval_function, eval_ml_domain = _eval_model(
    WrappedPytorchModel(model),
    DatasetWrapper(validation_features.float()),
    "classification",
)
# Facade used below: hard labels from predict(), probabilities from
# predict_proba().
newmodel = WrappedClassificationModel(
    WrappedPytorchModel(model),
    eval_function,
)
  • прогноз
# Notebook-style display of the thresholded 0/1 labels for the validation split.
newmodel.predict(validation_features.float())
# Keep the labels: y_pred is rebound here from a tensor to a list of ints.
y_pred = newmodel.predict(validation_features.float())
  • вероятности предсказания
# Notebook-style display of the evaluation function's probabilities.
newmodel.predict_proba(validation_features.float())
  • тестовые классы
# Notebook-style display of the per-row maximum model output, flattened to 1-D
# (predict_classes returns values, not class indices).
WrappedPytorchModel(model).predict_classes(validation_features.float()).flatten()
  • Отключить подробные журналы DiCE (оставить только предупреждения и ошибки)
import logging
# Install the default root handler, then silence everything below WARNING.
logging.basicConfig()
logging.getLogger().setLevel(logging.WARNING)
  • Конфигурация и запуск объяснения
from interpret.ext.blackbox import KernelExplainer
# Build a kernel explainer around the wrapped classifier, using the validation
# features both to initialise it and as the rows to explain.
explainer = KernelExplainer(newmodel, np.array(validation_features.float()))
global_explanation = explainer.explain_global(np.array(validation_features.float()))

# Sorted SHAP values
print('ranked global importance values: {}'.format(global_explanation.get_ranked_global_values()))
# Corresponding feature names
print('ranked global importance names: {}'.format(global_explanation.get_ranked_global_names()))
# Feature ranks (based on original order of features)
print('global importance rank: {}'.format(global_explanation.global_importance_rank))
# Note: Do not run this cell if using PFIExplainer, it does not support per class explanations
# Per class feature names
print('ranked per class feature names: {}'.format(global_explanation.get_ranked_per_class_names()))
# Per class feature importance values
print('ranked per class feature values: {}'.format(global_explanation.get_ranked_per_class_values()))
# Print out a dictionary that holds the sorted feature importance names and values
# (labelled distinctly from the rank output above so the two lines can be told apart)
print('global feature importance dict: {}'.format(global_explanation.get_feature_importance_dict()))
  • Объяснение
# ExplanationDashboard is not imported anywhere in the visible part of this
# file, so import it here from the package that already supplies the other
# dashboards.
from raiwidgets import ExplanationDashboard
# Interactive view of the global explanation over the validation split.
ExplanationDashboard(global_explanation, newmodel, dataset=np.array(validation_features.float()), true_y=np.array(validation_labels.float()))

  • Анализ справедливости (Fairness)
# Column of the held-out frame that is treated as the sensitive feature.
A_test = X_test["Survey, Relative, Peer's Average Review of Employee"]
from raiwidgets import FairnessDashboard
# sensitive_features: the sensitive column(s) (e.g., age, binary gender)
# y_true: ground truth labels as a plain list
# y_pred: predicted labels
FairnessDashboard(
    sensitive_features=A_test,
    y_true=np.array(validation_labels.float()).tolist(),
    y_pred=y_pred,
)

# Display names of the 17 model input columns, passed to the dashboards.
features = [
    'Activity on Company Forums',
    'Hired through SMTP',
    'National Origin (code)',
    'Negative Review in Past 5 Years',
    'Survey, Relative, Attitude toward Peers',
    "Survey, Relative, Peer's Average Attitude toward Environment",
    "Survey, Relative, Peer's Average Attitude toward Resources",
    "Survey, Relative, Peer's Average Attitude toward WorkType",
    "Survey, Relative, Peer's Average Attitude toward Workload",
    "Survey, Relative, Peer's Average Review of Employee",
    "University_Americanos College",
    'University_Kyrgyz National University',
    'University_Rice University',
    'University_Smolensk Humanitarian University',
    'University_Universitas Negeri Jakarta',
    'University_Universitas Pasundan',
    'University_University of Commerce Luigi Bocconi',
]
  • Анализ ошибок
from raiwidgets import ErrorAnalysisDashboard
# Interactive error analysis over the validation split, with readable
# feature names supplied explicitly.
ErrorAnalysisDashboard(
    global_explanation,
    newmodel,
    dataset=np.array(validation_features.float()),
    true_y=np.array(validation_labels.float()),
    features=features,
)

  • Выполнено

Первоначально опубликовано на https://github.com.