01. Import libraries & make dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from IPython.display import display, Markdown
train = pd.read_csv('..')  # path left as a placeholder; point this at the training CSV
X = train.drop(['target'], axis=1).values
y = train['target'].values
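Since the CSV path above is only a placeholder, a synthetic dataset can stand in so the rest of the notebook runs end to end. A minimal sketch using scikit-learn's make_classification; the sample size and class imbalance here are assumed values, not properties of the original data:

from sklearn.datasets import make_classification

# stand-in data so the notebook runs without the original CSV;
# n_samples and the class weights are assumptions, not taken from the source data
X, y = make_classification(n_samples=50_000, n_features=20, n_informative=10,
                           weights=[0.95, 0.05], random_state=42)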
02. Fitting Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
rf = RandomForestClassifier(n_jobs=-1, max_depth=8, random_state=42).fit(X_train, y_train)
preds = rf.predict_proba(X_test)[:, 1]  # keep only the probability of class 1
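CalibratedClassifierCV is imported above but never used in this section. A minimal sketch of how it could wrap the forest; the choice of method='isotonic' and cv=3 is an assumption, not part of the original notebook:

# fits the forest on CV folds and learns an isotonic mapping on the held-out folds
rf_cal = CalibratedClassifierCV(
    RandomForestClassifier(n_jobs=-1, max_depth=8, random_state=42),
    method='isotonic', cv=3
).fit(X_train, y_train)
preds_cal = rf_cal.predict_proba(X_test)[:, 1]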
03. Defining the ECE function
def ece(y_test, preds, strategy='uniform'):
    df = pd.DataFrame({'target': y_test, 'proba': preds, 'bin': np.nan})
    if strategy == 'uniform':
        # equal-width bins: [0.0, 0.1), [0.1, 0.2), ..., [0.9, 1.0]
        lim_inf = np.linspace(0, 0.9, 10)
        for idx, lim in enumerate(lim_inf):
            df.loc[df['proba'] >= lim, 'bin'] = idx
    elif strategy == 'quantile':
        # equal-size bins: each bin holds roughly the same number of samples
        df['bin'] = pd.qcut(df['proba'], q=10, labels=False, duplicates='drop')
    # per-bin accuracy (mean target) and confidence (mean proba), plus bin counts;
    # rename the counts so the column name does not clash with 'bin' across pandas versions
    counts = df['bin'].value_counts().rename('count')
    df_bin_groups = pd.concat([df.groupby('bin').mean(), counts], axis=1)
    # |accuracy - confidence| per bin, weighted by the bin's share of samples
    df_bin_groups['ece'] = (df_bin_groups['target'] - df_bin_groups['proba']).abs() * (df_bin_groups['count'] / df.shape[0])
    return df_bin_groups['ece'].sum()
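In words: ECE sums, over the bins, the gap between each bin's observed positive rate and its mean predicted probability, weighted by the bin's share of samples. A quick hand-checkable sanity test with toy values (not from the post):

# four samples land in bins 2, 4, 6 and 8; the per-bin gaps are 0.2, 0.4, 0.4, 0.2,
# each weighted by 1/4, so the expected result is 0.3
y_toy = np.array([0, 0, 1, 1])
p_toy = np.array([0.2, 0.4, 0.6, 0.8])
print(ece(y_toy, p_toy))  # ~0.3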
04. Defining the make_report function
def make_report(y_test, preds):
    # Computing AUC and Gini
    auc = roc_auc_score(y_test, preds)
    display(Markdown(f'AUROC: {auc}'))
    display(Markdown(f'Gini: {2 * auc - 1}'))
    display(Markdown(f'Fraction of positive cases in the test set: {y_test.mean()}'))
    display(Markdown(f'Mean predicted value in the test set: {preds.mean()}'))
    display(Markdown(f'ECE (equal width bins): {ece(y_test, preds)}'))
    # Plotting probabilities
    display(Markdown('#### Histogram of the probability distribution'))
    sns.histplot(preds)
    plt.show()
    # Plotting KDE by class (separate y-axes so each density stays readable)
    display(Markdown('#### KDE plots of the probability distribution by class'))
    fig, ax1 = plt.subplots()
    sns.kdeplot(preds[y_test == 0], label='Class 0', ax=ax1)
    ax2 = ax1.twinx()
    sns.kdeplot(preds[y_test == 1], label='Class 1', color='red', ax=ax2)
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc=0)
    plt.show()
    # Plotting calibration
    display(Markdown('#### Calibration curve (equal width bins)'))
    fop, mpv = calibration_curve(y_test, preds, n_bins=10)
    plt.plot(mpv, fop, "s-", label='model')
    plt.plot([0, 1], [0, 1], label='ideal')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positives')
    plt.legend()
    plt.show()
    display(Markdown('#### Calibration curve (equal size bins)'))
    fop, mpv = calibration_curve(y_test, preds, n_bins=10, strategy='quantile')
    plt.plot(mpv, fop, "s-", label='model')
    plt.plot([0, 1], [0, 1], label='ideal')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positives')
    plt.legend()
    plt.show()
05. Output
make_report(y_test, preds)
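For comparison, the same report can be run on the calibrated predictions from the sketch in section 02 (this assumes rf_cal and preds_cal were fitted there); calibration typically lowers ECE while leaving AUROC almost unchanged.

make_report(y_test, preds_cal)  # calibrated forest: ECE usually shrinks, AUROC barely moves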
References
https://www.kaggle.com/code/mateuscco/how-to-evaluate-model-calibration/notebook