跳到主要内容

3分类问题概率校准模板

https://scikit-learn.org/stable/auto_examples/calibration/plot_calibration_multiclass.html#sphx-glr-auto-examples-calibration-plot-calibration-multiclass-py

下图为项目总览:image.png。项目可以分为四个部分:

  • 第一部分,数据生成,分别生成训练数据,校准数据以及测试数据
import numpy as np
import pandas
from sklearn.datasets import make_blobs

import suanpan
from suanpan.app import app
from suanpan.app.arguments import Csv


def createDataFrame(X, y):
    """Combine a feature matrix and a label vector into a single DataFrame.

    Args:
        X: 2-D array; each column becomes a ``feature_<i>`` column.
        y: 1-D array of labels with one entry per row of ``X``.

    Returns:
        A DataFrame holding the ``feature_0 .. feature_{n-1}`` columns
        followed by a ``label`` column, indexed like the feature frame.
    """
    featureCount = X.shape[1]

    featureColumns = ["feature_{}".format(i) for i in range(featureCount)]
    labelColumns = ["label"]

    features = pandas.DataFrame(X, columns=featureColumns)
    label = pandas.DataFrame(y, columns=labelColumns)

    # `join_axes` was removed from pandas.concat in pandas 1.0; reindexing the
    # concatenated frame on the feature index gives the same row alignment.
    return pandas.concat([features, label], axis=1).reindex(features.index)


@app.output(Csv(key="outputData1"))
@app.output(Csv(key="outputData2"))
@app.output(Csv(key="outputData3"))
@app.output(Csv(key="outputData4"))
def Demo(context):
    """Generate a synthetic blob dataset and emit four slices of it.

    The 1000 generated samples are cut by row position into, in order:
    rows 0-599 (train), rows 600-799 (validation), rows 0-799
    (train + validation) and rows 800-999 (test). Each slice is returned as
    a DataFrame built by ``createDataFrame``.
    """
    args = context.args  # read as in every component of this template; unused here

    np.random.seed(0)

    # Generate data
    samples, targets = make_blobs(
        n_samples=1000, n_features=2, random_state=42, cluster_std=5.0
    )

    # (start, stop) row ranges, listed in the order the frames are returned.
    rowRanges = [(0, 600), (600, 800), (0, 800), (800, None)]

    return tuple(
        createDataFrame(samples[start:stop], targets[start:stop])
        for start, stop in rowRanges
    )


if __name__ == "__main__":
suanpan.run(app)

  • 第二部分,模型训练
    • 随机森林分类模型
      • 参数:n_estimators=25
    • 随机森林分类模型加上概率校准分类模型
      • 随机森林参数:n_estimators=25
      • 概率校准模型参数:method=sigmoid,prefit=True
  • 第三部分,计算两种模型的log loss以及预测概率
import joblib
from sklearn.metrics import log_loss

import suanpan
from suanpan.app import app
from suanpan.app.arguments import Csv, File, Json, ListOfString, Npy, String


@app.input(Csv(key="inputData1", alias="inputData", required=True))
@app.input(
    File(
        key="inputModel2", alias="inputModel", name="model", type="model", required=True
    )
)
@app.column(ListOfString(key="param1", alias="featureColumns", required=True))
@app.column(String(key="param2", alias="labelColumn", required=True))
@app.output(Json(key="outputData1"))
@app.output(Npy(key="outputData2"))
def Demo(context):
    """Score a stored classifier on a dataset with log loss.

    Loads the model with joblib, predicts class probabilities for the
    configured feature columns and computes the log loss against the
    configured label column.

    Returns:
        A 2-tuple: ``{"score": <log loss>}`` and the predicted probability
        array.
    """
    args = context.args

    df = args.inputData
    # NOTE(review): joblib.load unpickles the file; only safe for model files
    # that come from a trusted upstream component.
    clf = joblib.load(args.inputModel)

    clf_probs = clf.predict_proba(df[args.featureColumns].values)

    # Pass the model's own class list so log_loss does not infer the label
    # set from the data; inference fails when a class is absent from the
    # evaluated rows. Falls back to the default (None) if the loaded object
    # exposes no `classes_`.
    score = log_loss(
        df[args.labelColumn].values,
        clf_probs,
        labels=getattr(clf, "classes_", None),
    )

    return {"score": score}, clf_probs


if __name__ == "__main__":
suanpan.run(app)

  • 第四部分,概率校准图解
    • Sigmoid校准图解
import os

import joblib
import numpy as np
from matplotlib import pyplot as plt

import suanpan
from suanpan.app import app
from suanpan.app.arguments import File, Folder

TMP_FOLDER = "/tmp/result"


@app.input(
    File(
        key="inputModel1",
        alias="calibratedModel",
        name="model",
        type="model",
        required=True,
    )
)
@app.output(Folder(key="outputData1"))
def Demo(context):
    """Draw how the loaded calibrator moves points of the probability simplex.

    A grid of three-class probability vectors is pushed through the
    per-class calibrators of the first calibrated classifier in the model;
    an arrow is drawn from every grid point to its renormalised calibrated
    position. The figure is written to ``TMP_FOLDER/calibration.png``.

    Returns:
        ``TMP_FOLDER``, the directory containing the saved image.
    """
    args = context.args

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(TMP_FOLDER, exist_ok=True)

    calibratedModel = joblib.load(args.calibratedModel)

    # Illustrate calibrator
    fig = plt.figure(figsize=(12.8, 9.6))
    try:
        colors = ["r", "g", "b"]
        # generate grid over 2-simplex
        p1d = np.linspace(0, 1, 20)
        p0, p1 = np.meshgrid(p1d, p1d)
        p2 = 1 - p0 - p1
        p = np.c_[p0.ravel(), p1.ravel(), p2.ravel()]
        # Keep only grid points inside the simplex (third coordinate >= 0).
        p = p[p[:, 2] >= 0]

        # NOTE(review): `calibrators_` is read from the first calibrated
        # classifier; the attribute name may differ between scikit-learn
        # releases - confirm against the version used to train the model.
        calibratedClassifier = calibratedModel.calibrated_classifiers_[0]
        prediction = np.vstack(
            [
                calibrator.predict(this_p)
                for calibrator, this_p in zip(calibratedClassifier.calibrators_, p.T)
            ]
        ).T
        # Renormalise each row so the calibrated values sum to one.
        prediction /= prediction.sum(axis=1)[:, None]

        # Plot modifications of calibrator
        for i in range(prediction.shape[0]):
            plt.arrow(
                p[i, 0],
                p[i, 1],
                prediction[i, 0] - p[i, 0],
                prediction[i, 1] - p[i, 1],
                head_width=1e-2,
                color=colors[np.argmax(p[i])],
            )
        # Plot boundaries of unit simplex
        plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], "k", label="Simplex")

        plt.grid(False)
        for x in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            plt.plot([0, x], [x, 0], "k", alpha=0.2)
            plt.plot([0, 0 + (1 - x) / 2], [x, x + (1 - x) / 2], "k", alpha=0.2)
            plt.plot([x, x + (1 - x) / 2], [0, 0 + (1 - x) / 2], "k", alpha=0.2)

        plt.title("Illustration of sigmoid calibrator")
        plt.xlabel("Probability class 1")
        plt.ylabel("Probability class 2")
        plt.xlim(-0.05, 1.05)
        plt.ylim(-0.05, 1.05)
        plt.savefig(os.path.join(TMP_FOLDER, "calibration.png"), format="png")
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated invocations do not accumulate figures in memory.
        plt.close(fig)

    return TMP_FOLDER


if __name__ == "__main__":
suanpan.run(app)

效果图如下:calibration.png

  • Sigmoid校准后预测概率的变化图
import os

from matplotlib import pyplot as plt

import suanpan
from suanpan.app import app
from suanpan.app.arguments import Csv, Folder, Npy, String

TMP_FOLDER = "/tmp/result"


@app.input(Npy(key="inputData1", alias="calibratedProbs", required=True))
@app.input(Npy(key="inputData2", alias="probs", required=True))
@app.input(Csv(key="inputData3", alias="testData", required=True))
@app.param(String(key="param1", alias="labelColumn", required=True))
@app.output(Folder(key="outputData1"))
def Demo(context):
    """Plot how calibration shifts each sample's predicted probabilities.

    For every row an arrow is drawn from the uncalibrated probabilities
    (first two columns of ``probs``) to the calibrated ones (first two
    columns of ``calibratedProbs``), coloured by the sample's label. The
    figure is written to ``TMP_FOLDER/calibration.png``.

    Returns:
        ``TMP_FOLDER``, the directory containing the saved image.
    """
    args = context.args

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(TMP_FOLDER, exist_ok=True)

    calibratedProbs = args.calibratedProbs
    probs = args.probs
    yTest = args.testData[args.labelColumn].values

    # Plot changes in predicted probabilities via arrows
    fig = plt.figure(figsize=(12.8, 9.6))
    try:
        colors = ["r", "g", "b"]
        for i in range(probs.shape[0]):
            plt.arrow(
                probs[i, 0],
                probs[i, 1],
                calibratedProbs[i, 0] - probs[i, 0],
                calibratedProbs[i, 1] - probs[i, 1],
                # Labels index the colour list directly (assumes labels are
                # 0, 1, 2 - TODO confirm); int() keeps this working when the
                # CSV column was loaded as floats.
                color=colors[int(yTest[i])],
                head_width=1e-2,
            )

        # Plot perfect predictions
        plt.plot([1.0], [0.0], "ro", ms=20, label="Class 1")
        plt.plot([0.0], [1.0], "go", ms=20, label="Class 2")
        plt.plot([0.0], [0.0], "bo", ms=20, label="Class 3")

        # Plot boundaries of unit simplex
        plt.plot([0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], "k", label="Simplex")

        # Annotate points on the simplex: (text, annotated point, text position)
        simplexAnnotations = [
            (
                r"($\frac{1}{3}$, $\frac{1}{3}$, $\frac{1}{3}$)",
                (1.0 / 3, 1.0 / 3),
                (1.0 / 3, 0.23),
            ),
            (r"($\frac{1}{2}$, $0$, $\frac{1}{2}$)", (0.5, 0.0), (0.5, 0.1)),
            (r"($0$, $\frac{1}{2}$, $\frac{1}{2}$)", (0.0, 0.5), (0.1, 0.5)),
            (r"($\frac{1}{2}$, $\frac{1}{2}$, $0$)", (0.5, 0.5), (0.6, 0.6)),
            (r"($0$, $0$, $1$)", (0, 0), (0.1, 0.1)),
            (r"($1$, $0$, $0$)", (1, 0), (1, 0.1)),
            (r"($0$, $1$, $0$)", (0, 1), (0.1, 1)),
        ]
        for text, point, textPosition in simplexAnnotations:
            plt.annotate(
                text,
                xy=point,
                xytext=textPosition,
                xycoords="data",
                # Fresh dict per call so no state is shared between annotations.
                arrowprops=dict(facecolor="black", shrink=0.05),
                horizontalalignment="center",
                verticalalignment="center",
            )
        # Mark the centre of the simplex.
        plt.plot([1.0 / 3], [1.0 / 3], "ko", ms=5)

        # Add grid
        plt.grid(False)
        for x in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            plt.plot([0, x], [x, 0], "k", alpha=0.2)
            plt.plot([0, 0 + (1 - x) / 2], [x, x + (1 - x) / 2], "k", alpha=0.2)
            plt.plot([x, x + (1 - x) / 2], [0, 0 + (1 - x) / 2], "k", alpha=0.2)

        plt.title("Change of predicted probabilities after sigmoid calibration")
        plt.xlabel("Probability class 1")
        plt.ylabel("Probability class 2")
        plt.xlim(-0.05, 1.05)
        plt.ylim(-0.05, 1.05)
        plt.legend(loc="best")
        plt.savefig(os.path.join(TMP_FOLDER, "calibration.png"), format="png")
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated invocations do not accumulate figures in memory.
        plt.close(fig)

    return TMP_FOLDER


if __name__ == "__main__":
suanpan.run(app)

效果图如下:calibration.png