跳到主要内容

梯度提升(Gradient Boosting)袋外(Out-of-Bag,OOB)估计模板

https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_oob.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-oob-py

下图为项目总览: image.png

项目分为四个部分:
第一部分,数据生成:生成用于训练和预测的数据

import numpy as np
import pandas as pd
from scipy.special import expit

import suanpan
from suanpan.app import app
from suanpan.app.arguments import Csv


@app.output(Csv(key="outputData1"))
def Demo(context):
    """Build a synthetic binary-classification data set as a single table.

    Draws 1000 samples from a ``RandomState`` seeded with 13: two uniform
    features, one integer feature in ``[0, 4)``, and a Bernoulli label whose
    success probability is ``expit(sin(3 * x1) - 4 * x2 + x3)``.

    Args:
        context: Invocation context passed in by the app framework; it is
            not read by this node.

    Returns:
        A ``pandas.DataFrame`` with the float32 columns ``feature_0`` ..
        ``feature_2`` followed by the ``label`` column.
    """
    # Generate data (adapted from G. Ridgeway's gbm example)
    n_samples = 1000
    random_state = np.random.RandomState(13)
    # The draw order below determines the generated values; keep it stable.
    x1 = random_state.uniform(size=n_samples)
    x2 = random_state.uniform(size=n_samples)
    x3 = random_state.randint(0, 4, size=n_samples)

    p = expit(np.sin(3 * x1) - 4 * x2 + x3)
    y = random_state.binomial(1, p, size=n_samples)

    X = np.c_[x1, x2, x3].astype(np.float32)

    feature_columns = ["feature_{}".format(i) for i in range(X.shape[1])]
    label_columns = ["label"]

    feature = pd.DataFrame(X, columns=feature_columns)
    label = pd.DataFrame(y, columns=label_columns)

    # ``join_axes`` was removed from ``pd.concat`` in pandas 1.0. Both frames
    # carry the same default RangeIndex, so a plain column-wise concat yields
    # exactly the rows that ``join_axes=[feature.index]`` used to select.
    return pd.concat([feature, label], axis=1)


if __name__ == "__main__":
    # Only hand the app to suanpan.run when executed as a script.
    suanpan.run(app)

第二部分,计算交叉验证分数

  • 梯度提升决策树分类模型
  • KFold模型
  • 计算分数的节点
import joblib
import numpy as np

import suanpan
from suanpan.app import app
from suanpan.app.arguments import Csv, File, Json, ListOfString, String

# Length of the per-stage score arrays allocated in this script.
# NOTE(review): presumably has to equal the number of boosting stages of the
# model loaded at run time — confirm against the upstream model node.
n_estimators = 1200


def heldout_score(clf, X_test, y_test):
    """Compute the held-out loss of ``clf`` after every boosting stage.

    Args:
        clf: Fitted estimator exposing ``staged_decision_function(X)`` and a
            ``loss_(y_true, raw_prediction)`` callable.
        X_test: Feature matrix of the held-out samples.
        y_test: Labels of the held-out samples.

    Returns:
        A 1-D ``float64`` array holding one loss value per stage yielded by
        ``clf.staged_decision_function(X_test)``.
    """
    # Size the result from the stages the model actually yields instead of a
    # hard-coded constant: a model with more stages used to raise IndexError,
    # and one with fewer stages left misleading trailing zeros in the scores.
    # NOTE(review): ``loss_`` was deprecated in scikit-learn 1.1 and removed
    # in 1.3 — confirm the pinned scikit-learn version still provides it.
    stage_losses = [
        clf.loss_(y_test, y_pred) for y_pred in clf.staged_decision_function(X_test)
    ]
    return np.array(stage_losses, dtype=np.float64)


def cv_estimate(kfold, classifier, X_train, y_train):
    """Average the per-stage held-out loss of ``classifier`` over all folds.

    For every train/test split produced by ``kfold`` the classifier is
    re-fitted in place on the training part and scored on the held-out part
    with ``heldout_score``.

    Args:
        kfold: Splitter exposing ``split(X, y)`` and ``n_splits``.
        classifier: Estimator to fit and score; it is mutated by ``fit``.
        X_train: Feature array, indexable by the index arrays from ``kfold``.
        y_train: Label array, indexable the same way.

    Returns:
        A ``float64`` array of length ``n_estimators`` with the summed fold
        scores divided by ``kfold.n_splits``.
    """
    summed_scores = np.zeros((n_estimators,), dtype=np.float64)
    for fit_idx, holdout_idx in kfold.split(X_train, y_train):
        classifier.fit(X_train[fit_idx], y_train[fit_idx])
        fold_scores = heldout_score(
            classifier, X_train[holdout_idx], y_train[holdout_idx]
        )
        summed_scores += fold_scores
    return summed_scores / kfold.n_splits


@app.input(
    File(
        key="inputModel1", alias="gbtcModel", name="model", type="model", required=True
    )
)
@app.input(
    File(
        key="inputModel2", alias="kfoldModel", name="model", type="model", required=True
    )
)
@app.input(Csv(key="inputData3", alias="trainData", required=True))
@app.param(ListOfString(key="param1", alias="featureColumns", required=True))
@app.param(String(key="param2", alias="labelColumn", required=True))
@app.output(Json(key="outputData1"))
def Demo(context):
    """Compute the cross-validated per-stage loss on the training data.

    Reads the classifier and the fold splitter from the two model inputs,
    selects the feature and label columns from ``trainData`` and delegates
    to ``cv_estimate``.

    Args:
        context: Invocation context; its ``args`` expose ``gbtcModel``,
            ``kfoldModel``, ``trainData``, ``featureColumns`` and
            ``labelColumn``.

    Returns:
        A dict with the single key ``"cv_score"`` holding the averaged scores.
    """
    params = context.args

    # NOTE: joblib.load unpickles its input — only feed it trusted model files.
    gbt_model = joblib.load(params.gbtcModel)
    splitter = joblib.load(params.kfoldModel)

    train_frame = params.trainData
    features = train_frame[params.featureColumns].values
    labels = train_frame[params.labelColumn].values

    cv_score = cv_estimate(splitter, gbt_model, features, labels)
    return {"cv_score": cv_score}


if __name__ == "__main__":
    # Only hand the app to suanpan.run when executed as a script.
    suanpan.run(app)

第三部分,计算测试分数

  • 梯度提升决策树分类模型
  • 计算分数节点
import joblib
import numpy as np

import suanpan
from suanpan.app import app
from suanpan.app.arguments import Csv, File, Json, ListOfString, String

# Expected number of boosting stages.
# NOTE(review): presumably mirrors the n_estimators the upstream model was
# trained with — confirm against the model node.
n_estimators = 1200


def heldout_score(classifier, X_test, y_test):
    """Compute the held-out loss of ``classifier`` after every boosting stage.

    Args:
        classifier: Fitted estimator exposing ``staged_decision_function(X)``
            and a ``loss_(y_true, raw_prediction)`` callable.
        X_test: Feature matrix of the held-out samples.
        y_test: Labels of the held-out samples.

    Returns:
        A 1-D ``float64`` array holding one loss value per stage yielded by
        ``classifier.staged_decision_function(X_test)``.
    """
    # Size the result from the stages the model actually yields instead of a
    # hard-coded constant: a model with more stages used to raise IndexError,
    # and one with fewer stages left misleading trailing zeros in the scores.
    # NOTE(review): ``loss_`` was deprecated in scikit-learn 1.1 and removed
    # in 1.3 — confirm the pinned scikit-learn version still provides it.
    stage_losses = [
        classifier.loss_(y_test, y_pred)
        for y_pred in classifier.staged_decision_function(X_test)
    ]
    return np.array(stage_losses, dtype=np.float64)


@app.input(
    File(
        key="inputModel1", alias="inputModel", name="model", type="model", required=True
    )
)
@app.input(Csv(key="inputData2", alias="testData", required=True))
@app.param(ListOfString(key="param1", alias="featureColumns", required=True))
@app.param(String(key="param2", alias="labelColumn", required=True))
@app.output(Json(key="outputData1"))
def Demo(context):
    """Compute the per-stage loss of the loaded model on the test data.

    Args:
        context: Invocation context; its ``args`` expose ``inputModel``,
            ``testData``, ``featureColumns`` and ``labelColumn``.

    Returns:
        A dict with the single key ``"test_score"`` holding the result of
        ``heldout_score`` for the selected feature and label columns.
    """
    params = context.args

    # NOTE: joblib.load unpickles its input — only feed it trusted model files.
    model = joblib.load(params.inputModel)

    test_frame = params.testData
    features = test_frame[params.featureColumns].values
    labels = test_frame[params.labelColumn].values

    test_score = heldout_score(model, features, labels)
    return {"test_score": test_score}


if __name__ == "__main__":
    # Only hand the app to suanpan.run when executed as a script.
    suanpan.run(app)

第四部分,结果展示

import os

import joblib
import numpy as np
from matplotlib import pyplot as plt

import suanpan
from suanpan.app import app
from suanpan.app.arguments import File, Folder, Json

# Expected number of boosting stages.
# NOTE(review): presumably mirrors the n_estimators the upstream model was
# trained with — confirm against the model node.
n_estimators = 1200
# Directory the result figure is written to; also returned as the folder output.
TMP_FOLDER = "/tmp/result"


@app.input(
    File(
        key="inputModel1", alias="inputModel", name="model", type="model", required=True
    )
)
@app.input(Json(key="inputData2", alias="testScore", required=True))
@app.input(Json(key="inputData3", alias="cvScore", required=True))
@app.output(Folder(key="outputData1"))
def Demo(context):
    """Plot the OOB, test and CV loss curves and save them as ``oob.png``.

    The OOB curve is the negative cumulative sum of the model's
    ``oob_improvement_``; the test and CV curves come from the two JSON
    inputs and are shifted so that their first value is 0. A vertical line
    and an extra x tick mark the iteration with the minimum of each curve.

    Args:
        context: Invocation context; its ``args`` expose ``inputModel``,
            ``testScore`` (dict with key ``"test_score"``) and ``cvScore``
            (dict with key ``"cv_score"``).

    Returns:
        ``TMP_FOLDER``, the directory containing the written ``oob.png``.
    """
    args = context.args

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(TMP_FOLDER, exist_ok=True)

    # NOTE: joblib.load unpickles its input — only feed it trusted model files.
    classifier = joblib.load(args.inputModel)

    cv_score = np.array(args.cvScore["cv_score"])
    test_score = np.array(args.testScore["test_score"])

    # negative cumulative sum of oob improvements
    cumsum = -np.cumsum(classifier.oob_improvement_)

    # normalize such that the first loss is 0
    test_score = test_score - test_score[0]
    cv_score = cv_score - cv_score[0]

    # Iterations are 1-based. Each x axis is derived from its own curve so
    # the plot does not depend on a hard-coded stage count matching the model.
    oob_x = np.arange(1, len(cumsum) + 1)
    test_x = np.arange(1, len(test_score) + 1)
    cv_x = np.arange(1, len(cv_score) + 1)

    # iteration with minimal loss according to OOB / test / CV
    oob_best_iter = oob_x[np.argmin(cumsum)]
    test_best_iter = test_x[np.argmin(test_score)]
    cv_best_iter = cv_x[np.argmin(cv_score)]

    # color brew for the three curves
    oob_color = [channel / 256.0 for channel in (190, 174, 212)]
    test_color = [channel / 256.0 for channel in (127, 201, 127)]
    cv_color = [channel / 256.0 for channel in (253, 192, 134)]

    # Draw on a dedicated figure and always close it, so repeated invocations
    # neither accumulate curves on the implicit global figure nor leak figures.
    fig = plt.figure()
    try:
        # plot curves and vertical lines for best iterations
        plt.plot(oob_x, cumsum, label="OOB loss", color=oob_color)
        plt.plot(test_x, test_score, label="Test loss", color=test_color)
        plt.plot(cv_x, cv_score, label="CV loss", color=cv_color)
        plt.axvline(x=oob_best_iter, color=oob_color)
        plt.axvline(x=test_best_iter, color=test_color)
        plt.axvline(x=cv_best_iter, color=cv_color)

        # add the three vertical lines to the existing xticks, sorted by position
        tick_locs = plt.xticks()[0]
        xticks_pos = np.array(
            tick_locs.tolist() + [oob_best_iter, cv_best_iter, test_best_iter]
        )
        xticks_label = np.array([int(t) for t in tick_locs] + ["OOB", "CV", "Test"])
        order = np.argsort(xticks_pos)
        plt.xticks(xticks_pos[order], xticks_label[order])

        plt.legend(loc="upper right")
        plt.ylabel("normalized loss")
        plt.xlabel("number of iterations")
        plt.savefig(os.path.join(TMP_FOLDER, "oob.png"), format="png")
    finally:
        plt.close(fig)

    return TMP_FOLDER


if __name__ == "__main__":
    # Only hand the app to suanpan.run when executed as a script.
    suanpan.run(app)

效果图如下:oob.png