跳到主要内容

异常点检测

异常点检测

项目id:5784
参考示例: https://scikit-learn.org/stable/auto_examples/applications/plot_outlier_detection_housing.html#sphx-glr-auto-examples-applications-plot-outlier-detection-housing-py
下图为项目总览: image.png

项目分为三个部分:
第一部分,数据生成:生成用于预测的数据。这里使用的是波士顿房价数据集,并对其做了预处理(从原始数据中抽取出两组数据,每组包含两列特征)

  • 数据预处理节点代码
import suanpan
from suanpan.app import app
from suanpan.app.arguments import Csv, Npy
import pandas as pd

def transformDF(df, startIndex, stopIndex):
    """Build a two-column feature frame from two columns of ``df``.

    Despite the parameter names, no range is sliced: exactly the two columns
    at positions ``startIndex`` and ``stopIndex`` are picked, in that order.

    Args:
        df: Source ``pandas.DataFrame``.
        startIndex: Positional index of the first column to keep.
        stopIndex: Positional index of the second column to keep.

    Returns:
        A new ``pandas.DataFrame`` holding the selected values, with columns
        named ``feature_0`` and ``feature_1``.
    """
    # Fancy indexing with a list selects the two individual columns.
    selected = df.values[:, [startIndex, stopIndex]]
    columns = ["feature_{}".format(index) for index in range(selected.shape[1])]
    return pd.DataFrame(selected, columns=columns)


@app.input(Csv(key="inputData1"))
@app.output(Csv(key="outputData1"))
@app.output(Csv(key="outputData2"))
def Demo(context):
    """Split the input table into two two-column feature sets.

    Reads the ``inputData1`` table from ``context.args`` and returns a pair
    of frames produced by ``transformDF``: the first built from columns 8 and
    10, the second from columns 5 and 12.

    NOTE(review): the two returned frames are presumably bound to
    ``outputData1`` and ``outputData2`` in that order -- confirm against the
    suanpan decorator semantics.
    """
    df = context.args.inputData1
    return transformDF(df, 8, 10), transformDF(df, 5, 12)


if __name__ == "__main__":
    # Start the suanpan app only when this file is executed as a script.
    suanpan.run(app)

第二部分,模型训练与预测:针对上述两组数据,分别使用了三种模型(Empirical Covariance、Robust Covariance (Minimum Covariance Determinant)、OCSVM)
第三部分,结果分析:对模型的结果进行分析并绘图

  • 结果分析
    • 代码
import os

import joblib
import matplotlib.font_manager
import matplotlib.pyplot as plt
import numpy as np

import suanpan
from suanpan.app import app
from suanpan.app.arguments import Csv, File, Folder, Model, String

# Directory the result figure is saved into; Demo returns this path as its output.
TMP_FOLDER = "/tmp/result"


@app.input(File(key="inputModel1", name="model", type="model"))
@app.input(File(key="inputModel2", name="model", type="model"))
@app.input(File(key="inputModel3", name="model", type="model"))
@app.input(Csv(key="inputData4", required=True))
@app.output(Folder(key="outputData1"))
def Demo(context):
    """Plot the decision frontiers of three fitted outlier detectors.

    Loads the three models given by ``inputModel1``..``inputModel3`` with
    ``joblib``, evaluates each model's ``decision_function`` on a fixed
    500x500 grid, draws its level-0 contour, overlays the first two columns
    of ``inputData4`` as a scatter plot and saves the figure as
    ``demo_result.png`` inside ``TMP_FOLDER``.

    Args:
        context: suanpan call context; ``context.args`` carries the inputs.

    Returns:
        ``TMP_FOLDER``, the directory containing the saved figure.
    """
    args = context.args

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(TMP_FOLDER, exist_ok=True)

    X = args.inputData4.values
    # NOTE(review): joblib.load unpickles arbitrary objects; the model files
    # are assumed to come from trusted upstream nodes -- confirm.
    classifiers = {
        "Empirical Covariance": joblib.load(args.inputModel1),
        "Robust Covariance (Minimum Covariance Determinant)": joblib.load(
            args.inputModel2
        ),
        "OCSVM": joblib.load(args.inputModel3),
    }
    colors = ["m", "g", "b"]

    # Evaluation grid for the frontier; built once and shared by all models.
    xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
    grid_points = np.c_[xx1.ravel(), yy1.ravel()]

    fig = plt.figure(1)
    try:
        # Learn a frontier for outlier detection with several classifiers
        legend_handles = []
        for color, clf in zip(colors, classifiers.values()):
            Z1 = clf.decision_function(grid_points).reshape(xx1.shape)
            contour = plt.contour(
                xx1, yy1, Z1, levels=[0], linewidths=2, colors=color
            )
            # ContourSet.collections is deprecated since Matplotlib 3.8;
            # legend_elements() yields one legend artist per contour level.
            legend_handles.append(contour.legend_elements()[0][0])

        # Plot the results (= shape of the data points cloud)
        plt.title("Outlier detection on a real data set (boston housing)")
        plt.scatter(X[:, 0], X[:, 1], color="black")
        bbox_args = dict(boxstyle="round", fc="0.8")
        arrow_args = dict(arrowstyle="->")
        plt.annotate(
            "several confounded points",
            xy=(24, 19),
            xycoords="data",
            textcoords="data",
            xytext=(13, 10),
            bbox=bbox_args,
            arrowprops=arrow_args,
        )
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.legend(
            legend_handles,
            list(classifiers),
            loc="upper center",
            prop=matplotlib.font_manager.FontProperties(size=12),
        )
        # NOTE(review): X[:, 0] is drawn on the x axis. If the input is the
        # Boston columns (8, 10), column 0 is the highway-accessibility
        # index, so these two labels look swapped -- confirm before changing.
        plt.ylabel("accessibility to radial highways")
        plt.xlabel("pupil-teacher ratio by town")
        plt.savefig(os.path.join(TMP_FOLDER, "demo_result.png"), format="png")
    finally:
        # Figure 1 is global pyplot state; close it so a later call starts
        # from an empty figure instead of drawing over this one.
        plt.close(fig)

    return TMP_FOLDER


if __name__ == "__main__":
    # Start the suanpan app only when this file is executed as a script.
    suanpan.run(app)

  • 效果图

demo_result.png