异常点检测

异常点检测

项目id：5784
本项目参考 scikit-learn 官方示例：https://scikit-learn.org/stable/auto_examples/applications/plot_outlier_detection_housing.html#sphx-glr-auto-examples-applications-plot-outlier-detection-housing-py

下图为项目总览：

项目分为三个部分：
第一部分：数据生成。生成用于预测的数据，这里使用的是波士顿房价数据集，并对其进行了预处理。

数据预处理节点代码

import suanpan
from suanpan.app import app
from suanpan.app.arguments import Csv, Npy
import pandas as pd

def transformDF(df, startIndex, stopIndex):
    """Build a two-column DataFrame from two positional columns of *df*.

    Despite the parameter names, this is not a range slice: exactly the
    columns at positions ``startIndex`` and ``stopIndex`` are picked, in
    that order. The result's columns are renamed ``feature_0``,
    ``feature_1`` and its index is a fresh default index.
    """
    picked = df.values[:, [startIndex, stopIndex]]
    # Name the columns by their position in the selection, not in the source.
    names = [f"feature_{position}" for position in range(picked.shape[1])]
    return pd.DataFrame(picked, columns=names)


@app.input(Csv(key="inputData1"))
@app.output(Csv(key="outputData1"))
@app.output(Csv(key="outputData2"))
def Demo(context):
    """Split the input table into two two-feature data sets.

    Reads ``inputData1`` from the node arguments and returns a pair of
    frames: the first built from column positions 8 and 10, the second
    from positions 5 and 12.
    """
    frame = context.args.inputData1

    # NOTE(review): presumably these positions follow the Boston housing
    # column order used by the referenced example; confirm against the CSV.
    first_pair = transformDF(frame, 8, 10)
    second_pair = transformDF(frame, 5, 12)
    return first_pair, second_pair


# Only hand the app to suanpan.run when executed as a script, not on import.
if __name__ == "__main__":
    suanpan.run(app)

第二部分：模型训练与预测。这里针对两组数据，分别使用了三种模型（Empirical Covariance、Robust Covariance (Minimum Covariance Determinant)、OCSVM）。
第三部分：分析模型的结果并绘图。

结果分析
- 代码

import os

import joblib
import matplotlib.font_manager
import matplotlib.pyplot as plt
import numpy as np

import suanpan
from suanpan.app import app
from suanpan.app.arguments import Csv, File, Folder, Model, String

# Directory the result figure is saved into; Demo returns this path as its
# Folder output.
TMP_FOLDER = "/tmp/result"


@app.input(File(key="inputModel1", name="model", type="model"))
@app.input(File(key="inputModel2", name="model", type="model"))
@app.input(File(key="inputModel3", name="model", type="model"))
@app.input(Csv(key="inputData4", required=True))
@app.output(Folder(key="outputData1"))
def Demo(context):
    """Plot the zero-level decision frontier of three fitted detectors.

    Loads the three model files with ``joblib``, evaluates each model's
    ``decision_function`` on a fixed 500x500 grid, draws the level-0
    contour of each over a scatter of the first two columns of
    ``inputData4``, and saves the figure as ``demo_result.png`` inside
    ``TMP_FOLDER``.

    Returns:
        The ``TMP_FOLDER`` path, used as the ``outputData1`` folder output.
    """
    # Local import: proxy legend handles replace ContourSet.collections,
    # which newer Matplotlib releases deprecated and then removed.
    from matplotlib.lines import Line2D

    args = context.args

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(TMP_FOLDER, exist_ok=True)

    X = args.inputData4.values
    # NOTE(review): joblib.load unpickles arbitrary objects; the model files
    # must come from a trusted upstream node.
    classifiers = {
        "Empirical Covariance": joblib.load(args.inputModel1),
        "Robust Covariance (Minimum Covariance Determinant)": joblib.load(
            args.inputModel2
        ),
        "OCSVM": joblib.load(args.inputModel3),
    }
    colors = ["m", "g", "b"]

    # Evaluation grid for the frontiers; the point matrix is the same for
    # every classifier, so build it once.
    xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
    grid_points = np.c_[xx1.ravel(), yy1.ravel()]

    fig = plt.figure(1)
    try:
        # Learn a frontier for outlier detection with several classifiers
        legend_handles = []
        for color, clf in zip(colors, classifiers.values()):
            scores = clf.decision_function(grid_points).reshape(xx1.shape)
            plt.contour(xx1, yy1, scores, levels=[0], linewidths=2, colors=color)
            # Proxy artist with the contour's color and width for the legend.
            legend_handles.append(Line2D([], [], color=color, linewidth=2))

        # Plot the results (= shape of the data points cloud)
        plt.title("Outlier detection on a real data set (boston housing)")
        plt.scatter(X[:, 0], X[:, 1], color="black")
        bbox_args = dict(boxstyle="round", fc="0.8")
        arrow_args = dict(arrowstyle="->")
        plt.annotate(
            "several confounded points",
            xy=(24, 19),
            xycoords="data",
            textcoords="data",
            xytext=(13, 10),
            bbox=bbox_args,
            arrowprops=arrow_args,
        )
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.legend(
            legend_handles,
            list(classifiers),
            loc="upper center",
            prop=matplotlib.font_manager.FontProperties(size=12),
        )
        # NOTE(review): X[:, 0] is drawn on the x axis and X[:, 1] on the y
        # axis; these two labels look swapped relative to that if the input
        # columns are (radial highways, pupil-teacher ratio) - confirm
        # against the upstream data before changing them.
        plt.ylabel("accessibility to radial highways")
        plt.xlabel("pupil-teacher ratio by town")
        plt.savefig(os.path.join(TMP_FOLDER, "demo_result.png"), format="png")
    finally:
        # Figure 1 is reused by number; close it so a later invocation starts
        # from an empty figure instead of drawing over this one.
        plt.close(fig)

    return TMP_FOLDER


# Only hand the app to suanpan.run when executed as a script, not on import.
if __name__ == "__main__":
    suanpan.run(app)

效果图