Informally, an outlier (also called an anomalous value) is an observation that lies far away from the other observations. This definition is deliberately loose, because it does not quantify how far is "far". This article looks at outlier detection and handling techniques, and at how outliers affect different kinds of machine learning models.
Many machine learning models, such as linear and logistic regression, are easily affected by outliers in the training data. Models like AdaBoost increase the weights of misclassified points at every iteration, so they may assign very high weights to outliers, which tend to be misclassified. If the outliers are some kind of error, or if we want the model to generalize well rather than chase extreme values, it is worth detecting and handling them before training.
A common approach to outlier detection is to assume that the normal data come from a known distribution (for example, that the data are Gaussian). The quickest and simplest way to spot outliers, however, is to visualize them. If your dataset is not too large (roughly up to 10k observations and 100 features), it is strongly recommended to build scatter plots and box plots of the variables. Even if there turn out to be no outliers, you will still gain other insights, such as correlations, variability, or the impact of external factors such as world wars or recessions on economic variables. For high-dimensional data that is hard to visualize, this approach is not recommended.
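For instance, a minimal sketch of such a visual check (the synthetic DataFrame and the column names here are purely illustrative, not from the original text):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# illustrative data: two roughly Gaussian features with a few injected extreme points
rng = np.random.default_rng(0)
df = pd.DataFrame({'x': rng.normal(0, 1, 500), 'y': rng.normal(0, 1, 500)})
df.loc[:4, ['x', 'y']] = 8.0

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.scatter(df['x'], df['y'], s=10)   # outliers sit far from the main cloud
ax1.set_xlabel('x')
ax1.set_ylabel('y')
ax2.boxplot(df['x'])                  # outliers appear beyond the whiskers
ax2.set_ylabel('x')
plt.show()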
Under a normal distribution, the 3-sigma rule treats any value more than three standard deviations away from the mean as an outlier.
import numpy as np

def three_sigma(s):
    """Return the 3-sigma lower/upper bounds of a 1-D array or Series."""
    mu, std = np.mean(s), np.std(s)
    lower, upper = mu - 3 * std, mu + 3 * std
    return lower, upper
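For example, the bounds can be used to flag values in a pandas Series (a minimal sketch; the Series below is synthetic and only for illustration):

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
s = pd.Series(rng.normal(loc=0, scale=1, size=1000))
s.iloc[0] = 10.0                           # inject one extreme value

lower, upper = three_sigma(s)
outliers = s[(s < lower) | (s > upper)]    # values outside mean ± 3*std
print(outliers)                            # includes the injected value (and any natural points beyond 3 sigma)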
The box plot finds outliers based on the interquartile range (IQR).
def boxplot(s):
    """Return the IQR-based lower/upper bounds of a pandas Series."""
    q1, q3 = s.quantile(.25), s.quantile(.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return lower, upper
We first determine the quartiles Q1 and Q3. $$\text{IQR} = Q3 - Q1$$ $$\text{Upper bound} = Q3 + 1.5 \times \text{IQR}$$ $$\text{Lower bound} = Q1 - 1.5 \times \text{IQR}$$ Any value below the lower bound or above the upper bound is considered an outlier.
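As with the 3-sigma rule, the IQR bounds can be used to flag or drop extreme values (a minimal sketch on a synthetic Series, for illustration only):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
s = pd.Series(rng.normal(size=1000))
s.iloc[:3] = [8.0, -9.0, 12.0]             # inject a few extreme values

lower, upper = boxplot(s)
print(s[(s < lower) | (s > upper)])        # flagged outliers
s_clean = s[(s >= lower) & (s <= upper)]   # or drop them before modelling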
The k-nearest-neighbours approach computes, for each sample, the average distance to its K nearest neighbours and compares that distance with a threshold; if the distance exceeds the threshold, the point is treated as an outlier. Its advantage is that it makes no assumption about the data distribution; its drawback is that it only finds global outliers, not local ones. (Here we use the pyod library.)
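Before turning to pyod, here is a minimal sketch of that idea using scikit-learn's NearestNeighbors; the synthetic data and the choice to flag the top 5% of scores are assumptions made for illustration:

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, size=(200, 2)),        # dense inlier cluster
               rng.uniform(-6, 6, size=(10, 2))])      # a few scattered points

k = 5
nn = NearestNeighbors(n_neighbors=k + 1).fit(X)        # +1 because each point is its own nearest neighbour
dist, _ = nn.kneighbors(X)
scores = dist[:, 1:].mean(axis=1)                      # average distance to the k nearest neighbours

threshold = np.quantile(scores, 0.95)                  # flag the top 5% of scores as outliers
outlier_mask = scores > threshold
print(outlier_mask.sum(), 'points flagged')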
Besides the KNN algorithm, the pyod library also implements a number of other detectors:
| # | Type | Abbr | Algorithm | Year |
|---|------|------|-----------|------|
| 0 | Probabilistic | ECOD | Unsupervised Outlier Detection Using Empirical Cumulative Distribution Functions | 2022 |
| 1 | Probabilistic | ABOD | Angle-Based Outlier Detection | 2008 |
| 2 | Probabilistic | FastABOD | Fast Angle-Based Outlier Detection using approximation | 2008 |
| 3 | Probabilistic | COPOD | COPOD: Copula-Based Outlier Detection | 2020 |
| 4 | Probabilistic | MAD | Median Absolute Deviation (MAD) | 1993 |
| 5 | Probabilistic | SOS | Stochastic Outlier Selection | 2012 |
| 6 | Probabilistic | KDE | Outlier Detection with Kernel Density Functions | 2007 |
| 7 | Probabilistic | Sampling | Rapid distance-based outlier detection via sampling | 2013 |
| 8 | Probabilistic | GMM | Probabilistic Mixture Modeling for Outlier Analysis | |
| 9 | Linear Model | PCA | Principal Component Analysis (the sum of weighted projected distances to the eigenvector hyperplanes) | 2003 |
| 10 | Linear Model | MCD | Minimum Covariance Determinant (use the mahalanobis distances as the outlier scores) | 1999 |
| 11 | Linear Model | CD | Use Cook's distance for outlier detection | 1977 |
| 12 | Linear Model | OCSVM | One-Class Support Vector Machines | 2001 |
| 13 | Linear Model | LMDD | Deviation-based Outlier Detection (LMDD) | 1996 |
| 14 | Proximity-Based | LOF | Local Outlier Factor | 2000 |
| 15 | Proximity-Based | COF | Connectivity-Based Outlier Factor | 2002 |
| 16 | Proximity-Based | (Incremental) COF | Memory Efficient Connectivity-Based Outlier Factor (slower but reduce storage complexity) | 2002 |
| 17 | Proximity-Based | CBLOF | Clustering-Based Local Outlier Factor | 2003 |
| 18 | Proximity-Based | LOCI | LOCI: Fast outlier detection using the local correlation integral | 2003 |
| 19 | Proximity-Based | HBOS | Histogram-based Outlier Score | 2012 |
| 20 | Proximity-Based | kNN | k Nearest Neighbors (use the distance to the kth nearest neighbor as the outlier score) | 2000 |
| 21 | Proximity-Based | AvgKNN | Average kNN (use the average distance to k nearest neighbors as the outlier score) | 2002 |
| 22 | Proximity-Based | MedKNN | Median kNN (use the median distance to k nearest neighbors as the outlier score) | 2002 |
| 23 | Proximity-Based | SOD | Subspace Outlier Detection | 2009 |
| 24 | Proximity-Based | ROD | Rotation-based Outlier Detection | 2020 |
| 25 | Outlier Ensembles | IForest | Isolation Forest | 2008 |
| 26 | Outlier Ensembles | INNE | Isolation-based Anomaly Detection Using Nearest-Neighbor Ensembles | 2018 |
| 27 | Outlier Ensembles | FB | Feature Bagging | 2005 |
| 28 | Outlier Ensembles | LSCP | LSCP: Locally Selective Combination of Parallel Outlier Ensembles | 2019 |
| 29 | Outlier Ensembles | XGBOD | Extreme Boosting Based Outlier Detection (Supervised) | 2018 |
| 30 | Outlier Ensembles | LODA | Lightweight On-line Detector of Anomalies | 2016 |
| 31 | Outlier Ensembles | SUOD | SUOD: Accelerating Large-scale Unsupervised Heterogeneous Outlier Detection (Acceleration) | 2021 |
| 32 | Neural Networks | AutoEncoder | Fully connected AutoEncoder (use reconstruction error as the outlier score) | |
| 33 | Neural Networks | VAE | Variational AutoEncoder (use reconstruction error as the outlier score) | 2013 |
| 34 | Neural Networks | Beta-VAE | Variational AutoEncoder (all customized loss term by varying gamma and capacity) | 2018 |
| 35 | Neural Networks | SO_GAAL | Single-Objective Generative Adversarial Active Learning | 2019 |
| 36 | Neural Networks | MO_GAAL | Multiple-Objective Generative Adversarial Active Learning | 2019 |
| 37 | Neural Networks | DeepSVDD | Deep One-Class Classification | 2018 |
| 38 | Neural Networks | AnoGAN | Anomaly Detection with Generative Adversarial Networks | 2017 |
| 39 | Graph-based | R-Graph | Outlier detection by R-graph | 2017 |
| 40 | Graph-based | LUNAR | LUNAR: Unifying Local Outlier Detection Methods via Graph Neural Networks | 2022 |
Here is the library's quick-start code (using the KNN model as an example):
from pyod.models.knn import KNN                # kNN detector
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize

# train a kNN detector (X_train, X_test, y_train, y_test are assumed to be prepared already)
clf_name = 'KNN'
clf = KNN()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_                     # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_          # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)              # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# it is possible to get the prediction confidence as well
# outlier labels (0 or 1) and confidence in the range of [0, 1]
y_test_pred, y_test_pred_confidence = clf.predict(X_test, return_confidence=True)

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# visualize the results
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
          y_test_pred, show_figure=True, save_figure=False)
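Every detector in the table shares this fit / labels_ / decision_scores_ / decision_function interface, so swapping algorithms is essentially a one-line change. A minimal sketch with Isolation Forest (the choice of IForest and the contamination value are illustrative, not prescribed by the library):

from pyod.models.iforest import IForest

iforest = IForest(contamination=0.1)                 # same constructor pattern as KNN()
iforest.fit(X_train)
iforest_scores = iforest.decision_function(X_test)   # raw outlier scores on the test data
iforest_labels = iforest.predict(X_test)             # binary labels (0: inlier, 1: outlier)

The fuller example below uses pyod's generate_data utility to build a synthetic two-dimensional dataset and walks through training, evaluating, and visualizing a KNN detector end to end.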
import numpy as np
import matplotlib.pyplot as plt
from pyod.models.knn import KNN
from pyod.utils.data import generate_data

outlier_fraction = 0.1   # proportion of outliers
n_train = 200            # number of training samples
n_test = 100             # number of test samples

# generate random 2-D data with the given contamination rate
X_train, X_test, y_train, y_test = generate_data(n_train=n_train, n_test=n_test, contamination=outlier_fraction)

# plot the data
feature_1_train = X_train[:, 0].reshape(-1, 1)
feature_2_train = X_train[:, 1].reshape(-1, 1)
feature_1_test = X_test[:, 0].reshape(-1, 1)
feature_2_test = X_test[:, 1].reshape(-1, 1)

# scatter plot of the training and test sets
plt.scatter(feature_1_train, feature_2_train)
plt.scatter(feature_1_test, feature_2_test)
plt.xlabel('feature_1')
plt.ylabel('feature_2')
plt.show()
knn = KNN(contamination=outlier_fraction)
knn.fit(X_train)
# prediction labels and outlier scores of the training data
y_train_pred = knn.labels_
y_train_scores = knn.decision_scores_
# prediction on the test data
y_test_pred = knn.predict(X_test)
y_test_scores = knn.decision_function(X_test)
# errors in test set
n_errors = (y_test_pred != y_test).sum()
print('No of Errors in test set: {}'.format(n_errors))
# accuracy in test set
print('Accuracy in test set: {}'.format((n_test-n_errors)/n_test))
No of Errors in test set: 0
Accuracy in test set: 1.0
from pyod.utils.example import visualize
visualize('KNN', X_train, y_train, X_test, y_test, y_train_pred, y_test_pred, show_figure=True, save_figure=False)