def three_sigma(s):
    """Return the three-sigma outlier bounds of ``s``.

    Args:
        s: Numeric values accepted by ``np.mean`` and ``np.std``.

    Returns:
        A ``(lower, upper)`` tuple where ``lower`` is ``mean - 3 * std``
        and ``upper`` is ``mean + 3 * std``. ``np.std`` is called with its
        default arguments.
    """
    mu, std = np.mean(s), np.std(s)
    lower, upper = mu - 3 * std, mu + 3 * std
    return lower, upper
def boxplot(s):
    """Return the box-plot (1.5 * IQR) outlier bounds of ``s``.

    Args:
        s: Object exposing a ``quantile(q)`` method; presumably a pandas
            Series (confirm against callers).

    Returns:
        A ``(lower, upper)`` tuple where ``lower`` is ``Q1 - 1.5 * IQR``
        and ``upper`` is ``Q3 + 1.5 * IQR``, with ``IQR = Q3 - Q1``.
    """
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return lower, upper
我们首先计算第一四分位数 Q1 和第三四分位数 Q3，然后计算四分位距： $$\text{IQR} = Q_3 - Q_1$$ $$\text{上限} = Q_3 + 1.5 \times \text{IQR}$$ $$\text{下限} = Q_1 - 1.5 \times \text{IQR}$$ 任何低于下限或高于上限的值都被认为是异常值。
| Type | Abbr | Algorithm | Year |
0 | Probabilistic | ECOD | Unsupervised Outlier Detection Using Empirical Cumulative Distribution Functions | 2022 |
1 | Probabilistic | ABOD | Angle-Based Outlier Detection | 2008 |
2 | Probabilistic | FastABOD | Fast Angle-Based Outlier Detection using approximation | 2008 |
3 | Probabilistic | COPOD | COPOD: Copula-Based Outlier Detection | 2020 |
4 | Probabilistic | MAD | Median Absolute Deviation (MAD) | 1993 |
5 | Probabilistic | SOS | Stochastic Outlier Selection | 2012 |
6 | Probabilistic | KDE | Outlier Detection with Kernel Density Functions | 2007 |
7 | Probabilistic | Sampling | Rapid distance-based outlier detection via sampling | 2013 |
8 | Probabilistic | GMM | Probabilistic Mixture Modeling for Outlier Analysis | nan |
9 | Linear Model | PCA | Principal Component Analysis (the sum of weighted projected distances to the eigenvector hyperplanes) | 2003 |
10 | Linear Model | MCD | Minimum Covariance Determinant (use the mahalanobis distances as the outlier scores) | 1999 |
11 | Linear Model | CD | Use Cook's distance for outlier detection | 1977 |
12 | Linear Model | OCSVM | One-Class Support Vector Machines | 2001 |
13 | Linear Model | LMDD | Deviation-based Outlier Detection (LMDD) | 1996 |
14 | Proximity-Based | LOF | Local Outlier Factor | 2000 |
15 | Proximity-Based | COF | Connectivity-Based Outlier Factor | 2002 |
16 | Proximity-Based | (Incremental) COF | Memory Efficient Connectivity-Based Outlier Factor (slower but reduce storage complexity) | 2002 |
17 | Proximity-Based | CBLOF | Clustering-Based Local Outlier Factor | 2003 |
18 | Proximity-Based | LOCI | LOCI: Fast outlier detection using the local correlation integral | 2003 |
19 | Proximity-Based | HBOS | Histogram-based Outlier Score | 2012 |
20 | Proximity-Based | kNN | k Nearest Neighbors (use the distance to the kth nearest neighbor as the outlier score) | 2000 |
21 | Proximity-Based | AvgKNN | Average kNN (use the average distance to k nearest neighbors as the outlier score) | 2002 |
22 | Proximity-Based | MedKNN | Median kNN (use the median distance to k nearest neighbors as the outlier score) | 2002 |
23 | Proximity-Based | SOD | Subspace Outlier Detection | 2009 |
24 | Proximity-Based | ROD | Rotation-based Outlier Detection | 2020 |
25 | Outlier Ensembles | IForest | Isolation Forest | 2008 |
26 | Outlier Ensembles | INNE | Isolation-based Anomaly Detection Using Nearest-Neighbor Ensembles | 2018 |
27 | Outlier Ensembles | FB | Feature Bagging | 2005 |
28 | Outlier Ensembles | LSCP | LSCP: Locally Selective Combination of Parallel Outlier Ensembles | 2019 |
29 | Outlier Ensembles | XGBOD | Extreme Boosting Based Outlier Detection (Supervised) | 2018 |
30 | Outlier Ensembles | LODA | Lightweight On-line Detector of Anomalies | 2016 |
31 | Outlier Ensembles | SUOD | SUOD: Accelerating Large-scale Unsupervised Heterogeneous Outlier Detection (Acceleration) | 2021 |
32 | Neural Networks | AutoEncoder | Fully connected AutoEncoder (use reconstruction error as the outlier score) | nan |
33 | Neural Networks | VAE | Variational AutoEncoder (use reconstruction error as the outlier score) | 2013 |
34 | Neural Networks | Beta-VAE | Variational AutoEncoder (all customized loss term by varying gamma and capacity) | 2018 |
35 | Neural Networks | SO_GAAL | Single-Objective Generative Adversarial Active Learning | 2019 |
36 | Neural Networks | MO_GAAL | Multiple-Objective Generative Adversarial Active Learning | 2019 |
37 | Neural Networks | DeepSVDD | Deep One-Class Classification | 2018 |
38 | Neural Networks | AnoGAN | Anomaly Detection with Generative Adversarial Networks | 2017 |
39 | Graph-based | R-Graph | Outlier detection by R-graph | 2017 |
40 | Graph-based | LUNAR | LUNAR: Unifying Local Outlier Detection Methods via Graph Neural Networks | 2022 |
from pyod.models.knn import KNN  # kNN detector

# train kNN detector
# NOTE: X_train / X_test are not created in this snippet; they must already be defined when it runs.
clf_name = 'KNN'
clf = KNN()
# The detector has to be fitted before labels_ / decision_scores_ are read
# and before predict / decision_function are called.
clf.fit(X_train)

# get the prediction label and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# it is possible to get the prediction confidence as well
# outlier labels (0 or 1) and confidence in the range of [0,1]
y_test_pred, y_test_pred_confidence = clf.predict(X_test, return_confidence=True)
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize  # visualize() is called below and was never imported

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

# plot the ground truth next to the predicted labels for both data sets
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
          y_test_pred, show_figure=True, save_figure=False)
import numpy as np
import matplotlib.pyplot as plt
from pyod.models.knn import KNN
from pyod.utils.data import generate_data
# Configuration of the synthetic data set.
outlier_fraction = 0.1  # proportion of outliers
n_train = 200  # number of training samples
n_test = 100  # number of test samples

# Generate random data.
X_train, X_test, y_train, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    contamination=outlier_fraction,
)

# Keep each of the first two feature columns as its own (n, 1) array.
feature_1_train, feature_2_train = X_train[:, 0][:, None], X_train[:, 1][:, None]
feature_1_test, feature_2_test = X_test[:, 0][:, None], X_test[:, 1][:, None]
Text(0, 0.5, 'feature_2')
# build and fit the detector: `knn` is used below but is not created anywhere
# in the visible code, so it is constructed and trained here
knn = KNN(contamination=outlier_fraction)
knn.fit(X_train)

# prediction labels and outlier scores of the training data
y_train_pred = knn.labels_
y_train_scores = knn.decision_scores_

# prediction on the test data
y_test_pred = knn.predict(X_test)
y_test_scores = knn.decision_function(X_test)

# errors in test set: count of predictions that differ from the true labels
n_errors = (y_test_pred != y_test).sum()
print('No of Errors in test set: {}'.format(n_errors))

# accuracy in test set: share of test samples predicted correctly
print('Accuracy in test set: {}'.format((n_test - n_errors) / n_test))
No of Errors in test set: 0
Accuracy in test set: 1.0
from pyod.utils import example

# The first argument of visualize() is the detector *name* (it is formatted
# into the figure title), so pass the string rather than the fitted object.
example.visualize('KNN', X_train, y_train, X_test, y_test, y_train_pred,
                  y_test_pred, show_figure=True, save_figure=False)