from pyod.models.cblof import CBLOF
from dtaianomaly.anomaly_detection.BaseDetector import Supervision
from dtaianomaly.anomaly_detection.PyODAnomalyDetector import PyODAnomalyDetector
# [docs]  (Sphinx "view source" link artifact; not part of the program)
class ClusterBasedLocalOutlierFactor(PyODAnomalyDetector):
    """
    Anomaly detector based on the Cluster-based Local Outlier Factor (CBLOF) :cite:`he2003discovering`.

    CBLOF is a cluster-based LOF which uses the distance to
    clusters in the data to compute an outlier score. Specifically, CBLOF first
    clusters the data using some clustering algorithm (K-means in this implementation). Next,
    the clusters are separated in the so-called 'large clusters' :math:`LC` and 'small
    clusters' :math:`SC`, depending on the parameters :math:`\\alpha` and :math:`\\beta`.
    Then, the Cluster-based Local Outlier Factor of an observation :math:`o` belonging
    to cluster :math:`C_i` is computed as follows:

    .. math::

       \\begin{equation}
           CBLOF(o) = \\lvert C_i \\rvert \\cdot
           \\begin{cases}
               dist(o, C_i), & \\text{if $C_i \\in LC$}. \\\\
               min_{C_j \\in LC} (dist(o, C_j)), & \\text{if $C_i \\in SC$}.
           \\end{cases}
       \\end{equation}

    Specifically, if :math:`o` is part of a large cluster :math:`C_i`, we multiply the size
    of :math:`C_i` with the distance of :math:`o` to :math:`C_i`. If :math:`o` is in a small
    cluster, then the size of :math:`C_i` is multiplied by the distance to the nearest
    *large* cluster :math:`C_j`.

    Parameters
    ----------
    window_size: int or str
        The window size to use for extracting sliding windows from the time series. This
        value will be passed to :py:meth:`~dtaianomaly.anomaly_detection.compute_window_size`.
    stride: int, default=1
        The stride, i.e., the step size for extracting sliding windows from the time series.
    n_clusters: int, default=8
        The number of clusters to form and the number of centroids to generate.
        Must be at least 2.
    alpha: float in [0.5, 1.0], default=0.9
        The ratio for deciding small and large clusters. :math:`\\alpha` equals the ratio of number
        of samples in large clusters to the number of samples in small clusters.
    beta: float, default=5.0
        The ratio for deciding small and large clusters. :math:`\\beta` equals a cutoff for
        the small and large clusters, such that for clusters ordered by size, we have that
        :math:`\\lvert C_k \\rvert / \\lvert C_{k+1} \\rvert = \\beta`. Must be at least 1.
    **kwargs
        Arguments to be passed to the PyOD CBLOF.

    Attributes
    ----------
    window_size_: int
        The effectively used window size for this anomaly detector
    pyod_detector_ : CBLOF
        A CBLOF detector of PyOD

    Examples
    --------
    >>> from dtaianomaly.anomaly_detection import ClusterBasedLocalOutlierFactor
    >>> from dtaianomaly.data import demonstration_time_series
    >>> x, y = demonstration_time_series()
    >>> cblof = ClusterBasedLocalOutlierFactor(10).fit(x)
    >>> cblof.decision_function(x)  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    array([0.50321076, 0.5753145 , 0.61938076, ..., 0.29794485, 0.30720306, 0.29857479]...)

    Notes
    -----
    CBLOF inherits from :py:class:`~dtaianomaly.anomaly_detection.PyODAnomalyDetector`.
    """

    n_clusters: int
    alpha: float
    beta: float

    def __init__(
        self,
        window_size: int | str,
        stride: int = 1,
        n_clusters: int = 8,
        alpha: float = 0.9,
        beta: float = 5.0,
        **kwargs,
    ):
        """
        Validate the CBLOF hyperparameters and initialize the detector.

        Raises
        ------
        TypeError
            If ``n_clusters`` is not an integer, or if ``alpha`` or ``beta``
            is not numeric. Booleans are rejected in all three cases.
        ValueError
            If ``n_clusters`` is smaller than 2, ``alpha`` is not in
            ``[0.5, 1]``, or ``beta`` is smaller than 1. NaN is rejected
            for both ``alpha`` and ``beta``.
        """
        # ``bool`` is a subclass of ``int``, so it must be excluded explicitly.
        if not isinstance(n_clusters, int) or isinstance(n_clusters, bool):
            raise TypeError("`n_clusters` should be integer")
        if n_clusters <= 1:
            raise ValueError("`n_clusters` should be at least 2")

        if not isinstance(alpha, (float, int)) or isinstance(alpha, bool):
            raise TypeError("`alpha` should be numeric")
        # Positive range check: every comparison with NaN is False, so testing
        # ``alpha < 0.5 or alpha > 1`` would silently accept NaN.
        if not (0.5 <= alpha <= 1):
            raise ValueError("`alpha` must be in [0.5, 1]")

        if not isinstance(beta, (float, int)) or isinstance(beta, bool):
            raise TypeError("`beta` should be numeric")
        # Same NaN-safe formulation as for ``alpha``.
        if not (beta >= 1.0):
            raise ValueError("`beta` must be at least 1")

        # The hyperparameters are stored *before* delegating to the parent
        # constructor, exactly as in the original ordering.
        # NOTE(review): presumably the parent constructor invokes
        # ``_initialize_detector``, which reads these attributes - confirm
        # before reordering.
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.beta = beta

        super().__init__(window_size, stride, **kwargs)

    def _initialize_detector(self, **kwargs) -> CBLOF:
        """Build the PyOD ``CBLOF`` from the stored hyperparameters and ``kwargs``."""
        return CBLOF(
            n_clusters=self.n_clusters, alpha=self.alpha, beta=self.beta, **kwargs
        )

    def _supervision(self) -> Supervision:
        """Return the supervision type of this detector, which is unsupervised."""
        return Supervision.UNSUPERVISED