# Source code for dtaianomaly.anomaly_detection._ClusterBasedLocalOutlierFactor

from pyod.models.cblof import CBLOF

from dtaianomaly.anomaly_detection._BaseDetector import Supervision
from dtaianomaly.anomaly_detection._BasePyODAnomalyDetector import (
    BasePyODAnomalyDetector,
)
from dtaianomaly.type_validation import FloatAttribute, IntegerAttribute
from dtaianomaly.windowing import WINDOW_SIZE_TYPE

# Public API of this module: only the detector class is exported by
# ``from ... import *``.
__all__ = ["ClusterBasedLocalOutlierFactor"]


class ClusterBasedLocalOutlierFactor(BasePyODAnomalyDetector):
    """
    Anomaly detector based on the Cluster-based Local Outlier Factor
    (CBLOF) :cite:`he2003discovering`.

    CBLOF is a cluster-based LOF which uses the distance to clusters in the
    data to compute an outlier score. Specifically, CBLOF first clusters the
    data using some clustering algorithm (K-means in this implementation).
    Next, the clusters are separated in the so-called 'large clusters'
    :math:`LC` and 'small clusters' :math:`SC`, depending on the parameters
    :math:`\\alpha` and :math:`\\beta`. Then, the Cluster-based Local Outlier
    Factor of an observation :math:`o` belonging to cluster :math:`C_i` is
    computed as follows:

    .. math::

       \\begin{equation}
           CBLOF(o) = \\lvert C_i \\rvert \\cdot
           \\begin{cases}
               dist(o, C_i), & \\text{if $C_i \\in LC$}. \\\\
               min_{C_j \\in LC} (dist(o, C_j)), & \\text{if $C_i \\in SC$}.
           \\end{cases}
       \\end{equation}

    Specifically, if :math:`o` is part of a large cluster :math:`C_i`, we
    multiply the size of :math:`C_i` with the distance of :math:`o` to
    :math:`C_i`. If :math:`o` is in a small cluster, then the size of
    :math:`C_i` is multiplied by the distance to the nearest *large* cluster
    :math:`C_j`.

    Parameters
    ----------
    window_size : int or str
        The window size to use for extracting sliding windows from the time
        series. This value will be passed to
        :py:meth:`~dtaianomaly.anomaly_detection.compute_window_size`.
    stride : int, default=1
        The stride, i.e., the step size for extracting sliding windows from
        the time series.
    n_clusters : int, default=8
        The number of clusters to form and the number of centroids to
        generate. Must be at least 2.
    alpha : float in [0.5, 1.0], default=0.9
        The ratio for deciding small and large clusters. :math:`\\alpha`
        equals the ratio of number of samples in large clusters to the
        number of samples in small clusters.
    beta : float, default=5.0
        The ratio for deciding small and large clusters. :math:`\\beta`
        equals a cutoff for the small and large clusters, such that for
        clusters ordered by size, we have that
        :math:`\\lvert C_k \\rvert / \\lvert C_{k+1} \\rvert = \\beta`.
        Must be at least 1.0.
    **kwargs
        Arguments to be passed to the PyOD CBLOF.

    Attributes
    ----------
    window_size_ : int
        The effectively used window size for this anomaly detector
    pyod_detector_ : CBLOF
        A CBLOF detector of PyOD

    Notes
    -----
    The formula above is the weighted score in which the distance is
    multiplied by the cluster size. PyOD's ``CBLOF`` has a ``use_weights``
    option that controls whether the cluster size is used as a weight;
    consult the PyOD documentation for its default and pass it through
    ``**kwargs`` if the weighted variant is required.

    Examples
    --------
    >>> from dtaianomaly.anomaly_detection import ClusterBasedLocalOutlierFactor
    >>> from dtaianomaly.data import demonstration_time_series
    >>> x, y = demonstration_time_series()
    >>> cblof = ClusterBasedLocalOutlierFactor(10).fit(x)
    >>> cblof.decision_function(x)  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    array([0.50321076, 0.5753145 , 0.61938076, ..., 0.29794485, 0.30720306,
           0.29857479]...)
    """

    n_clusters: int
    alpha: float
    beta: float

    # Constraints on the hyperparameters declared above, keyed by attribute
    # name: n_clusters >= 2, 0.5 <= alpha <= 1.0 and beta >= 1.0.
    attribute_validation = {
        "n_clusters": IntegerAttribute(minimum=2),
        "alpha": FloatAttribute(minimum=0.5, maximum=1.0),
        "beta": FloatAttribute(minimum=1.0),
    }

    def __init__(
        self,
        window_size: WINDOW_SIZE_TYPE,
        stride: int = 1,
        n_clusters: int = 8,
        alpha: float = 0.9,
        beta: float = 5.0,
        **kwargs,
    ):
        """
        Store the CBLOF hyperparameters and initialize the base detector.

        See the class docstring for a description of each parameter.
        """
        # The hyperparameters are assigned before delegating to the base
        # constructor.
        # NOTE(review): presumably the base constructor builds the PyOD
        # detector through ``_initialize_detector``, which reads these
        # attributes -- keep this ordering unless that is confirmed otherwise.
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.beta = beta
        super().__init__(window_size, stride, **kwargs)

    def _initialize_detector(self, **kwargs) -> CBLOF:
        """
        Build the PyOD ``CBLOF`` detector for this instance.

        Parameters
        ----------
        **kwargs
            Additional keyword arguments forwarded unchanged to ``CBLOF``.

        Returns
        -------
        CBLOF
            A PyOD CBLOF detector configured with this instance's
            ``n_clusters``, ``alpha`` and ``beta``.
        """
        return CBLOF(
            n_clusters=self.n_clusters,
            alpha=self.alpha,
            beta=self.beta,
            **kwargs,
        )

    def _supervision(self) -> Supervision:
        """Return the supervision type of this detector: unsupervised."""
        return Supervision.UNSUPERVISED