Source code for dtaianomaly.data.DataSet

from typing import List, Optional

import numpy as np

from dtaianomaly.anomaly_detection.BaseDetector import BaseDetector, Supervision
from dtaianomaly.utils import (
    get_dimension,
    is_univariate,
    is_valid_array_like,
    is_valid_list,
)


class DataSet:
    """
    A class for time series anomaly detection data sets. These consist
    of the raw data for training and testing anomaly detectors, as well
    as the respective ground truth labels.

    Parameters
    ----------
    X_test: array-like of shape (n_samples_test, n_attributes)
        The test time series data.
    y_test: array-like of shape (n_samples_test)
        The ground truth anomaly labels of the test data.
    X_train: array-like of shape (n_samples_train, n_attributes), default=None
        The train time series. If not given, then the test data will be
        used for training and the data is only compatible with unsupervised
        anomaly detectors.
    y_train: array-like of shape (n_samples_train), default=None
        The ground truth anomaly labels of the training data. If not given,
        either the train data should not be given either, or the train data
        is assumed to consist of only normal data.
    feature_names: list of str, default=None
        The name of each feature in the data. The number of names must be
        identical to the number of actual features. If None, then the data
        is assumed to be unnamed.
    time_steps_test: array-like of shape (n_samples_test), default=None
        The time steps corresponding to the test data. If ``None``, then no
        time steps are known.
    time_steps_train: array-like of shape (n_samples_train), default=None
        The time steps corresponding to the train data. If ``None``, then no
        time steps are known. Can only be provided if there is actually some
        training data given (``X_train is not None``).

    Raises
    ------
    ValueError
        If the data and labels do not pass :py:meth:`check_is_valid`, if
        ``feature_names`` is not a valid list of strings or its length differs
        from the dimension of ``X_test``, or if the given time steps are not
        valid array-likes, do not match the number of observations of the
        corresponding data, or are given for non-existent training data.
    """

    X_test: np.ndarray
    y_test: np.ndarray
    X_train: Optional[np.ndarray]
    y_train: Optional[np.ndarray]
    feature_names: Optional[List[str]]
    time_steps_test: Optional[np.ndarray]
    time_steps_train: Optional[np.ndarray]

    def __init__(
        self,
        X_test: np.ndarray,
        y_test: np.ndarray,
        X_train: Optional[np.ndarray] = None,
        y_train: Optional[np.ndarray] = None,
        feature_names: Optional[List[str]] = None,
        time_steps_test: Optional[np.ndarray] = None,
        time_steps_train: Optional[np.ndarray] = None,
    ):
        # Check actual data (raises a ValueError if something is inconsistent)
        self.check_is_valid(X_test, y_test, X_train, y_train)

        # Check feature names
        if feature_names is not None:
            if not is_valid_list(feature_names, str):
                raise ValueError("The given feature_names are not a valid list!")
            if len(feature_names) != get_dimension(X_test):
                raise ValueError(
                    "The number of features do not correspond to the given number of feature names!"
                )

        # NOTE(review): the checks below read ``.shape``; presumably the inputs
        # are numpy arrays at this point -- confirm against is_valid_array_like.

        # Check time steps for the test data
        if time_steps_test is not None:
            if not is_valid_array_like(time_steps_test):
                raise ValueError("The given time_steps_test is not a valid array-like!")
            if time_steps_test.shape[0] != X_test.shape[0]:
                raise ValueError(
                    "The number of test time steps do not correspond to the actual number of observations in the test data!"
                )

        # Check time steps for the train data
        if time_steps_train is not None:
            # Time steps without the data they describe are meaningless
            if X_train is None:
                raise ValueError(
                    "There have been time steps given for the training data, but no training data!"
                )
            if not is_valid_array_like(time_steps_train):
                raise ValueError(
                    "The given time_steps_train is not a valid array-like!"
                )
            if time_steps_train.shape[0] != X_train.shape[0]:
                raise ValueError(
                    "The number of train time steps do not correspond to the actual number of observations in the train data!"
                )

        # Everything is consistent: store the given values as-is
        self.X_test = X_test
        self.y_test = y_test
        self.X_train = X_train
        self.y_train = y_train
        self.feature_names = feature_names
        self.time_steps_test = time_steps_test
        self.time_steps_train = time_steps_train

    @staticmethod
    def check_is_valid(
        X_test: np.ndarray,
        y_test: np.ndarray,
        X_train: Optional[np.ndarray],
        y_train: Optional[np.ndarray],
    ) -> None:
        """
        Checks if the given elements refer to a valid ``DataSet``. If the
        elements would not give a valid ``DataSet``, then a ``ValueError``
        is raised.

        Parameters
        ----------
        X_test: array-like of shape (n_samples_test, n_attributes)
            The test time series data.
        y_test: array-like of shape (n_samples_test)
            The ground truth anomaly labels of the test data.
        X_train: array-like of shape (n_samples_train, n_attributes) or ``None``
            The train time series data. Note that, even though ``X_train``
            can be ``None``, it must be provided.
        y_train: array-like of shape (n_samples_train) or ``None``
            The ground truth anomaly labels of the train data. Note that,
            even though ``y_train`` can be ``None``, it must be provided.

        Raises
        ------
        ValueError
            If the given variables would not lead to a valid ``DataSet``.
            This is the case if:

            - ``X_test`` or ``y_test`` are not valid array-like.
            - ``y_test`` is not univariate or has a value different
              from 0 or 1.
            - ``X_test`` and ``y_test`` consist of a different number
              of samples.
            - ``X_train`` is not ``None``, but it is not a valid array-like.
            - ``X_train`` is not ``None`` and consists of a different
              number of attributes than ``X_test``.
            - ``y_train`` is not ``None`` but ``X_train`` is ``None``.
            - ``y_train`` is not ``None`` but it is not a valid array-like.
            - ``y_train`` is not ``None``, but it is not univariate or has
              a value different from 0 or 1.
            - ``y_train`` is not ``None`` but consists of a different
              number of samples than ``X_train``.
        """
        # Check test data
        if not is_valid_array_like(X_test):
            raise ValueError("The test data must be a valid array like!")

        # Check test labels
        if not is_valid_array_like(y_test):
            raise ValueError("The test labels must be a valid array like!")
        if not is_univariate(y_test):
            raise ValueError(
                "There can only be one label for each observation in the test data!"
            )
        if not np.all(np.isin(y_test, [0, 1])):
            raise ValueError("The test labels must be binary!")
        if y_test.shape[0] != X_test.shape[0]:
            raise ValueError(
                "The test data and labels must consist of the same number of observations!"
            )

        # Check the train data
        if X_train is not None:
            if not is_valid_array_like(X_train):
                raise ValueError("The train data must be a valid array like!")
            if get_dimension(X_test) != get_dimension(X_train):
                raise ValueError(
                    "The test and train data must consist of the same number of features!"
                )

        # Check the train labels
        if y_train is not None:
            # Labels without data cannot be used for anything
            if X_train is None:
                raise ValueError(
                    "There can not be any train labels if there is no train data!"
                )
            if not is_valid_array_like(y_train):
                raise ValueError("The train labels must be a valid array like!")
            if not is_univariate(y_train):
                raise ValueError(
                    "There can only be one label for each observation in the train data!"
                )
            if not np.all(np.isin(y_train, [0, 1])):
                # The message refers to the *train* labels, which are the
                # ones being validated in this branch.
                raise ValueError("The train labels must be binary!")
            if X_train.shape[0] != y_train.shape[0]:
                raise ValueError(
                    "The train data and labels must consist of the same number of observations!"
                )

    def is_valid(self) -> bool:
        """
        Checks whether this ``DataSet`` is valid or not.

        Returns
        -------
        is_valid: bool
            True if and only if this instance is valid, i.e., if the
            attributes ``X_test``, ``y_test``, ``X_train`` and ``y_train``
            of this instance pass all the checks of
            :py:meth:`~dtaianomaly.data.DataSet.check_is_valid`.
        """
        try:
            self.check_is_valid(
                X_test=self.X_test,
                y_test=self.y_test,
                X_train=self.X_train,
                y_train=self.y_train,
            )
            return True
        except ValueError:
            # check_is_valid signals every inconsistency with a ValueError
            return False

    def compatible_supervision(self) -> List[Supervision]:
        """
        Get the compatible supervision types for this data set.

        Returns
        -------
        compatible_types: list of Supervision
            A list containing the compatible types for this dataset. The
            following supervision types can be compatible:

            - ``Supervision.UNSUPERVISED``: Always compatible.
            - ``Supervision.SEMI_SUPERVISED``: Compatible if and only if
              there is some training data given (which is assumed to be
              normal).
            - ``Supervision.SUPERVISED``: Only compatible if both training
              data and training labels are provided.
        """
        # If there is no train data given at all, then only unsupervised
        # detectors are compatible
        if self.X_train is None and self.y_train is None:
            return [Supervision.UNSUPERVISED]

        # If train data is given but no train labels, then either
        # unsupervised or semi-supervised detectors are compatible
        elif self.X_train is not None and self.y_train is None:
            return [Supervision.UNSUPERVISED, Supervision.SEMI_SUPERVISED]

        # If the train data and train labels are given, then all detectors
        # are compatible.
        else:
            return [
                Supervision.UNSUPERVISED,
                Supervision.SEMI_SUPERVISED,
                Supervision.SUPERVISED,
            ]

    def is_compatible(self, detector: BaseDetector) -> bool:
        """
        Checks if the given anomaly detector is compatible with this
        ``DataSet``.

        Parameters
        ----------
        detector: BaseDetector
            The anomaly detector to check if it is compatible with this
            ``DataSet``.

        Returns
        -------
        is_compatible: bool
            True if and only if the given anomaly detector is compatible
            with this ``DataSet``, i.e., if its ``supervision`` is one of
            the types returned by :py:meth:`compatible_supervision`:

            - This ``DataSet`` does not contain any training data or
              training labels: only unsupervised anomaly detectors are
              compatible.
            - This ``DataSet`` contains training data but no training
              labels: unsupervised and semi-supervised anomaly detectors
              are compatible.
            - This ``DataSet`` contains training data and labels:
              supervised, unsupervised and semi-supervised anomaly
              detectors are compatible.
        """
        return detector.supervision in self.compatible_supervision()