from typing import List, Optional
import numpy as np
from dtaianomaly.anomaly_detection.BaseDetector import BaseDetector, Supervision
from dtaianomaly.utils import get_dimension, is_univariate, is_valid_array_like
# "[docs]" -- Sphinx view-source link artifact, not part of the module source.
class DataSet:
    """
    A class for time series anomaly detection data sets. These
    consist of the raw data for training and testing anomaly
    detectors, as well as the respective ground truth labels.

    Parameters
    ----------
    X_test: array-like of shape (n_samples_test, n_attributes)
        The test time series data.
    y_test: array-like of shape (n_samples_test)
        The ground truth anomaly labels of the test data.
    X_train: array-like of shape (n_samples_train, n_attributes), optional
        The train time series. If not given, then the test data will
        be used for training and the data is only compatible with
        unsupervised anomaly detectors.
    y_train: array-like of shape (n_samples_train), optional
        The ground truth anomaly labels of the training data. If not given,
        either the train data should not be given either, or the train
        data is assumed to consist of only normal data.

    Raises
    ------
    ValueError
        If the given elements do not pass the checks of ``check_is_valid``.
    """

    X_test: np.ndarray
    y_test: np.ndarray
    # The train data and labels are optional, hence ``None`` is a valid value.
    X_train: Optional[np.ndarray] = None
    y_train: Optional[np.ndarray] = None

    def __init__(
        self,
        X_test: np.ndarray,
        y_test: np.ndarray,
        X_train: Optional[np.ndarray] = None,
        y_train: Optional[np.ndarray] = None,
    ):
        # Validate before storing anything, such that an invalid combination
        # of elements never results in a (partially) initialized instance.
        self.check_is_valid(X_test, y_test, X_train, y_train)
        self.X_test = X_test
        self.y_test = y_test
        self.X_train = X_train
        self.y_train = y_train

    @staticmethod
    def check_is_valid(
        X_test: np.ndarray,
        y_test: np.ndarray,
        X_train: Optional[np.ndarray],
        y_train: Optional[np.ndarray],
    ) -> None:
        """
        Checks if the given elements refer to a valid ``DataSet``. If the elements
        would not give a valid ``DataSet``, then a ``ValueError`` is raised.

        Parameters
        ----------
        X_test: array-like of shape (n_samples_test, n_attributes)
            The test time series data.
        y_test: array-like of shape (n_samples_test)
            The ground truth anomaly labels of the test data.
        X_train: array-like of shape (n_samples_train, n_attributes) or ``None``
            The train time series data. Note that, even though ``X_train`` can
            be ``None``, it must be provided.
        y_train: array-like of shape (n_samples_train) or ``None``
            The ground truth anomaly labels of the train data. Note that, even
            though ``y_train`` can be ``None``, it must be provided.

        Raises
        ------
        ValueError
            If the given variables would not lead to a valid ``DataSet``. This is the
            case if:

            - ``X_test`` or ``y_test`` is not a valid array-like.
            - ``y_test`` is not univariate or has a value different from 0 or 1.
            - ``X_test`` and ``y_test`` consist of a different number of samples.
            - ``X_train`` is not ``None``, but it is not a valid array-like.
            - ``X_train`` is not ``None`` and consists of a different number of
              attributes than ``X_test``.
            - ``y_train`` is not ``None`` but ``X_train`` is ``None``.
            - ``y_train`` is not ``None`` but it is not a valid array-like.
            - ``y_train`` is not ``None``, but it is not univariate or has a
              value different from 0 or 1.
            - ``y_train`` is not ``None`` but consists of a different number of
              samples than ``X_train``.
        """
        # Check test data
        if not is_valid_array_like(X_test):
            raise ValueError("The test data must be a valid array like!")

        # Check test labels
        if not is_valid_array_like(y_test):
            raise ValueError("The test labels must be a valid array like!")
        if not is_univariate(y_test):
            raise ValueError(
                "There can only be one label for each observation in the test data!"
            )
        if not np.all(np.isin(y_test, [0, 1])):
            raise ValueError("The test labels must be binary!")
        # ``np.asarray`` is a no-op for numpy arrays, but ensures that array-likes
        # without a ``shape`` attribute (e.g., plain lists, if those pass the
        # array-like check above) do not raise an ``AttributeError`` here.
        if np.asarray(y_test).shape[0] != np.asarray(X_test).shape[0]:
            raise ValueError(
                "The test data and labels must consist of the same number of observations!"
            )

        # Check the train data
        if X_train is not None:
            if not is_valid_array_like(X_train):
                raise ValueError("The train data must be a valid array like!")
            if get_dimension(X_test) != get_dimension(X_train):
                raise ValueError(
                    "The test and train data must consist of the same number of features!"
                )

        # Check the train labels
        if y_train is not None:
            if X_train is None:
                raise ValueError(
                    "There can not be any train labels if there is no train data!"
                )
            if not is_valid_array_like(y_train):
                raise ValueError("The train labels must be a valid array like!")
            if not is_univariate(y_train):
                raise ValueError(
                    "There can only be one label for each observation in the train data!"
                )
            if not np.all(np.isin(y_train, [0, 1])):
                # This message concerns the train labels (not the test labels).
                raise ValueError("The train labels must be binary!")
            if np.asarray(X_train).shape[0] != np.asarray(y_train).shape[0]:
                raise ValueError(
                    "The train data and labels must consist of the same number of observations!"
                )

    def is_valid(self) -> bool:
        """
        Checks whether this ``DataSet`` is valid or not.

        Returns
        -------
        is_valid: bool
            True if and only if this instance is valid, i.e., if the attributes
            ``X_test``, ``y_test``, ``X_train`` and ``y_train`` of this instance
            pass all the checks of :py:meth:`~dtaianomaly.data.DataSet.check_is_valid`.
        """
        # The attributes are public and may have been modified after
        # construction, so they are validated again at call time.
        try:
            self.check_is_valid(
                X_test=self.X_test,
                y_test=self.y_test,
                X_train=self.X_train,
                y_train=self.y_train,
            )
        except ValueError:
            return False
        return True

    def compatible_supervision(self) -> List[Supervision]:
        """
        Get the compatible supervision types for this data set.

        Returns
        -------
        compatible_types: list of Supervision
            A list containing the compatible types for this dataset. The following
            supervision types can be compatible:

            - ``Supervision.UNSUPERVISED``: Always compatible.
            - ``Supervision.SEMI_SUPERVISED``: Compatible if and only if there
              is some training data given (which is assumed to be normal).
            - ``Supervision.SUPERVISED``: Only compatible if both training data
              and training labels are provided.
        """
        # Without train data, only unsupervised detectors are compatible. This
        # also holds if (invalidly) train labels are set without train data,
        # because labels are meaningless without the corresponding data.
        if self.X_train is None:
            return [Supervision.UNSUPERVISED]

        # If train data is given but no train labels, then either unsupervised
        # or semi-supervised detectors are compatible.
        if self.y_train is None:
            return [Supervision.UNSUPERVISED, Supervision.SEMI_SUPERVISED]

        # If the train data and train labels are given, then all detectors are compatible.
        return [
            Supervision.UNSUPERVISED,
            Supervision.SEMI_SUPERVISED,
            Supervision.SUPERVISED,
        ]

    def is_compatible(self, detector: BaseDetector) -> bool:
        """
        Checks if the given anomaly detector is compatible with this ``DataSet``.

        Parameters
        ----------
        detector: BaseDetector
            The anomaly detector to check if it is compatible with this ``DataSet``.

        Returns
        -------
        is_compatible: bool
            True if and only if the given anomaly detector is compatible with
            this ``DataSet``, i.e., if its ``supervision`` attribute is one of
            the types returned by ``compatible_supervision``:

            - If this ``DataSet`` does not contain any training data, then
              only unsupervised anomaly detectors are compatible.
            - If this ``DataSet`` contains training data but no training labels,
              then unsupervised and semi-supervised anomaly detectors are compatible.
            - If this ``DataSet`` contains training data and labels, then supervised,
              unsupervised and semi-supervised anomaly detectors are compatible.
        """
        return detector.supervision in self.compatible_supervision()