Source code for dtaianomaly.data._UCRLoader

import numpy as np

from dtaianomaly.data._DataSet import DataSet
from dtaianomaly.data._PathDataLoader import PathDataLoader

__all__ = ["UCRLoader"]


class UCRLoader(PathDataLoader):
    """
    Lazy dataloader for the UCR suite of anomaly detection data sets
    :cite:`wu2023current`. The UCR time series anomaly archive consists of
    250 time series, which have been published to mitigate common issues
    in existing time series anomaly detection benchmarks:

    (1) Triviality: many benchmarks are easily solved without any fancy
        algorithms;
    (2) Unrealistic anomaly density: the number of ground truth anomalies
        is relatively high, even though anomalies should be rare
        observations;
    (3) Mislabeling: the ground truth labels might not be perfectly
        aligned with the actual anomalies in the data;
    (4) Run-to-failure bias: most anomalies are located near the end of
        the time series.

    Parameters
    ----------
    path : str
        The path at which the data set is located.
    do_caching : bool, default=False
        Whether to cache the loaded data or not.

    Notes
    -----
    This implementation expects the file names to contain the start and
    stop time stamps of the single anomaly in the time series as:
    ``*_<train-test-split>_<start>_<stop>.txt``.

    Examples
    --------
    >>> from dtaianomaly.data import UCRLoader
    >>> path_to_ucr = "001_UCR_Anomaly_DISTORTED1sddb40_35000_52000_52620.txt"
    >>> ucr_data_set = UCRLoader(path_to_ucr).load()  # doctest: +SKIP
    """

    def _load(self) -> DataSet:
        """
        Load the time series and derive the split and labels from the file name.

        The last three ``_``-separated fields of the path (after removing a
        trailing ``.txt``) are read as the train/test split index, the start
        index of the anomaly and the end index of the anomaly.

        Returns
        -------
        DataSet
            A data set in which ``X_train`` holds the observations before the
            split index, ``X_test`` holds the observations from the split
            index onwards, and ``y_test`` holds the matching labels (1 in the
            half-open range ``[start, stop)``, 0 elsewhere).

        Raises
        ------
        ValueError
            If the file name does not end in three ``_``-separated integers.
        """
        # Work on a string copy so that path-like objects are accepted too.
        file_name = str(self.path)

        # To ensure the file extension gets ignored. Note that
        # ``str.rstrip(".txt")`` strips any trailing '.', 't' and 'x'
        # characters rather than the literal suffix, so the suffix is
        # removed explicitly instead.
        extension = ".txt"
        if file_name.endswith(extension):
            file_name = file_name[: -len(extension)]

        # Extract the meta-information from the name of the file
        try:
            *_, train_test_split, start_anomaly, end_anomaly = file_name.split("_")
            train_test_split = int(train_test_split)
            start_anomaly = int(start_anomaly)
            end_anomaly = int(end_anomaly)
        except ValueError as error:
            raise ValueError(
                f"Invalid UCR file name '{self.path}': expected the format "
                "'*_<train-test-split>_<start>_<stop>.txt'."
            ) from error

        # Load time series
        X = np.loadtxt(self.path)
        X_train = X[:train_test_split]
        X_test = X[train_test_split:]

        # Build the labels over the full series, then keep only the test part
        y = np.zeros(shape=X.shape[0], dtype=int)
        y[start_anomaly:end_anomaly] = 1
        y_test = y[train_test_split:]

        # Return a DataSet object
        return DataSet(X_test=X_test, y_test=y_test, X_train=X_train)