# Source code for dtaianomaly.data.data

import abc
import os
import numpy as np
from pathlib import Path
from typing import NamedTuple, List, Type, Union

from dtaianomaly.PrettyPrintable import PrettyPrintable


class DataSet(NamedTuple):
    """
    Container pairing a time series with its ground truth anomaly labels.

    Being a named tuple, an instance can be unpacked as ``x, y = data_set``
    or accessed by field name.

    Parameters
    ----------
    x: array-like of shape (n_samples, n_features)
        The raw time series data.
    y: array-like of shape (n_samples)
        The ground truth anomaly label of each sample in ``x``.
    """
    x: np.ndarray
    y: np.ndarray
class LazyDataLoader(PrettyPrintable):
    """
    Deferred loader for a single anomaly detection data set.

    An instance only remembers *where* a data set lives (``path``); the data
    itself is read when :py:meth:`load` is called. A workflow can therefore
    reference many data sets without reading any of them up front.

    Parameters
    ----------
    path: str
        Location of the data set (an existing file or directory).
    do_caching: bool, default=False
        If ``True``, keep the data set in memory after the first load and
        reuse it on later calls to :py:meth:`load`.

    Attributes
    ----------
    cache_ : DataSet
        The data set as returned by the first load. Only set when
        ``do_caching==True`` and :py:meth:`load` has been called at least
        once.

    Raises
    ------
    FileNotFoundError
        If ``path`` is neither an existing file nor an existing directory.
    """
    path: str
    do_caching: bool
    cache_: DataSet

    def __init__(self, path: Union[str, Path], do_caching: bool = False):
        location = Path(path)
        if not location.is_file() and not location.is_dir():
            raise FileNotFoundError(f'No such file or directory: {path}')

        # Store the path exactly as given (stringified), not a normalized form.
        self.path = str(path)
        self.do_caching = do_caching

    def load(self) -> DataSet:
        """
        Load the data set.

        Without caching, the data is read anew on every call. With
        ``do_caching==True``, the data is read on the first call only,
        stored in ``cache_``, and that stored object is returned from then on.

        Returns
        -------
        data_set: DataSet
            The loaded data set.
        """
        if not self.do_caching:
            return self._load()

        # ``cache_`` is only annotated on the class, never assigned there, so
        # the attribute does not exist until the first cached load.
        if not hasattr(self, 'cache_'):
            self.cache_ = self._load()
        return self.cache_

    @abc.abstractmethod
    def _load(self) -> DataSet:
        """ Read the data set at ``path``; to be implemented by subclasses. """
def from_directory(directory: Union[str, Path], dataloader: Type[LazyDataLoader], **kwargs) -> List[LazyDataLoader]:
    """
    Construct a `LazyDataLoader` instance for every file or sub-directory
    in the given `directory`.

    Parameters
    ----------
    directory: str or Path
        Path to the directory in question
    dataloader: LazyDataLoader **object**
        Class object of the data loader, called for constructing
        each data loader instance
    **kwargs:
        Additional keyword arguments forwarded unchanged to the
        `dataloader` constructor (for example ``do_caching=True``).

    Returns
    -------
    data_loaders: List[LazyDataLoader]
        A list of the initialized data loaders, one for each data set
        in the given directory, ordered by entry name.

    Raises
    ------
    FileNotFoundError
        If `directory` cannot be found
    """
    if not Path(directory).is_dir():
        raise FileNotFoundError(f'No such directory: {directory}')

    # ``os.listdir`` yields entries in arbitrary order; sorting makes the
    # resulting list of loaders reproducible across runs and platforms.
    candidates = [os.path.join(directory, name) for name in sorted(os.listdir(directory))]

    # Keep only entries that resolve to a regular file or a directory.
    return [
        dataloader(candidate, **kwargs)
        for candidate in candidates
        if os.path.isfile(candidate) or os.path.isdir(candidate)
    ]