Source code for dtaianomaly.data._CustomDataLoader

from pathlib import Path

import numpy as np
import pandas as pd

from dtaianomaly.data._DataSet import DataSet
from dtaianomaly.data._LazyDataLoader import LazyDataLoader
from dtaianomaly.type_validation import NoneAttribute, PathAttribute

__all__ = ["CustomDataLoader"]



[docs]
class CustomDataLoader(LazyDataLoader):
    """
    A data loader for loading custom data.

    The training and testing data is located in different files. Both must be
    readable through ``pandas.read_csv(path)``. The test data must contain a
    column with name 'label', in which the anomalies are marked (1 for anomaly,
    0 for normal). The test data may have an optional column 'time', which will
    be interpreted as the time step of each observation. All other columns are
    assumed to be part of the time series data. The 'label' column is optional
    for the training set. If note present, the training data is assumed to be
    completely normal. The training data may have an optional column 'time',
    similarly as for the test data. All remaining columns are time series data.
    The titles of the training and test set must match exactly, although the
    order may be different.

    Parameters
    ----------
    test_path : str
        The path at which the test data is located.
    train_path : str, default=None
        The path at which the train data is located. If None, then there
        will be no training data in the loaded dataset.
    do_caching : bool, default=False
        Whether to cache the loaded data or not.

    Examples
    --------
    >>> from dtaianomaly.data import CustomDataLoader
    >>> train_path = "path-to-training-data.csv"
    >>> test_path = "path-to-testing-data.csv"
    >>> data_set_train_and_test = CustomDataLoader(test_path, train_path).load()  # doctest: +SKIP
    >>> data_set_only_test = CustomDataLoader(test_path).load()  # No training data  # doctest: +SKIP
    """

    train_path: str | None
    test_path: str

    attribute_validation = {
        "train_path": PathAttribute() | NoneAttribute(),
        "test_path": PathAttribute(),
    }

    def __init__(
        self,
        test_path: str | Path,
        train_path: str | Path = None,
        do_caching: bool = False,
    ):
        super().__init__(do_caching)
        self.train_path = train_path
        self.test_path = test_path

    def _load(self) -> DataSet:

        # Load test data
        time_steps_test = None
        df_test = pd.read_csv(self.test_path)
        if "time" in df_test.columns:
            time_steps_test = df_test.pop("time").values
        y_test = df_test.pop("label").values
        X_test = df_test.values
        features_test = list(df_test.columns)

        # Load train data
        time_steps_train = None
        y_train = None
        X_train = None
        if self.train_path is not None:
            df_train = pd.read_csv(self.train_path)
            if "time" in df_train.columns:
                time_steps_train = df_train.pop("time").values
            if "label" in df_train.columns:
                y_train = df_train.pop("label").values
                if np.all(y_train == 0):
                    y_train = None

            # Check the features
            features_train = list(df_train.columns)
            if set(features_test) != set(features_train):
                raise ValueError(
                    "The train and test time series consist of different features!"
                    f"Train data has features: {features_train}"
                    f"Test data has features: {features_test}"
                )

            # Make sure the features follow the same order
            X_train = df_train[features_test].values

        return DataSet(
            X_test=X_test,
            y_test=y_test,
            X_train=X_train,
            y_train=y_train,
            feature_names=features_test,
            time_steps_test=time_steps_test,
            time_steps_train=time_steps_train,
        )