# Source code for vbi.feature_extraction.calc_features

"""High-level batch feature computation for VBI.

Provides a unified interface for computing feature vectors from simulated
or empirical time series, driven by a JSON configuration that specifies
which feature functions to evaluate and with which parameters.
"""
import os
import vbi
import sys
import tqdm
import importlib
import numpy as np
import pandas as pd
from multiprocessing import Pool
import vbi.feature_extraction.features
from vbi.feature_extraction.features_settings import Data_F



[docs]
def calc_features(
    ts: np.ndarray,
    fs: float,
    cfg: dict,
    preprocess=None,
    preprocess_args: dict = None,
    verbose: bool = False,
):
    """
    Compute every enabled feature of ``cfg`` for a single time series.

    ``cfg`` maps a domain name to a dictionary of feature entries. Each
    feature entry is a dictionary with the keys ``"use"`` (the feature is
    computed only when this equals ``"yes"``), ``"function"`` (a string that
    is evaluated to obtain the callable) and ``"parameters"`` (keyword
    arguments for the callable, or ``None``). An optional top-level
    ``"features_path"`` entry names a Python file whose public names are made
    available to the ``"function"`` strings.

    Parameters
    ----------
    ts : np.ndarray
        Time series data, passed as first positional argument to every
        feature function.
    fs : int, float
        Sampling frequency. It replaces the value of ``"fs"`` in a feature's
        parameters whenever that key is present.
    cfg : dict
        Dictionary of features configurations (see above). It is not
        modified.
    preprocess : function, optional
        Function for preprocessing the time series. It is applied once,
        before any feature is computed.
    preprocess_args : dictionary, optional
        Keyword arguments for the preprocessing function.
    verbose : bool
        Replaces the value of ``"verbose"`` in a feature's parameters
        whenever that key is present.

    Returns
    -------
    features : list
        Flat list of feature values, in configuration order.
    labels : list
        One label per entry of ``features``. Features returning an array or
        list contribute their own labels, the others are labelled with the
        feature's configuration name.
    info : dict
        ``{feature_name: {"index": [start, stop]}}`` giving the slice of
        ``features`` produced by each feature.
    """
    # Names a "function" string may refer to: everything visible at module
    # level plus, when requested, the contents of a user-supplied module.
    # An explicit namespace is used instead of exec()-ing imports into the
    # function's locals, which is not reliable across Python versions.
    namespace = dict(globals())

    features_path = cfg.get("features_path")
    if features_path:
        module_dir, module_file = os.path.split(features_path)
        module_name = os.path.splitext(module_file)[0]
        if module_dir not in sys.path:
            # Register the directory only once so repeated calls do not
            # keep growing sys.path.
            sys.path.append(module_dir)
        importlib.invalidate_caches()
        # Reload so that edits made to the file since a previous call are
        # picked up.
        module = importlib.reload(importlib.import_module(module_name))
        namespace[module_name] = module
        # Same selection rule as ``from module import *``.
        public_names = getattr(module, "__all__", None)
        if public_names is None:
            public_names = [n for n in vars(module) if not n.startswith("_")]
        for public_name in public_names:
            namespace[public_name] = getattr(module, public_name)

    # Preprocess exactly once; doing it per feature would re-apply the
    # transformation to an already transformed series.
    if preprocess is not None:
        ts = preprocess(ts, **(preprocess_args or {}))

    labels = []
    features = []
    info = {}

    for domain, domain_feats in cfg.items():
        if domain == "features_path":
            continue
        for func_name, spec in domain_feats.items():
            if spec["use"] != "yes":
                continue

            # Work on a copy so the caller's configuration is left untouched.
            params = dict(spec.get("parameters") or {})
            if "verbose" in params:
                params["verbose"] = verbose
            if "fs" in params:
                params["fs"] = fs

            # NOTE(security): the "function" string is evaluated as Python
            # code; only use configurations from trusted sources.
            func = eval(spec["function"], namespace)

            start = len(features)
            val, lab = func(ts, **params)
            if isinstance(val, (np.ndarray, list)):
                labels.extend(lab)
                features.extend(val)
            else:
                labels.append(func_name)
                features.append(val)
            info[func_name] = {"index": [start, len(features)]}

    return features, labels, info




[docs]
def extract_features(
    ts: np.ndarray, fs: float, cfg: dict, output_type=Data_F, **kwargs
):
    """
    Extract features from a collection of time series, one trial at a time.

    ``calc_features`` is called on ``ts[i]`` for ``i`` in ``range(n_trial)``
    and the per-trial feature vectors are gathered in trial order.

    Parameters
    ----------
    ts : list of np.ndarray [[n_regions x n_samples]]
        Input from which the features are extracted; indexed along its
        first dimension, one entry per trial.
    fs : int, float
        Sampling frequency
    cfg : dictionary
        Dictionary of features to extract
    output_type : string or other
        Selects what is returned:
        'dataframe' (pandas dataframe),
        'list' (tuple of the feature list and the labels),
        anything else, including the default, gives a ``Data_F`` object.

    **kwargs
    - n_workers : int
        Number of workers for parallelization, default is 1.
        With 1 the trials are processed in the calling process; any other
        value processes them in a ``multiprocessing.Pool`` of that size.
        Parallelization is done by ensembles (first dimension of ts)
    - dtype : type
        Data type of the features extracted, default is np.float32
    - verbose : boolean
        If True (default), show a progress bar
    - preprocess : function
        Function for preprocessing the time series
    - preprocess_args : dictionary
        Arguments for preprocessing function
    - n_trial: int
        Number of trials, default is ``len(ts)``

    Returns
    -------
    Depends on ``output_type``:

    'dataframe'
        pandas dataframe built from the per-trial feature arrays, with the
        labels as column names when labels are available.
    'list'
        ``(features, labels)`` where ``features`` is a list of numpy arrays.
    otherwise
        ``Data_F(values=features, labels=labels, info=info)``.

    ``labels`` and ``info`` are taken from the first trial whose feature
    values contain no NaN; both are ``None`` when there is no such trial.
    """

    n_workers = kwargs.get("n_workers", 1)
    dtype = kwargs.get("dtype", np.float32)
    verbose = kwargs.get("verbose", True)
    preprocess = kwargs.get("preprocess", None)
    preprocess_args = kwargs.get("preprocess_args", {})
    n_trial = kwargs.get("n_trial", len(ts))

    features = []
    labels = None
    info = None

    def collect(result):
        # Store one trial's features; keep labels/info from the first trial
        # that is free of NaN values.
        nonlocal labels, info
        values, trial_labels, trial_info = result
        features.append(np.array(values).astype(dtype))
        if (labels is None) and (not np.isnan(values).any()):
            labels = trial_labels
            info = trial_info

    if n_workers == 1:
        for i in tqdm.tqdm(range(n_trial), disable=not verbose):
            collect(calc_features(ts[i], fs, cfg, preprocess, preprocess_args))
    else:
        with Pool(processes=n_workers) as pool:
            with tqdm.tqdm(total=n_trial, disable=not verbose) as pbar:
                pending = [
                    pool.apply_async(
                        calc_features,
                        args=(ts[i], fs, cfg, preprocess, preprocess_args),
                        # The callback receives the worker's result, which
                        # is not needed to advance the progress bar.
                        callback=lambda _result: pbar.update(),
                    )
                    for i in range(n_trial)
                ]
                # Labels and info come from the pool results themselves, so
                # no trial has to be computed a second time in this process.
                for res in pending:
                    collect(res.get())

    if output_type == "dataframe":
        df = pd.DataFrame(features)
        if labels is not None:
            df.columns = labels
        return df
    elif output_type == "list":
        return features, labels

    data = Data_F(values=features, labels=labels, info=info)

    return data




[docs]
def extract_features_df(ts: np.ndarray, fs: float, cfg: dict, **kwargs):
    """
    Convenience wrapper: run ``extract_features`` with the 'dataframe' output.

    Parameters
    ----------
    ts : list of np.ndarray [[n_regions x n_samples]]
        Input from which the features are extracted
    fs : int, float
        Sampling frequency
    cfg : dictionary
        Dictionary of features to extract

    **kwargs
        Forwarded unchanged to ``extract_features``:

    - n_workers : int
        Number of workers for parallelization, default is 1
        Parallelization is done by ensembles (first dimension of ts)
    - dtype : type
        Data type of the features extracted, default is np.float32
    - verbose : boolean
        If True, show progress information
    - preprocess : function
        Function for preprocessing the time series
    - preprocess_args : dictionary
        Arguments for preprocessing function

    Returns
    -------
    The value ``extract_features`` returns for the output type 'dataframe':
    a pandas dataframe of the extracted features, whose columns are the
    feature labels when those are available.
    """
    frame = extract_features(ts, fs, cfg, output_type="dataframe", **kwargs)
    return frame




[docs]
def extract_features_list(ts, fs, cfg, **kwargs):
    """
    Convenience wrapper: run ``extract_features`` with the 'list' output.

    Parameters
    ----------
    ts : list of np.ndarray [[n_regions x n_samples]]
        Input from which the features are extracted
    fs : int, float
        Sampling frequency
    cfg : dictionary
        Dictionary of features to extract

    **kwargs
        Forwarded unchanged to ``extract_features``:

    - n_workers : int
        Number of workers for parallelization, default is 1
        Parallelization is done by ensembles (first dimension of ts)
    - dtype : type
        Data type of the features extracted, default is np.float32
    - verbose : boolean
        If True, show progress information
    - preprocess : function
        Function for preprocessing the time series
    - preprocess_args : dictionary
        Arguments for preprocessing function

    Returns
    -------
    The value ``extract_features`` returns for the output type 'list':
    the pair ``(features, labels)``, where ``features`` is a list of numpy
    arrays and ``labels`` is the list of feature labels.
    """
    result = extract_features(ts, fs, cfg, output_type="list", **kwargs)
    return result