Source code for vbi.feature_extraction.calc_features

import os
import vbi
import sys
import tqdm
import importlib
import numpy as np
import pandas as pd
from multiprocessing import Pool
import vbi.feature_extraction.features
from vbi.feature_extraction.features_settings import Data_F


def _load_custom_feature_namespace(features_path: str) -> dict:
    """Import (or reload) the module at *features_path* and return the names it exposes."""
    directory, filename = os.path.split(features_path)
    module_name = os.path.splitext(filename)[0]

    # Register the containing directory only once; this function runs for
    # every trial, so an unconditional append would grow sys.path forever.
    if directory not in sys.path:
        sys.path.append(directory)

    if module_name in sys.modules:
        # Reload so that edits to the user's feature file are picked up.
        module = importlib.reload(sys.modules[module_name])
    else:
        importlib.invalidate_caches()
        module = importlib.import_module(module_name)

    # Mirror ``import <module>`` followed by ``from <module> import *``.
    exported = getattr(module, "__all__", None)
    if exported is None:
        exported = [name for name in vars(module) if not name.startswith("_")]
    namespace = {module_name: module}
    namespace.update({name: getattr(module, name) for name in exported})
    return namespace


def calc_features(
    ts: np.ndarray,
    fs: float,
    cfg: dict,
    preprocess=None,
    preprocess_args: dict = None,
    verbose: bool = False,
):
    """
    Extract features from time series data.

    Parameters
    ----------
    ts : np.ndarray
        Time series data
    fs : int, float
        Sampling frequency
    cfg : dict
        Dictionary of features configurations, laid out as
        ``cfg[domain][feature] = {"use": ..., "function": ..., "parameters": ...}``.
        Only features whose ``"use"`` entry equals ``"yes"`` are computed.
        An optional top-level ``"features_path"`` entry points to a Python
        file with user-defined feature functions. ``cfg`` is not modified.
    preprocess : function
        Function for preprocessing the time series; it is applied once,
        before any feature is computed.
    preprocess_args : dictionary
        Arguments for preprocessing function
    verbose : bool
        Value passed to every feature whose parameters contain a
        ``"verbose"`` key.

    Returns
    -------
    features : list
        Flat list of the feature values.
    labels : list of str
        Label of every entry in ``features``.
    info : dict
        ``info[feature_name]["index"] = [start, stop]`` gives the slice of
        ``features`` produced by that feature.

    Notes
    -----
    Each ``"function"`` string is evaluated as a Python expression, so the
    configuration must come from a trusted source. The expression sees this
    module's globals plus the names exported by the ``features_path`` module.
    """
    if preprocess_args is None:
        preprocess_args = {}

    # Resolve feature functions in an explicit namespace instead of relying
    # on exec() writing into the function's implicit locals, which is not
    # guaranteed to be visible to a later eval().
    namespace = dict(globals())
    features_path = cfg.get("features_path")
    if features_path:
        namespace.update(_load_custom_feature_namespace(features_path))

    # Preprocess exactly once; doing it inside the feature loop would apply
    # the transformation again for every enabled feature.
    data = ts if preprocess is None else preprocess(ts, **preprocess_args)

    labels = []
    features = []
    info = {}

    for domain, domain_feats in cfg.items():
        if domain == "features_path":
            continue
        for func_name, spec in domain_feats.items():
            if spec["use"] != "yes":
                continue

            start = len(features)
            # Work on a copy so the caller's configuration is left untouched;
            # a missing/None/empty parameters entry means "no parameters".
            params = dict(spec.get("parameters") or {})
            if "verbose" in params:
                params["verbose"] = verbose
            if "fs" in params:
                params["fs"] = fs

            # SECURITY: eval() executes arbitrary code taken from cfg.
            val, lab = eval(spec["function"], namespace)(data, **params)

            if isinstance(val, (np.ndarray, list)):
                labels.extend(lab)
                features.extend(val)
            else:
                labels.append(func_name)
                features.append(val)
            info[func_name] = {"index": [start, len(features)]}

    return features, labels, info
def extract_features(
    ts: np.ndarray, fs: float, cfg: dict, output_type=Data_F, **kwargs
):
    """
    Extract features from time series data

    Parameters
    ----------
    ts : list of np.ndarray [[n_regions x n_samples]]
        Input from which the features are extracted; ``ts[i]`` is handed to
        ``calc_features`` for trial ``i``.
    fs : int, float
        Sampling frequency
    cfg : dictionary
        Dictionary of features to extract
    output_type : str or object
        ``"dataframe"`` returns a pandas DataFrame, ``"list"`` returns the
        tuple ``(features, labels)``; any other value (including the
        default) returns a ``Data_F`` object.
    **kwargs
        - n_workers : int
            Number of workers for parallelization, default is 1
            Parallelization is done by ensembles (first dimension of ts)
        - dtype : type
            Data type of the features extracted, default is np.float32
        - verbose : boolean
            If True (default), show a progress bar
        - preprocess : function
            Function for preprocessing the time series
        - preprocess_args : dictionary
            Arguments for preprocessing function
        - n_trial: int
            Number of trials, default is ``len(ts)``

    Returns
    -------
    pandas.DataFrame, tuple or Data_F
        Depends on ``output_type`` (see above). The ``Data_F`` object is
        built with

        values: list of numpy arrays
            extracted features, one array per trial
        labels: list of strings
            List of labels of the features
        info: dictionary
            Dictionary with the information of the features extracted

        ``labels`` and ``info`` are taken from the first trial whose feature
        values contain no NaN; both are ``None`` when no such trial exists.
    """
    n_workers = kwargs.get("n_workers", 1)
    dtype = kwargs.get("dtype", np.float32)
    verbose = kwargs.get("verbose", True)
    preprocess = kwargs.get("preprocess", None)
    preprocess_args = kwargs.get("preprocess_args", {})
    n_trial = kwargs.get("n_trial", len(ts))

    features = []
    labels = None
    info = None

    def collect(result):
        # Store one trial's features; remember labels/info of the first
        # NaN-free trial.
        nonlocal labels, info
        values, trial_labels, trial_info = result
        features.append(np.array(values).astype(dtype))
        if (labels is None) and (not np.isnan(values).any()):
            labels = trial_labels
            info = trial_info

    if n_workers == 1:
        for i in tqdm.tqdm(range(n_trial), disable=not verbose):
            collect(calc_features(ts[i], fs, cfg, preprocess, preprocess_args))
    else:
        with Pool(processes=n_workers) as pool:
            with tqdm.tqdm(total=n_trial, disable=not verbose) as pbar:
                pending = [
                    pool.apply_async(
                        calc_features,
                        args=(ts[i], fs, cfg, preprocess, preprocess_args),
                        # The callback receives the task result, which is
                        # not needed here; only advance the progress bar.
                        callback=lambda _result: pbar.update(),
                    )
                    for i in range(n_trial)
                ]
                # Labels/info come from the pool results themselves, so no
                # trial has to be recomputed in the parent process.
                for job in pending:
                    collect(job.get())

    if output_type == "dataframe":
        df = pd.DataFrame(features)
        if labels is not None:
            df.columns = labels
        return df
    elif output_type == "list":
        return features, labels

    return Data_F(values=features, labels=labels, info=info)
def extract_features_df(ts: np.ndarray, fs: float, cfg: dict, **kwargs):
    """
    Extract features from time series data and return a pandas dataframe

    Convenience wrapper that calls ``extract_features`` with the
    ``"dataframe"`` output type.

    Parameters
    ----------
    ts : list of np.ndarray [[n_regions x n_samples]]
        Input from which the features are extracted
    fs : int, float
        Sampling frequency
    cfg : dictionary
        Dictionary of features to extract
    **kwargs
        Forwarded unchanged to ``extract_features``:

        - n_workers : int
            Number of workers for parallelization, default is 1
            Parallelization is done by ensembles (first dimension of ts)
        - dtype : type
            Data type of the features extracted, default is np.float32
        - verbose : boolean
            If True, show a progress bar
        - preprocess : function
            Function for preprocessing the time series
        - preprocess_args : dictionary
            Arguments for preprocessing function

    Returns
    -------
    pandas.DataFrame
        Built from the per-trial feature arrays; the columns are set to the
        feature labels when labels are available.
    """
    frame = extract_features(ts, fs, cfg, output_type="dataframe", **kwargs)
    return frame
def extract_features_list(ts, fs, cfg, **kwargs):
    """
    Extract features from time series data and return a list of features
    and labels

    Convenience wrapper that calls ``extract_features`` with the ``"list"``
    output type.

    Parameters
    ----------
    ts : list of np.ndarray [[n_regions x n_samples]]
        Input from which the features are extracted
    fs : int, float
        Sampling frequency
    cfg : dictionary
        Dictionary of features to extract
    **kwargs
        Forwarded unchanged to ``extract_features``:

        - n_workers : int
            Number of workers for parallelization, default is 1
            Parallelization is done by ensembles (first dimension of ts)
        - dtype : type
            Data type of the features extracted, default is np.float32
        - verbose : boolean
            If True, show a progress bar
        - preprocess : function
            Function for preprocessing the time series
        - preprocess_args : dictionary
            Arguments for preprocessing function

    Returns
    -------
    features : list of numpy arrays
        Extracted features, one array per trial
    labels : list of strings
        List of labels of the features
    """
    result = extract_features(ts, fs, cfg, output_type="list", **kwargs)
    return result