# Source code for vbi.feature_extraction.calc_features
"""High-level batch feature computation for VBI.
Provides a unified interface for computing feature vectors from simulated
or empirical time series, driven by a JSON configuration that specifies
which feature functions to evaluate and with which parameters.
"""
import os
import vbi
import sys
import tqdm
import importlib
import numpy as np
import pandas as pd
from multiprocessing import Pool
import vbi.feature_extraction.features
from vbi.feature_extraction.features_settings import Data_F
[docs]
def _load_feature_module(features_path):
    """Import (or reload) a user feature module and return the names it exposes.

    The returned mapping contains the module itself under its own name plus
    every name a ``from <module> import *`` would bring in, so configured
    function strings may be either ``"my_feature"`` or ``"module.my_feature"``.
    """
    directory, filename = os.path.split(os.path.abspath(features_path))
    module_name = os.path.splitext(filename)[0]

    # Register the directory only once so repeated calls do not grow sys.path.
    if directory not in sys.path:
        sys.path.append(directory)

    # Make sure a file created after the import system cached the directory
    # listing is still found.
    importlib.invalidate_caches()
    if module_name in sys.modules:
        # Pick up edits made to the file since it was last imported.
        module = importlib.reload(sys.modules[module_name])
    else:
        module = importlib.import_module(module_name)

    exported = getattr(module, "__all__", None)
    if exported is None:
        exported = [name for name in vars(module) if not name.startswith("_")]

    names = {module_name: module}
    names.update({name: getattr(module, name) for name in exported})
    return names


def calc_features(
    ts: np.ndarray,
    fs: float,
    cfg: dict,
    preprocess=None,
    preprocess_args: dict = None,
    verbose: bool = False,
):
    """
    Extract features from a single time series.

    Parameters
    ----------
    ts : np.ndarray
        Time series data.
    fs : int, float
        Sampling frequency. Substituted for the ``"fs"`` entry of every
        feature's parameters that declares one.
    cfg : dict
        Feature configuration. Every key except the optional
        ``"features_path"`` is a domain mapping feature names to a dict with
        the keys ``"use"`` (computed only when equal to ``"yes"``),
        ``"function"`` (expression naming the callable) and ``"parameters"``
        (keyword arguments, may be ``None``). ``"features_path"``, when set,
        is the path of a Python file providing additional feature functions.
        ``cfg`` is not modified.
    preprocess : callable, optional
        Called once as ``preprocess(ts, **preprocess_args)`` before any
        feature is computed.
    preprocess_args : dict, optional
        Keyword arguments for ``preprocess``.
    verbose : bool
        Substituted for the ``"verbose"`` entry of every feature's parameters
        that declares one.

    Returns
    -------
    features : list
        Flat list of the computed feature values.
    labels : list of str
        One label per entry of ``features``.
    info : dict
        Maps each computed feature name to ``{"index": [start, stop]}``, the
        slice of ``features`` it occupies.
    """
    if preprocess_args is None:
        preprocess_args = {}

    # Names the configured function strings are resolved against: everything
    # visible in this module, plus the user's feature module when given.
    namespace = dict(globals())
    features_path = cfg.get("features_path")
    if features_path:
        namespace.update(_load_feature_module(features_path))

    # Preprocess exactly once; doing it per feature would re-apply the
    # transformation to already preprocessed data.
    if preprocess is not None:
        ts = preprocess(ts, **preprocess_args)

    labels = []
    features = []
    info = {}

    for domain, domain_feats in cfg.items():
        if domain == "features_path":
            continue
        for func_name, spec in domain_feats.items():
            if spec["use"] != "yes":
                continue

            start = len(features)

            # Work on a copy so the caller's configuration stays untouched,
            # and tolerate a missing / null parameter block.
            params = dict(spec.get("parameters") or {})
            if "verbose" in params:
                params["verbose"] = verbose
            if "fs" in params:
                params["fs"] = fs

            # SECURITY: the function string is evaluated as Python code.
            # Only use configuration files from trusted sources.
            func = eval(spec["function"], namespace)
            val, lab = func(ts, **params)

            if isinstance(val, (np.ndarray, list)):
                labels.extend(lab)
                features.extend(val)
            else:
                # Scalar result: label it with the feature's own name.
                labels.append(func_name)
                features.append(val)

            info[func_name] = {"index": [start, len(features)]}

    return features, labels, info
[docs]
def _collect_features(results, dtype):
    """Gather per-trial ``calc_features`` results into arrays plus labels/info.

    Labels and info are taken from the first trial whose values contain no NaN.
    """
    features = []
    labels = None
    info = None
    for values, trial_labels, trial_info in results:
        features.append(np.array(values).astype(dtype))
        if labels is None and not np.isnan(values).any():
            labels = trial_labels
            info = trial_info
    return features, labels, info


def extract_features(
    ts: np.ndarray, fs: float, cfg: dict, output_type=Data_F, **kwargs
):
    """
    Extract features from a collection of time series.

    Parameters
    ----------
    ts : list of np.ndarray [[n_regions x n_samples]]
        Input from which the features are extracted; ``ts[i]`` is passed to
        ``calc_features`` for every trial ``i``.
    fs : int, float
        Sampling frequency
    cfg : dictionary
        Dictionary of features to extract
    output_type : str or other
        Selects the return format:
        ``'dataframe'`` returns a pandas dataframe,
        ``'list'`` returns the tuple ``(features, labels)``,
        any other value (the default) returns a ``Data_F`` object.
    **kwargs
        - n_workers : int
            Number of workers for parallelization, default is 1
            Parallelization is done by ensembles (first dimension of ts)
        - dtype : type
            Data type of the features extracted, default is np.float32
        - verbose : boolean
            If True (the default), show a progress bar
        - preprocess : function
            Function for preprocessing the time series
        - preprocess_args : dictionary
            Arguments for preprocessing function
        - n_trial: int
            Number of trials, default is ``len(ts)``

    Returns
    -------
    Depends on ``output_type``:
        'dataframe': pandas dataframe with one row per trial; its columns are
            set to the feature labels when labels are available
        'list': tuple ``(features, labels)``
        otherwise: ``Data_F`` built with
            values: list of numpy arrays, one per trial
            labels: list of strings, or None
                Labels of the features, taken from the first trial without NaN
            info: dictionary, or None
                Information of the features extracted, from that same trial
    """
    n_workers = kwargs.get("n_workers", 1)
    dtype = kwargs.get("dtype", np.float32)
    verbose = kwargs.get("verbose", True)
    preprocess = kwargs.get("preprocess", None)
    preprocess_args = kwargs.get("preprocess_args", {})
    n_trial = kwargs.get("n_trial", len(ts))

    if n_workers == 1:
        results = (
            calc_features(ts[i], fs, cfg, preprocess, preprocess_args)
            for i in tqdm.tqdm(range(n_trial), disable=not verbose)
        )
        features, labels, info = _collect_features(results, dtype)
    else:
        with Pool(processes=n_workers) as pool:
            with tqdm.tqdm(total=n_trial, disable=not verbose) as pbar:
                pending = [
                    pool.apply_async(
                        calc_features,
                        args=(ts[i], fs, cfg, preprocess, preprocess_args),
                        # The callback receives the worker's result, which is
                        # not needed here; it only advances the progress bar.
                        callback=lambda _result: pbar.update(),
                    )
                    for i in range(n_trial)
                ]
                # Labels/info come from the pooled results themselves, so no
                # trial has to be computed a second time in this process.
                features, labels, info = _collect_features(
                    (job.get() for job in pending), dtype
                )

    if output_type == "dataframe":
        df = pd.DataFrame(features)
        if labels is not None:
            df.columns = labels
        return df
    if output_type == "list":
        return features, labels

    return Data_F(values=features, labels=labels, info=info)
[docs]
def extract_features_df(ts: np.ndarray, fs: float, cfg: dict, **kwargs):
    """
    Extract features from time series data as a pandas dataframe.

    Thin wrapper that calls ``extract_features`` with the ``"dataframe"``
    output type.

    Parameters
    ----------
    ts : list of np.ndarray [[n_regions x n_samples]]
        Input from which the features are extracted
    fs : int, float
        Sampling frequency
    cfg : dictionary
        Dictionary of features to extract
    **kwargs
        Forwarded unchanged to ``extract_features``:
        - n_workers : int
            Number of workers for parallelization, default is 1
            Parallelization is done by ensembles (first dimension of ts)
        - dtype : type
            Data type of the features extracted, default is np.float32
        - verbose : boolean
            If True, show progress information
        - preprocess : function
            Function for preprocessing the time series
        - preprocess_args : dictionary
            Arguments for preprocessing function

    Returns
    -------
    pandas dataframe
        Whatever ``extract_features`` returns for the ``"dataframe"`` output
        type.
    """
    frame = extract_features(ts, fs, cfg, output_type="dataframe", **kwargs)
    return frame
[docs]
def extract_features_list(ts, fs, cfg, **kwargs):
    """
    Extract features from time series data as plain lists.

    Thin wrapper that calls ``extract_features`` with the ``"list"`` output
    type.

    Parameters
    ----------
    ts : list of np.ndarray [[n_regions x n_samples]]
        Input from which the features are extracted
    fs : int, float
        Sampling frequency
    cfg : dictionary
        Dictionary of features to extract
    **kwargs
        Forwarded unchanged to ``extract_features``:
        - n_workers : int
            Number of workers for parallelization, default is 1
            Parallelization is done by ensembles (first dimension of ts)
        - dtype : type
            Data type of the features extracted, default is np.float32
        - verbose : boolean
            If True, show progress information
        - preprocess : function
            Function for preprocessing the time series
        - preprocess_args : dictionary
            Arguments for preprocessing function

    Returns
    -------
    Whatever ``extract_features`` returns for the ``"list"`` output type
    (there, the tuple ``(features, labels)``).
    """
    result = extract_features(ts, fs, cfg, output_type="list", **kwargs)
    return result