Preprocessing

This module provides the Data class.

Classes

Data: public class for handling data used for the causal discovery.

`Data`

Data class manages the preprocessing of the data before the causal analysis.

Source code in causalflow/preprocessing/data.py

class Data():
    """
    Data class manages the preprocessing of the data before the causal analysis.

    The data is stored in the attribute ``d`` (pandas.DataFrame). The attributes ``orig_features``,
    ``orig_pretty_features``, ``orig_N`` and ``orig_T`` keep the feature names, the number of features
    and the number of samples as they were right after loading, i.e. before NaN filling,
    subsampling and standardization.
    """

    def __init__(self, data, vars = None, fill_nan = True, stand = False, subsampling : SubsamplingMethod = None, show_subsampling = False):
        """
        Class constructor.

        Args:
            data (str/DataFrame/np.array): it can be a string specifying the path of a csv file to load/pandas.DataFrame/numpy.array.
                A DataFrame is copied, so the caller's object is never modified.
            vars (list(str), optional): List containing variable names. If unset then, 
                if data = (str/DataFrame) vars = data columns name elif data = np.array vars = [X_0 .. X_N]
                Defaults to None.
            fill_nan (bool, optional): Fill NaNs bit (forward fill followed by backward fill). Defaults to True.
            stand (bool, optional): Standardization bit. Defaults to False.
            subsampling (SubsamplingMethod, optional): Subsampling method. If None not active. Defaults to None.
            show_subsampling (bool, optional): If True shows subsampling result. Defaults to False.

        Raises:
            TypeError: if data is not str - DataFrame - ndarray.
        """
        # Data handling
        if isinstance(data, np.ndarray):
            self.d = pd.DataFrame(data)
            if vars is None:
                self.d.columns = ['X_' + str(f) for f in range(len(self.d.columns))]
        elif isinstance(data, pd.DataFrame):
            # Work on a copy: renaming the columns and filling the NaNs below
            # must not alter the dataframe owned by the caller
            self.d = data.copy()
        elif isinstance(data, str):
            self.d = pd.read_csv(data)
        else:
            raise TypeError("data field not in the correct type\ndata must be one of the following type:\n- numpy.ndarray\n- pandas.DataFrame\n- .csv path")

        # Columns name handling
        if vars is not None:
            self.d.columns = list(vars)

        # Snapshot of the data description taken before any further preprocessing
        self.orig_features = self.features
        self.orig_pretty_features = self.pretty_features
        self.orig_N = self.N
        self.orig_T = self.T

        # Filling NaNs: forward fill first, then backward fill for the leading NaNs.
        # ffill()/bfill() replace fillna(method=...), which recent pandas versions deprecate
        if fill_nan and self.d.isnull().values.any():
            self.d = self.d.ffill().bfill()

        # Subsampling data
        if subsampling is not None:
            subsampler = Subsampler(self.d, ss_method = subsampling)
            self.d = pd.DataFrame(subsampler.subsample(), columns = self.features)
            if show_subsampling:
                subsampler.plot_subsampled_data()

        # Standardize data
        if stand:
            scaler = StandardScaler().fit(self.d)
            self.d = pd.DataFrame(scaler.transform(self.d), columns = self.features)

    @property
    def features(self):
        """
        Return list of features.

        Returns:
            list(str): list of feature names.
        """
        return list(self.d.columns)

    @property
    def pretty_features(self):
        """
        Return list of features with LATEX symbols.

        Returns:
            list(str): list of feature names, each one wrapped in '$'.
        """
        return [r'$' + str(v) + '$' for v in self.d.columns]

    @property
    def N(self):
        """
        Number of features.

        Returns:
            (int): number of features.
        """
        return len(self.d.columns)

    @property
    def T(self):
        """
        Dataframe length.

        Returns:
            (int): dataframe length.
        """
        return len(self.d)


    def shrink(self, selected_features):
        """
        Shrink dataframe d on the selected features.

        Args:
            selected_features (list(str)): list of variables.
        """
        self.d = self.d[selected_features]


    def plot_timeseries(self, savefig = None):
        """
        Plot timeseries data, one subplot per variable.

        Args:
            savefig (str, optional): figure path. If None the figure is shown instead of being saved. Defaults to None.
        """
        # Create grid
        gs = gridspec.GridSpec(self.N, 1)

        # Time vector
        T = list(range(self.T))

        values = self.d.values

        plt.figure()
        for i, label in enumerate(self.pretty_features):
            plt.subplot(gs[i, 0])
            plt.plot(T, values[:, i], color = 'tab:red')
            plt.ylabel(label)

        if savefig is not None:
            plt.savefig(savefig)
        else:
            plt.show()


    def save_csv(self, csvpath):
        """
        Save timeseries data into a CSV file (the dataframe index is not written).

        Args:
            csvpath (str): CSV path.
        """
        self.d.to_csv(csvpath, index=False)

`N` `property`

Number of features.

Returns:

Type	Description
`int`	number of features.

`T` `property`

Dataframe length.

Returns:

Type	Description
`int`	dataframe length.

`features` `property`

Return list of features.

Returns:

Type	Description
`list(str)`	list of feature names.

`pretty_features` `property`

Return list of features with LATEX symbols.

Returns:

Type	Description
`list(str)`	list of feature names.

`__init__(data, vars=None, fill_nan=True, stand=False, subsampling=None, show_subsampling=False)`

Class constructor.

Parameters:

Name	Type	Description	Default
`data`	`str / DataFrame / np.array`	it can be a string specifying the path of a csv file to load/pandas.DataFrame/numpy.array.	required
`vars`	`list(str)`	List containing variable names. If unset then, if data = (str/DataFrame) vars = data columns name elif data = np.array vars = [X_0 .. X_N] Defaults to None.	`None`
`fill_nan`	`bool`	Fill NaNs bit. Defaults to True.	`True`
`stand`	`bool`	Standardization bit. Defaults to False.	`False`
`subsampling`	`SubsamplingMethod`	Subsampling method. If None not active. Defaults to None.	`None`
`show_subsampling`	`bool`	If True shows subsampling result. Defaults to False.	`False`

Raises:

Type	Description
`TypeError`	if data is not str - DataFrame - ndarray.

Source code in causalflow/preprocessing/data.py

def __init__(self, data, vars = None, fill_nan = True, stand = False, subsampling : SubsamplingMethod = None, show_subsampling = False):
    """
    Class constructor.

    Args:
        data (str/DataFrame/np.array): it can be a string specifying the path of a csv file to load/pandas.DataFrame/numpy.array.
            A DataFrame is copied, so the caller's object is never modified.
        vars (list(str), optional): List containing variable names. If unset then, 
            if data = (str/DataFrame) vars = data columns name elif data = np.array vars = [X_0 .. X_N]
            Defaults to None.
        fill_nan (bool, optional): Fill NaNs bit (forward fill followed by backward fill). Defaults to True.
        stand (bool, optional): Standardization bit. Defaults to False.
        subsampling (SubsamplingMethod, optional): Subsampling method. If None not active. Defaults to None.
        show_subsampling (bool, optional): If True shows subsampling result. Defaults to False.

    Raises:
        TypeError: if data is not str - DataFrame - ndarray.
    """
    # Data handling
    if isinstance(data, np.ndarray):
        self.d = pd.DataFrame(data)
        if vars is None:
            self.d.columns = ['X_' + str(f) for f in range(len(self.d.columns))]
    elif isinstance(data, pd.DataFrame):
        # Work on a copy: renaming the columns and filling the NaNs below
        # must not alter the dataframe owned by the caller
        self.d = data.copy()
    elif isinstance(data, str):
        self.d = pd.read_csv(data)
    else:
        raise TypeError("data field not in the correct type\ndata must be one of the following type:\n- numpy.ndarray\n- pandas.DataFrame\n- .csv path")

    # Columns name handling
    if vars is not None:
        self.d.columns = list(vars)

    # Snapshot of the data description taken before any further preprocessing
    self.orig_features = self.features
    self.orig_pretty_features = self.pretty_features
    self.orig_N = self.N
    self.orig_T = len(self.d)

    # Filling NaNs: forward fill first, then backward fill for the leading NaNs.
    # ffill()/bfill() replace fillna(method=...), which recent pandas versions deprecate
    if fill_nan and self.d.isnull().values.any():
        self.d = self.d.ffill().bfill()

    # Subsampling data
    if subsampling is not None:
        subsampler = Subsampler(self.d, ss_method = subsampling)
        self.d = pd.DataFrame(subsampler.subsample(), columns = self.features)
        if show_subsampling:
            subsampler.plot_subsampled_data()

    # Standardize data
    if stand:
        scaler = StandardScaler().fit(self.d)
        self.d = pd.DataFrame(scaler.transform(self.d), columns = self.features)

`plot_timeseries(savefig=None)`

Plot timeseries data.

Parameters:

Name	Type	Description	Default
`savefig`	`str`	figure path.	`None`

Source code in causalflow/preprocessing/data.py

def plot_timeseries(self, savefig = None):
    """
    Draw every variable of the dataframe in its own subplot, stacked vertically.

    Args:
        savefig (str, optional): path where the figure is saved. If None the figure is shown instead. Defaults to None.
    """
    # One grid row per variable
    grid = gridspec.GridSpec(self.N, 1)

    # Common time axis: one tick per sample
    time_axis = list(range(self.T))

    plt.figure()
    n_vars = self.d.shape[1]
    for idx in range(n_vars):
        plt.subplot(grid[idx, 0])
        plt.plot(time_axis, self.d.values[:, idx], color = 'tab:red')
        plt.ylabel(str(self.pretty_features[idx]))

    # Either display the figure or store it on disk
    if savefig is None:
        plt.show()
    else:
        plt.savefig(savefig)

`save_csv(csvpath)`

Save timeseries data into a CSV file.

Parameters:

Name	Type	Description	Default
`csvpath`	`str`	CSV path.	required

Source code in causalflow/preprocessing/data.py

def save_csv(self, csvpath):
    """
    Write the timeseries dataframe to a CSV file, leaving out the index column.

    Args:
        csvpath (str): destination path of the CSV file.
    """
    frame = self.d
    frame.to_csv(csvpath, index = False)

`shrink(selected_features)`

Shrink dataframe d on the selected features.

Parameters:

Name	Type	Description	Default
`selected_features`	`list(str)`	list of variables.	required

Source code in causalflow/preprocessing/data.py

def shrink(self, selected_features):
    """
    Restrict the dataframe d to the given variables.

    Args:
        selected_features (list(str)): names of the variables to keep.
    """
    reduced = self.d[selected_features]
    self.d = reduced

This module provides the Subsampler class.

Classes

Subsampler: public class for subsampling.

`Subsampler`

Subsampler class.

It subsamples the data by using a subsampling method chosen among:

Static - subsamples data by taking one sample each step-samples
WSDynamic - entropy based method with dynamic window size computed by breakpoint analysis
WSFFTStatic - entropy based method with fixed window size computed by FFT analysis
WSStatic - entropy based method with predefined window size

Source code in causalflow/preprocessing/Subsampler.py

class Subsampler():
    """
    Subsampler class.

    It subsamples the data by using a subsampling method chosen among:
        - Static - subsamples data by taking one sample each step-samples
        - WSDynamic - entropy based method with dynamic window size computed by breakpoint analysis
        - WSFFTStatic - entropy based method with fixed window size computed by FFT analysis
        - WSStatic - entropy based method with predefined window size
    """

    def __init__(self, 
                 df: pd.DataFrame, 
                 ss_method: SubsamplingMethod):
        """
        Class constructor.

        Args:
            df (pd.DataFrame): dataframe to subsample.
            ss_method (SubsamplingMethod): subsampling method.
        """
        self.df = df
        self.ss_method = ss_method
        # Output of the subsampling method (used to index the dataframe rows);
        # stays None until subsample() has been run
        self.result = None
        self.ss_method.initialise(df)


    def subsample(self):
        """
        Run the subsampling algorithm and return the subsampled ndarray.

        The output of the subsampling method is stored in ``self.result``.

        Returns:
            (ndarray): Subsampled dataframe value.
        """
        self.result = self.ss_method.run()
        return self.df.values[self.result, :]


    def plot_subsampled_data(self, dpi = 100, show = True):
        """
        Plot dataframe sub-sampled data.

        Each column of the dataframe is drawn in its own subplot and the selected samples
        are highlighted with hollow blue markers. If the subsampling has not been run yet,
        it is run before plotting.

        Args:
            dpi (int, optional): image dpi. Defaults to 100.
            show (bool, optional): if True it shows the figure and blocks the process. Defaults to True.
        """
        # Nothing to highlight before subsample() has been run: run it on demand
        if self.result is None:
            self.subsample()

        n_plot = self.df.shape[1]

        # Create grid
        gs = gridspec.GridSpec(n_plot, 1)

        # Time vector
        T = list(range(0, self.df.shape[0]))
        selected_T = np.array(T)[self.result]

        values = self.df.values

        pl.figure(dpi = dpi)
        for i in range(0, n_plot):
            pl.subplot(gs[i, 0])
            pl.plot(T, values[:, i], color = 'tab:red')
            pl.scatter(selected_T,
                       values[self.result, i],
                       s = 80,
                       facecolors = 'none',
                       edgecolors = 'b')
            pl.gca().set(ylabel = r'$' + str(self.df.columns.values[i]) + '$')
        if show:
            pl.show()

`__init__(df, ss_method)`

Class constructor.

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	dataframe to subsample.	required
`ss_method`	`SubsamplingMethod`	subsampling method.	required

Source code in causalflow/preprocessing/Subsampler.py

def __init__(self, 
             df: pd.DataFrame, 
             ss_method: SubsamplingMethod):
    """
    Class constructor.

    Args:
        df (pd.DataFrame): dataframe to subsample.
        ss_method (SubsamplingMethod): subsampling method.
    """
    self.df = df
    self.ss_method = ss_method
    # Output of the subsampling method (used to index the dataframe rows);
    # stays None until subsample() has been run
    self.result = None
    self.ss_method.initialise(df)

`plot_subsampled_data(dpi=100, show=True)`

Plot dataframe sub-sampled data.

Parameters:

Name	Type	Description	Default
`dpi`	`int`	image dpi. Defaults to 100.	`100`
`show`	`bool`	if True it shows the figure and blocks the process. Defaults to True.	`True`

Source code in causalflow/preprocessing/Subsampler.py

def plot_subsampled_data(self, dpi = 100, show = True):
    """
    Plot dataframe sub-sampled data.

    Each column of the dataframe is drawn in its own subplot and the selected samples
    are highlighted with hollow blue markers. If the subsampling has not been run yet,
    it is run before plotting.

    Args:
        dpi (int, optional): image dpi. Defaults to 100.
        show (bool, optional): if True it shows the figure and blocks the process. Defaults to True.
    """
    # self.result is produced by subsample(): run it on demand instead of
    # failing with an AttributeError when plotting is requested first
    if getattr(self, 'result', None) is None:
        self.subsample()

    n_plot = self.df.shape[1]

    # Create grid
    gs = gridspec.GridSpec(n_plot, 1)

    # Time vector
    T = list(range(0, self.df.shape[0]))
    selected_T = np.array(T)[self.result]

    values = self.df.values

    pl.figure(dpi = dpi)
    for i in range(0, n_plot):
        pl.subplot(gs[i, 0])
        pl.plot(T, values[:, i], color = 'tab:red')
        pl.scatter(selected_T,
                   values[self.result, i],
                   s = 80,
                   facecolors = 'none',
                   edgecolors = 'b')
        pl.gca().set(ylabel = r'$' + str(self.df.columns.values[i]) + '$')
    if show:
        pl.show()

`subsample()`

Run the subsampling algorithm and return the subsampled ndarray.

Returns:

Type	Description
`ndarray`	Subsampled dataframe value.

Source code in causalflow/preprocessing/Subsampler.py

def subsample(self):
    """
    Execute the subsampling method and extract the selected rows of the dataframe.

    The output of the subsampling method is kept in ``self.result``.

    Returns:
        (ndarray): values of the dataframe rows selected by the subsampling method.
    """
    selected = self.ss_method.run()
    self.result = selected
    return self.df.values[selected, :]

This module provides the EntropyBasedMethod class.

Classes

EntropyBasedMethod: EntropyBasedMethod abstract class.