Preprocessing

`Data`

The Data class manages the preprocessing of the data before the causal analysis.

Source code in fpcmci/preprocessing/data.py

class Data():
    """
    Data class manages the preprocessing of the data before the causal analysis

    The constructor loads the input into the DataFrame ``self.d`` and then
    optionally fills NaNs, subsamples and standardizes it, in that order.
    The ``orig_*`` attributes record the feature names, the number of features
    and the length as they were right after loading, before any of those steps.
    """
    def __init__(self, data, vars = None, fill_nan = True, stand = False, subsampling : "SubsamplingMethod" = None, show_subsampling = False):
        """
        Data class constructor

        Args:
            data (str/DataFrame/np.array): it can be a string specifying the path of a csv file to load/pandas.DataFrame/numpy.array.
                A DataFrame is copied, so the object passed by the caller is never modified.
            vars (list(str), optional): List containing variable names. If unset then, 
                if data = (str/DataFrame) vars = data columns name elif data = np.array vars = [X_0 .. X_N]
                Defaults to None.
            fill_nan (bool, optional): Fill NaNs bit. If True, NaNs are forward-filled and then backward-filled. Defaults to True.
            stand (bool, optional): Standardization bit. Defaults to False.
            subsampling (SubsamplingMethod, optional): Subsampling method. If None not active. Defaults to None.
            show_subsampling (bool, optional): If True shows subsampling result. Defaults to False.

        Raises:
            TypeError: if data is not str - DataFrame - ndarray
        """
        # Data handling
        # isinstance (instead of an exact type comparison) also accepts subclasses
        if isinstance(data, np.ndarray):
            self.d = pd.DataFrame(data)
            if vars is None:
                self.d.columns = ['X_' + str(f) for f in range(len(self.d.columns))]
        elif isinstance(data, pd.DataFrame):
            # Copy: renaming the columns below must not alter the caller's DataFrame
            self.d = data.copy()
        elif isinstance(data, str):
            self.d = pd.read_csv(data)
        else:
            raise TypeError("data field not in the correct type\ndata must be one of the following type:\n- numpy.ndarray\n- pandas.DataFrame\n- .csv path")


        # Columns name handling
        if vars is not None:
            self.d.columns = list(vars)


        # Snapshot of the data layout taken before NaN filling, subsampling and standardization
        self.orig_features = self.features
        self.orig_pretty_features = self.pretty_features
        self.orig_N = self.N
        self.orig_T = len(self.d)

        # Filling NaNs
        if fill_nan and self.d.isnull().values.any():
            # Forward fill first; the backward fill then covers NaNs at the start of a column.
            # ffill()/bfill() replace the deprecated fillna(method=...) and return a new
            # frame, so the input buffer is left untouched.
            self.d = self.d.ffill().bfill()

        # Subsampling data
        if subsampling is not None:
            subsampler = Subsampler(self.d, ss_method = subsampling)
            self.d = pd.DataFrame(subsampler.subsample(), columns = self.features)
            if show_subsampling: subsampler.plot_subsampled_data()

        # Standardize data
        if stand:
            scaler = StandardScaler().fit(self.d)
            self.d = pd.DataFrame(scaler.transform(self.d), columns = self.features)


    @property  
    def features(self):
        """
        Returns list of features

        Returns:
            list(str): list of feature names
        """
        return list(self.d.columns)

    @property
    def pretty_features(self):
        """
        Returns list of features with LATEX symbols

        Each feature name is wrapped between two '$' characters.

        Returns:
            list(str): list of feature names
        """
        return [r'$' + str(v) + '$' for v in self.d.columns]

    @property
    def N(self):
        """
        Number of features

        Returns:
            (int): number of features
        """
        return len(self.d.columns)

    @property
    def T(self):
        """
        Dataframe length

        Returns:
            (int): dataframe length
        """
        return len(self.d)


    def shrink(self, selected_features):
        """
        Shrinks dataframe d based on the selected features

        Only self.d is replaced; the orig_* attributes set by the constructor are left as they are.

        Args:
            selected_features (list(str)): features selected by the selector
        """
        self.d = self.d[selected_features]


    def plot_timeseries(self):
        """
        Plots timeseries data

        One subplot per feature is stacked vertically in a single figure: the x axis
        is the sample index and the y label is the corresponding pretty feature name.
        """
        # Create grid
        gs = gridspec.GridSpec(self.N, 1)

        # Time vector
        T = list(range(self.T))

        # Read once instead of at every iteration
        values = self.d.values
        labels = self.pretty_features

        plt.figure()
        for i in range(self.d.shape[1]):
            plt.subplot(gs[i, 0])
            plt.plot(T, values[:, i], color = 'tab:red')
            plt.ylabel(str(labels[i]))

        plt.show()

`N` `property`

Number of features

Returns:

Type	Description
`int`	number of features

`T` `property`

Dataframe length

Returns:

Type	Description
`int`	dataframe length

`features` `property`

Returns list of features

Returns:

Type	Description
`list(str)`	list of feature names

`pretty_features` `property`

Returns list of features with LATEX symbols

Returns:

Type	Description
`list(str)`	list of feature names

`__init__(data, vars=None, fill_nan=True, stand=False, subsampling=None, show_subsampling=False)`

Data class constructor

Parameters:

Name	Type	Description	Default
`data`	`str / DataFrame / np.array`	it can be a string specifying the path of a csv file to load/pandas.DataFrame/numpy.array	required
`vars`	`list(str)`	List containing variable names. If unset then, if data = (str/DataFrame) vars = data columns name elif data = np.array vars = [X_0 .. X_N] Defaults to None.	`None`
`fill_nan`	`bool`	Fill NaNs bit. Defaults to True.	`True`
`stand`	`bool`	Standardization bit. Defaults to False.	`False`
`subsampling`	`SubsamplingMethod`	Subsampling method. If None not active. Defaults to None.	`None`
`show_subsampling`	`bool`	If True shows subsampling result. Defaults to False.	`False`

Raises:

Type	Description
`TypeError`	if data is not str - DataFrame - ndarray

Source code in fpcmci/preprocessing/data.py

def __init__(self, data, vars = None, fill_nan = True, stand = False, subsampling : "SubsamplingMethod" = None, show_subsampling = False):
    """
    Data class constructor

    Loads the input into the DataFrame ``self.d`` and then optionally fills NaNs,
    subsamples and standardizes it, in that order.

    Args:
        data (str/DataFrame/np.array): it can be a string specifying the path of a csv file to load/pandas.DataFrame/numpy.array.
            A DataFrame is copied, so the object passed by the caller is never modified.
        vars (list(str), optional): List containing variable names. If unset then, 
            if data = (str/DataFrame) vars = data columns name elif data = np.array vars = [X_0 .. X_N]
            Defaults to None.
        fill_nan (bool, optional): Fill NaNs bit. If True, NaNs are forward-filled and then backward-filled. Defaults to True.
        stand (bool, optional): Standardization bit. Defaults to False.
        subsampling (SubsamplingMethod, optional): Subsampling method. If None not active. Defaults to None.
        show_subsampling (bool, optional): If True shows subsampling result. Defaults to False.

    Raises:
        TypeError: if data is not str - DataFrame - ndarray
    """
    # Data handling
    # isinstance (instead of an exact type comparison) also accepts subclasses
    if isinstance(data, np.ndarray):
        self.d = pd.DataFrame(data)
        if vars is None:
            self.d.columns = ['X_' + str(f) for f in range(len(self.d.columns))]
    elif isinstance(data, pd.DataFrame):
        # Copy: renaming the columns below must not alter the caller's DataFrame
        self.d = data.copy()
    elif isinstance(data, str):
        self.d = pd.read_csv(data)
    else:
        raise TypeError("data field not in the correct type\ndata must be one of the following type:\n- numpy.ndarray\n- pandas.DataFrame\n- .csv path")


    # Columns name handling
    if vars is not None:
        self.d.columns = list(vars)


    # Snapshot of the data layout taken before NaN filling, subsampling and standardization
    self.orig_features = self.features
    self.orig_pretty_features = self.pretty_features
    self.orig_N = self.N
    self.orig_T = len(self.d)

    # Filling NaNs
    if fill_nan and self.d.isnull().values.any():
        # Forward fill first; the backward fill then covers NaNs at the start of a column.
        # ffill()/bfill() replace the deprecated fillna(method=...) and return a new
        # frame, so the input buffer is left untouched.
        self.d = self.d.ffill().bfill()

    # Subsampling data
    if subsampling is not None:
        subsampler = Subsampler(self.d, ss_method = subsampling)
        self.d = pd.DataFrame(subsampler.subsample(), columns = self.features)
        if show_subsampling: subsampler.plot_subsampled_data()

    # Standardize data
    if stand:
        scaler = StandardScaler().fit(self.d)
        self.d = pd.DataFrame(scaler.transform(self.d), columns = self.features)

`plot_timeseries()`

Plots timeseries data

Source code in fpcmci/preprocessing/data.py

def plot_timeseries(self):
    """
    Draws every column of the dataframe as its own time series

    The subplots are stacked vertically in a single figure, one per column:
    the x axis is the sample index and the y label is the matching entry of
    self.pretty_features. The figure is displayed with plt.show().
    """
    # One grid row per feature
    layout = gridspec.GridSpec(self.N, 1)

    # Sample indices used as x axis
    time_axis = list(range(self.T))

    plt.figure()
    n_columns = self.d.shape[1]
    for col in range(n_columns):
        plt.subplot(layout[col, 0])
        series = self.d.values[:, col]
        plt.plot(time_axis, series, color = 'tab:red')
        plt.ylabel(str(self.pretty_features[col]))

    plt.show()

`shrink(selected_features)`

Shrinks dataframe d and dependencies based on the selected features

Parameters:

Name	Type	Description	Default
`selected_features`	`list(str)`	features selected by the selector	required

Source code in fpcmci/preprocessing/data.py

def shrink(self, selected_features):
    """
    Restricts dataframe d to the selected features

    The result of indexing self.d with selected_features replaces self.d.

    Args:
        selected_features (list(str)): names of the features to keep
    """
    reduced = self.d[selected_features]
    self.d = reduced

Preprocessing

Data

N property

T property

features property

pretty_features property

__init__(data, vars=None, fill_nan=True, stand=False, subsampling=None, show_subsampling=False)

plot_timeseries()

shrink(selected_features)

`Data`

`N` `property`

`T` `property`

`features` `property`

`pretty_features` `property`

`__init__(data, vars=None, fill_nan=True, stand=False, subsampling=None, show_subsampling=False)`

`plot_timeseries()`

`shrink(selected_features)`