Skip to content

Preprocessing

Data

The Data class manages the preprocessing of the data before the causal analysis.

Source code in fpcmci/preprocessing/data.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
class Data():
    """
    Data class manages the preprocessing of the data before the causal analysis

    The data is stored as a pandas.DataFrame in the attribute ``d``. The constructor also records
    ``orig_features``, ``orig_pretty_features``, ``orig_N`` and ``orig_T``: feature names, number of
    features and number of samples as they are right after loading, i.e. before NaN filling,
    subsampling, standardization or any later call to ``shrink``.
    """
    def __init__(self, data, vars = None, fill_nan = True, stand = False, subsampling : SubsamplingMethod = None, show_subsampling = False):
        """
        Data class constructor

        A DataFrame passed as ``data`` is wrapped through a shallow copy, so renaming the columns
        (``vars``) and filling the NaNs never alter the caller's object.

        Args:
            data (str/DataFrame/np.array): it can be a string specifying the path of a csv file to load/pandas.DataFrame/numpy.array
            vars (list(str), optional): List containing variable names. If unset then, 
                if data = (str/DataFrame) vars = data columns name elif data = np.array vars = [X_0 .. X_{N-1}]
                Defaults to None.
            fill_nan (bool, optional): Fill NaNs bit (forward fill, then backward fill). Defaults to True.
            stand (bool, optional): Standardization bit. Defaults to False.
            subsampling (SubsamplingMethod, optional): Subsampling method. If None not active. Defaults to None.
            show_subsampling (bool, optional): If True shows subsampling result. Defaults to False.

        Raises:
            TypeError: if data is not str - DataFrame - ndarray
        """
        # Data handling (isinstance also accepts subclasses of the supported types)
        if isinstance(data, np.ndarray):
            self.d = pd.DataFrame(data)
            if vars is None:
                self.d.columns = ['X_' + str(f) for f in range(len(self.d.columns))]
        elif isinstance(data, pd.DataFrame):
            # Shallow copy: cheap (values are shared) but gives this object its own
            # column index, so the renaming below does not leak into the caller's frame
            self.d = data.copy(deep = False)
        elif isinstance(data, str):
            self.d = pd.read_csv(data)
        else:
            raise TypeError("data field not in the correct type\ndata must be one of the following type:\n- numpy.ndarray\n- pandas.DataFrame\n- .csv path")


        # Columns name handling
        if vars is not None:
            self.d.columns = list(vars)


        # Snapshot of the loaded data, taken before any preprocessing step
        self.orig_features = self.features
        self.orig_pretty_features = self.pretty_features
        self.orig_N = self.N
        self.orig_T = len(self.d)

        # Filling NaNs
        # ffill()/bfill() replace the deprecated fillna(method=...) and return new
        # frames instead of writing into the (possibly shared) underlying values.
        # The backward fill covers NaNs at the head of a column, which have no
        # previous value to propagate.
        if fill_nan and self.d.isnull().values.any():
            self.d = self.d.ffill().bfill()

        # Subsampling data
        if subsampling is not None:
            subsampler = Subsampler(self.d, ss_method = subsampling)
            self.d = pd.DataFrame(subsampler.subsample(), columns = self.features)
            if show_subsampling: subsampler.plot_subsampled_data()

        # Standardize data
        if stand:
            scaler = StandardScaler()
            scaler = scaler.fit(self.d)
            self.d = pd.DataFrame(scaler.transform(self.d), columns = self.features)


    @property  
    def features(self):
        """
        Returns list of features

        Returns:
            list(str): list of feature names
        """
        return list(self.d.columns)

    @property
    def pretty_features(self):
        """
        Returns list of features with LATEX symbols

        Returns:
            list(str): list of feature names, each one wrapped in '$'
        """
        return [r'$' + str(v) + '$' for v in self.d.columns]

    @property
    def N(self):
        """
        Number of features

        Returns:
            (int): number of features
        """
        return len(self.d.columns)

    @property
    def T(self):
        """
        Dataframe length

        Returns:
            (int): dataframe length
        """
        return len(self.d)


    def shrink(self, selected_features):
        """
        Shrinks dataframe d and dependencies based on the selected features

        Args:
            selected_features (list(str)): features selected by the selector
        """
        self.d = self.d[selected_features]


    def plot_timeseries(self):
        """
        Plots timeseries data, one subplot (row) per feature
        """
        # Create grid
        gs = gridspec.GridSpec(self.N, 1)

        # Time vector
        T = list(range(self.T))

        # Computed once: both are rebuilt from the dataframe at every access
        values = self.d.values
        labels = self.pretty_features

        plt.figure()
        for i, label in enumerate(labels):
            plt.subplot(gs[i, 0])
            plt.plot(T, values[:, i], color = 'tab:red')
            plt.ylabel(str(label))

        plt.show()
N property

Number of features

Returns:

Type Description
int

number of features

T property

Dataframe length

Returns:

Type Description
int

dataframe length

features property

Returns list of features

Returns:

Name Type Description
list str

list of feature names

pretty_features property

Returns list of features with LATEX symbols

Returns:

Name Type Description
list str

list of feature names

__init__(data, vars=None, fill_nan=True, stand=False, subsampling=None, show_subsampling=False)

Data class constructor

Parameters:

Name Type Description Default
data str / DataFrame / np.array

It can be a string specifying the path of a csv file to load, a pandas.DataFrame or a numpy.array.

required
vars list(str)

List containing variable names. If unset: when data is a csv path or a DataFrame, the data column names are used; when data is a np.array, vars = [X_0 .. X_{N-1}]. Defaults to None.

None
fill_nan bool

Fill NaNs bit. Defaults to True.

True
stand bool

Standardization bit. Defaults to False.

False
subsampling SubsamplingMethod

Subsampling method. If None not active. Defaults to None.

None
show_subsampling bool

If True shows subsampling result. Defaults to False.

False

Raises:

Type Description
TypeError

if data is not str - DataFrame - ndarray

Source code in fpcmci/preprocessing/data.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def __init__(self, data, vars = None, fill_nan = True, stand = False, subsampling : SubsamplingMethod = None, show_subsampling = False):
    """
    Data class constructor

    A DataFrame passed as ``data`` is wrapped through a shallow copy, so renaming the columns
    (``vars``) and filling the NaNs never alter the caller's object.

    Args:
        data (str/DataFrame/np.array): it can be a string specifying the path of a csv file to load/pandas.DataFrame/numpy.array
        vars (list(str), optional): List containing variable names. If unset then, 
            if data = (str/DataFrame) vars = data columns name elif data = np.array vars = [X_0 .. X_{N-1}]
            Defaults to None.
        fill_nan (bool, optional): Fill NaNs bit (forward fill, then backward fill). Defaults to True.
        stand (bool, optional): Standardization bit. Defaults to False.
        subsampling (SubsamplingMethod, optional): Subsampling method. If None not active. Defaults to None.
        show_subsampling (bool, optional): If True shows subsampling result. Defaults to False.

    Raises:
        TypeError: if data is not str - DataFrame - ndarray
    """
    # Data handling (isinstance also accepts subclasses of the supported types)
    if isinstance(data, np.ndarray):
        self.d = pd.DataFrame(data)
        if vars is None:
            self.d.columns = ['X_' + str(f) for f in range(len(self.d.columns))]
    elif isinstance(data, pd.DataFrame):
        # Shallow copy: cheap (values are shared) but gives this object its own
        # column index, so the renaming below does not leak into the caller's frame
        self.d = data.copy(deep = False)
    elif isinstance(data, str):
        self.d = pd.read_csv(data)
    else:
        raise TypeError("data field not in the correct type\ndata must be one of the following type:\n- numpy.ndarray\n- pandas.DataFrame\n- .csv path")


    # Columns name handling
    if vars is not None:
        self.d.columns = list(vars)


    # Snapshot of the loaded data, taken before any preprocessing step
    self.orig_features = self.features
    self.orig_pretty_features = self.pretty_features
    self.orig_N = self.N
    self.orig_T = len(self.d)

    # Filling NaNs
    # ffill()/bfill() replace the deprecated fillna(method=...) and return new
    # frames instead of writing into the (possibly shared) underlying values.
    # The backward fill covers NaNs at the head of a column, which have no
    # previous value to propagate.
    if fill_nan and self.d.isnull().values.any():
        self.d = self.d.ffill().bfill()

    # Subsampling data
    if subsampling is not None:
        subsampler = Subsampler(self.d, ss_method = subsampling)
        self.d = pd.DataFrame(subsampler.subsample(), columns = self.features)
        if show_subsampling: subsampler.plot_subsampled_data()

    # Standardize data
    if stand:
        scaler = StandardScaler()
        scaler = scaler.fit(self.d)
        self.d = pd.DataFrame(scaler.transform(self.d), columns = self.features)

plot_timeseries()

Plots timeseries data

Source code in fpcmci/preprocessing/data.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def plot_timeseries(self):
    """
    Plots timeseries data, one subplot (row) per feature
    """
    # Create grid
    gs = gridspec.GridSpec(self.N, 1)

    # Time vector
    T = list(range(self.T))

    # Computed once: both are rebuilt from the dataframe at every access
    values = self.d.values
    labels = self.pretty_features

    plt.figure()
    for i, label in enumerate(labels):
        plt.subplot(gs[i, 0])
        plt.plot(T, values[:, i], color = 'tab:red')
        plt.ylabel(str(label))

    plt.show()

shrink(selected_features)

Shrinks dataframe d and dependencies based on the selected features

Parameters:

Name Type Description Default
selected_features list(str)

features selected by the selector

required
Source code in fpcmci/preprocessing/data.py
113
114
115
116
117
118
119
120
def shrink(self, selected_features):
    """
    Restricts the dataframe d to the selected features

    The dataframe stored in d is replaced by the result of indexing it with
    selected_features.

    Args:
        selected_features (list(str)): features selected by the selector
    """
    reduced = self.d[selected_features]
    self.d = reduced