import os
import pickle
import numpy as np
import pandas as pd
from numpy import arange
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from ..utilities.loadConfigFile import loadConfigFile
from itertools import chain
class FullPipeline:
    """
    This class chains multiple pipelines (the missing values pipeline, the preprocessing pipeline, ...)
    into one single pipeline.

    All state lives on the class itself (``pipelines`` and ``columns`` are class attributes), so it is
    shared by every instance.
    """

    # Pipelines registered through ``addPipeline``; one list shared by all instances.
    pipelines = []
    # Column labels captured by ``initialize``; stays ``None`` until ``initialize`` is called.
    columns = None

    @classmethod
    def initialize(cls, data):
        """
        This function initializes the full pipeline with the Pandas dataframe.
        The column names will be stored as a class attribute and then recovered when needed.

        :param data: Pandas dataframe needed for the initialization
        :type data: Pandas Dataframe
        """
        cls.columns = data.columns

    @classmethod
    def recoverColumnsNames(cls):
        """
        This function returns the column names stored by ``initialize``.

        :return: The ``columns`` attribute of the dataframe passed to ``initialize``, or ``None`` when
            ``initialize`` has not been called yet
        :rtype: Pandas Index or None
        """
        return cls.columns

    def addPipeline(self, pipeline):
        """
        This function adds a given pipeline to the full pipeline.

        The pipeline is appended to the class-level ``pipelines`` list, so it becomes visible through
        every instance.

        :param pipeline: Data pipeline to be added to the full pipeline
        :type pipeline: Pipeline
        """
        self.__class__.pipelines.append(pipeline)
class MissingValuesPipeline:
    """
    This class provides a pipeline for completing missing values.

    The per-column imputation pipelines are kept in the class-level ``pipelines`` dictionary, so they are
    shared by every instance of this class.
    """

    # Strategy names accepted by ``addSimpleImputerPipeline``.
    allowed_strategies = ("mean", "median", "most_frequent", "constant")
    # Maps a column name to the imputation pipeline registered for it.
    pipelines = {}

    def __init__(self):
        """
        Check whether the full pipeline is initialized with the dataframe.

        :raises AssertionError: If ``FullPipeline.columns`` is still ``None`` or is empty
        """
        columns = FullPipeline.columns
        # ``columns`` is ``None`` until ``FullPipeline.initialize`` has run. Test for that first so the
        # caller gets the message below rather than an ``AttributeError`` on ``None.empty``.
        assert columns is not None and not columns.empty, \
            "You have to initialize the entire pipeline with the data in order to keep " \
            "the columns names! "

    def addSimpleImputerPipeline(self, column, strategy="most_frequent"):
        """
        This function provides basic strategies for imputing missing values that can be imputed with a provided
        constant value, or using the statistics (mean, median or most frequent) of a column in which the
        missing values are located.

        The resulting one-step pipeline is stored under ``column`` in the class-level ``pipelines``
        dictionary, replacing any pipeline registered for that column before.

        :param column: Column of the Pandas dataframe
        :type column: String
        :param strategy: Strategy of the imputation, defaults to "most_frequent"
        :type strategy: String, optional
        :raises AssertionError: If ``strategy`` is not one of ``allowed_strategies``
        """
        assert strategy in self.__class__.allowed_strategies, f"{strategy}: Unknown imputation strategy!"
        # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was removed in NumPy 2.0.
        imputer = SimpleImputer(strategy=strategy, missing_values=np.nan)
        self.__class__.pipelines[column] = Pipeline([("fillna", imputer)])

    def buildPipeline(self):
        """
        This function builds the missing values pipeline and prepares it to be fed with the Pandas dataframe.

        It loads the pickled data referenced by the configuration and registers an imputation pipeline for
        every configured feature or label that is a column of that data, using the ``"missing"`` entry of
        its configuration as the strategy.

        :raises AssertionError: If a configured strategy is not one of ``allowed_strategies``
        """
        # Read the configuration once and reuse it.
        # NOTE(review): assumes repeated loadConfigFile() calls return the same content - confirm.
        config = loadConfigFile()
        pickleDir = config.get("dirConfig").get("pklDir")
        pickleFile = config.get("fileConfig").get("pickledData_afterThFiltering")
        filePath = os.path.join(pickleDir, pickleFile)
        # NOTE(review): unpickling can execute arbitrary code; the path comes from the configuration, so
        # this is only safe while that file is produced by a trusted step.
        with open(filePath, "rb") as pickledData:
            data = pickle.load(pickledData)
        configuredColumns = chain(config.get("features").items(), config.get("labels").items())
        for featureOrLabel, strategy in configuredColumns:
            if featureOrLabel in data.columns:
                self.addSimpleImputerPipeline(featureOrLabel, strategy.get("missing"))
class PreprocessingPipeline:
    """
    This class provides a pipeline for data preprocessing like one-hot encoding.

    The per-column preprocessing steps are kept in the class-level ``pipelines`` dictionary, so they are
    shared by every instance of this class.
    """

    # Strategy names accepted by ``addOnehotEncoderPipeline``; the string "None" means "leave as is".
    allowed_strategies = ("one_hot", "None")
    # Maps a column name to a callable taking the dataframe, or to ``None`` when nothing is applied.
    pipelines = {}

    def __init__(self):
        """
        Check whether the full pipeline is initialized with the dataframe.

        :raises AssertionError: If ``FullPipeline.columns`` is still ``None`` or is empty
        """
        columns = FullPipeline.columns
        # ``columns`` is ``None`` until ``FullPipeline.initialize`` has run. Test for that first so the
        # caller gets the message below rather than an ``AttributeError`` on ``None.empty``.
        assert columns is not None and not columns.empty, \
            "You have to initialize the entire pipeline with the data in order to keep the columns names!"

    def addOnehotEncoderPipeline(self, column, strategy=None):
        """
        Encode categorical feature as a one-hot numeric array

        For the "one_hot" strategy a callable is stored under ``column`` in the class-level ``pipelines``
        dictionary; called with a dataframe, it returns ``pd.get_dummies`` of that column, prefixed with
        the column name. For the "None" strategy (or ``None``) the stored entry is ``None``.

        :param column: Column of the Pandas dataframe
        :type column: String
        :param strategy: Preprocessing strategy, defaults to None
        :type strategy: String or None, optional
        :raises AssertionError: If ``strategy`` is neither ``None`` nor one of ``allowed_strategies``
        """
        if strategy is None:
            # The documented default must pass validation: treat it like the "None" strategy.
            strategy = "None"
        assert strategy in self.__class__.allowed_strategies, f"{strategy}: Unknown strategy!"
        if strategy != "None":
            self.__class__.pipelines[column] = lambda dataframe: pd.get_dummies(dataframe[column], prefix=column)
        else:
            self.__class__.pipelines[column] = None

    def buildPipeline(self):
        """
        This function builds the preprocessing pipeline and prepares it to be fed with the Pandas dataframe.

        It loads the pickled data referenced by the configuration and registers a preprocessing step for
        every configured feature or label that is a column of that data, using the ``"preprocessing"``
        entry of its configuration as the strategy.

        :raises AssertionError: If a configured strategy is not accepted by ``addOnehotEncoderPipeline``
        """
        # Read the configuration once and reuse it.
        # NOTE(review): assumes repeated loadConfigFile() calls return the same content - confirm.
        config = loadConfigFile()
        pickleDir = config.get("dirConfig").get("pklDir")
        pickleFile = config.get("fileConfig").get("pickledData_afterThFiltering")
        filePath = os.path.join(pickleDir, pickleFile)
        # NOTE(review): unpickling can execute arbitrary code; the path comes from the configuration, so
        # this is only safe while that file is produced by a trusted step.
        with open(filePath, "rb") as pickledData:
            data = pickle.load(pickledData)
        configuredColumns = chain(config.get("features").items(), config.get("labels").items())
        for featureOrLabel, strategy in configuredColumns:
            if featureOrLabel in data.columns:
                self.addOnehotEncoderPipeline(featureOrLabel, strategy.get("preprocessing"))