如何将递归 VIF 消除合并到 scikit-learn 的管道中？答案

【问题标题】：How to incorporate recursive VIF elimination into scikit-learn's pipeline?如何将递归 VIF 消除合并到 scikit-learn 的管道中？
【发布时间】：2021-11-23 05:09:27
【问题描述】：

我正在尝试在 scikit-learn 的 Pipeline 对象中实现一个自定义的转换步骤，该步骤使用 VIF（方差膨胀因子）递归地剔除特征。我参考了这里的代码：

class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
    """Transformer that drops columns whose variance inflation factor is too high.

    This is the version posted in the question; it is reported not to work.
    The NOTE(review) comments below mark the spots that look responsible.
    """

    def __init__(self, thresh=10.0):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        
        # A scaler is always created here, so the hasattr(self, 'scaler')
        # checks in fit/transform are true for instances built via __init__.
        self.scaler = preprocessing.StandardScaler()


    def fit(self, X, y=None):
        """Fit the scaler, run VIF elimination and remember the kept columns.

        ``y`` is accepted but not used. Returns ``self``.
        """
        # NOTE(review): X_copy is never read again in this method.
        X_copy = X.copy()
        print("ReduceVIF fit")
        if hasattr(self, 'scaler'):
            # NOTE(review): StandardScaler.fit returns the fitted scaler, not
            # scaled data, so X is rebound to the scaler object here and the
            # X.columns access inside calculate_vif will presumably raise
            # AttributeError -- confirm against the reported error.
            X = self.scaler.fit(X)
        X = ReduceVIF.calculate_vif(X, self.thresh)
        print(X)
        # Column labels that survived the elimination.
        self.predictors = X.columns
        return self

    def transform(self, X, y=None):
        """Scale ``X`` and return it as a DataFrame labelled with ``self.predictors``."""
        print("ReduceVIF transform")
        columns = self.predictors
        if hasattr(self, 'scaler'):
            # NOTE(review): the scaler output has every input column, while
            # `columns` holds only the columns kept by fit; nothing here
            # selects the kept columns, so the widths can disagree -- verify.
            X = pd.DataFrame(self.scaler.transform(X), columns=columns)
        return X
        # return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=10.0):
        """Repeatedly drop the column with the largest VIF while it exceeds ``thresh``.

        ``X`` is used through ``.columns``, ``.values`` and ``.drop``, so it is
        expected to be a pandas DataFrame. Returns the reduced DataFrame.
        """
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        dropped = True
        count = 0
        # Stop as soon as a pass drops nothing, or once count exceeds 15
        # (count only advances when a column is dropped).
        while dropped and count <= 15:
            
            print(count)
            variables = X.columns

            dropped = False
            
            
            # One VIF per remaining column, in column order.
            vif = [
                variance_inflation_factor(X[variables].values, X.columns.get_loc(var))
                for var in X.columns
            ]

            max_vif = max(vif)
            if max_vif > thresh:
                # list.index returns the first position holding the maximum.
                maxloc = vif.index(max_vif)
                print(f"Dropping {X.columns[maxloc]} with vif={max_vif}")
                X = X.drop([X.columns.tolist()[maxloc]], axis=1)
                dropped = True
                count +=1
            print(X.shape)
            
        return X

我试图调用/制作这样的管道

# create a feature preparation pipeline for a model
def make_finetuning_pipeline(model):
    """Build a two-step pipeline: VIF-based feature removal, then ``model``.

    Parameters
    ----------
    model
        The final estimator, registered under the step name ``'model'``.

    Returns
    -------
    A ``pipeline.Pipeline`` whose first step is ``ReduceVIF(thresh=10)``.
    """
    # Steps that were tried and are currently disabled:
    #   ('standardize', preprocessing.StandardScaler())
    #   ("feature_selection", feature_selection.RFE(linear_model.LogisticRegression(penalty='l1', solver='liblinear')))
    stages = [
        ('remove_multicollinearity', ReduceVIF(thresh=10)),
        ('model', model),
    ]
    return pipeline.Pipeline(steps=stages)

但不知何故它无法正常工作：要么报错说每个交叉验证折（fold）得到的列不一致，要么抛出属性错误（AttributeError）。有谁知道如何把 VIF 特征消除加入到 scikit-learn 的管道中？

这是我放在 GitHub Gist 上的代码片段，方便复现问题：Github Gist

【问题讨论】：

标签： python machine-learning scikit-learn pipeline

【解决方案1】：

回答我自己的问题：我花了半天时间调试，下面是一个可以运行的初步版本，虽然不够优雅，但目前已能按预期工作。

class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
    """Drop features whose variance inflation factor (VIF) exceeds a threshold.

    ``fit`` repeatedly removes the feature with the largest VIF until every
    remaining VIF is at most ``thresh`` and records the positions of the
    surviving features, counted in the original column order, in
    ``col_index``. ``transform`` then selects exactly those columns, so every
    cross-validation fold is transformed with the columns chosen on its own
    training split.

    Parameters
    ----------
    thresh : float, default=10
        VIF values above this are treated as too collinear. Values between
        5 and 10 are commonly considered acceptable.

    Attributes
    ----------
    predictor_cols : list of str
        Default feature names (breast-cancer data set). Kept for backward
        compatibility; the selection itself works on column positions.
    col_index : list of int
        Set by ``fit``: positions of the retained columns in the input.
    """

    # Single source of truth for the default names; __init__ and reset() both
    # copy from here instead of repeating the literal list.
    _DEFAULT_PREDICTOR_COLS = (
        "radius_mean",
        "texture_mean",
        "perimeter_mean",
        "area_mean",
        "smoothness_mean",
        "compactness_mean",
        "concavity_mean",
        "concave points_mean",
        "symmetry_mean",
        "fractal_dimension_mean",
        "radius_se",
        "texture_se",
        "perimeter_se",
        "area_se",
        "smoothness_se",
        "compactness_se",
        "concavity_se",
        "concave points_se",
        "symmetry_se",
        "fractal_dimension_se",
        "radius_worst",
        "texture_worst",
        "perimeter_worst",
        "area_worst",
        "smoothness_worst",
        "compactness_worst",
        "concavity_worst",
        "concave points_worst",
        "symmetry_worst",
        "fractal_dimension_worst",
    )

    def __init__(self, thresh=10):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        self.reset()

    def reset(self):
        """Restore ``predictor_cols`` to a fresh copy of the default names."""
        self.predictor_cols = list(self._DEFAULT_PREDICTOR_COLS)

    def fit(self, X, y=None):
        """Determine which columns of ``X`` survive VIF elimination.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; converted with ``np.asarray``.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        print("ReduceVIF fit")
        X = np.asarray(X)
        # Track ORIGINAL column positions through the elimination. Looking the
        # surviving names up in the already-reduced name list always yields
        # 0..k-1, which would make transform keep the first k columns instead
        # of the columns that actually survived.
        original_positions = list(range(X.shape[1]))
        _, kept_positions = ReduceVIF.calculate_vif(
            X, original_positions, self.thresh
        )
        self.col_index = kept_positions
        print("tmp", self.col_index)
        # calculate_vif no longer mutates its arguments; reset() is still
        # called so predictor_cols ends up in the same state as before.
        self.reset()
        return self

    def transform(self, X, y=None):
        """Return only the columns of ``X`` selected during ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data with the same column layout as the data passed to ``fit``.
        y : ignored

        Returns
        -------
        numpy.ndarray of shape (n_samples, len(col_index))
        """
        print("ReduceVIF transform")
        # np.asarray lets DataFrames and nested lists through as well;
        # an ndarray input is used as-is.
        return np.asarray(X)[:, self.col_index]

    @staticmethod
    def calculate_vif(X, columns, thresh=10.0, max_drops=16):
        """Iteratively drop the column with the highest VIF above ``thresh``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix. Not modified.
        columns : sequence
            One label per column of ``X``. Not modified; a reduced copy is
            returned.
        thresh : float, default=10.0
            Elimination stops once the largest VIF is not above this value.
        max_drops : int, default=16
            Upper bound on the number of columns removed.

        Returns
        -------
        (numpy.ndarray, list)
            The reduced matrix and the labels of the columns that remain.
        """
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        X = np.asarray(X)
        # Work on a copy so the caller's list is left untouched.
        columns = list(columns)
        count = 0
        while count < max_drops:
            n_features = X.shape[1]
            # VIF regresses each column on the others, so at least two
            # columns are needed; this also avoids max() on an empty list.
            if n_features < 2:
                break
            print(count)

            vif = [variance_inflation_factor(X, var) for var in range(n_features)]

            max_vif = max(vif)
            if not max_vif > thresh:
                break
            # First position holding the maximum, as list.index returns it.
            maxloc = vif.index(max_vif)
            print(f"Dropping {maxloc} with vif={max_vif}")
            X = np.delete(X, maxloc, axis=1)
            columns.pop(maxloc)
            count += 1
        return X, columns

【讨论】：