【问题标题】:How to incorporate recursive VIF elimination into scikit-learn's pipeline?如何将递归 VIF 消除合并到 scikit-learn 的管道中?
【发布时间】:2021-11-23 05:09:27
【问题描述】:

我正在尝试在scikit-learnpipeline 对象中实现自定义管道。管道是使用 VIF 递归地消除特征。我参考了代码here

class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, thresh=10.0):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh
        
        self.scaler = preprocessing.StandardScaler()


    def fit(self, X, y=None):
        X_copy = X.copy()
        print("ReduceVIF fit")
        if hasattr(self, 'scaler'):
            X = self.scaler.fit(X)
        X = ReduceVIF.calculate_vif(X, self.thresh)
        print(X)
        self.predictors = X.columns
        return self

    def transform(self, X, y=None):
        print("ReduceVIF transform")
        columns = self.predictors
        if hasattr(self, 'scaler'):
            X = pd.DataFrame(self.scaler.transform(X), columns=columns)
        return X
        # return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=10.0):
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        dropped = True
        count = 0
        while dropped and count <= 15:
            
            print(count)
            variables = X.columns

            dropped = False
            
            
            vif = [
                variance_inflation_factor(X[variables].values, X.columns.get_loc(var))
                for var in X.columns
            ]

            max_vif = max(vif)
            if max_vif > thresh:
                maxloc = vif.index(max_vif)
                print(f"Dropping {X.columns[maxloc]} with vif={max_vif}")
                X = X.drop([X.columns.tolist()[maxloc]], axis=1)
                dropped = True
                count +=1
            print(X.shape)
            
        return X

我试图调用/制作这样的管道

# create a feature preparation pipeline for a model
def make_finetuning_pipeline(model):
    steps = list()
    # standardization
    #steps.append(('standardize', preprocessing.StandardScaler()))
    steps.append(('remove_multicollinearity', ReduceVIF(thresh=10)))
    #steps.append(("feature_selection", feature_selection.RFE(linear_model.LogisticRegression(penalty='l1', solver='liblinear'))))
    # the model
    steps.append(('model', model))
    # create pipeline
    _pipeline = pipeline.Pipeline(steps=steps)
    return _pipeline

但不知何故它不起作用,错误是每个折叠都有不同的列,或者存在属性错误。有谁知道如何将 VIF 加入到 scikit 学习管道中?

这是我在 github gist 中的 sn-p 代码,用于重现性。 Github Gist

【问题讨论】:

    标签: python machine-learning scikit-learn pipeline


    【解决方案1】:

    回答我自己的问题:我花了半天时间尝试调试,工作的初步版本如下,不优雅,但现在它按预期工作。

    class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
        def __init__(self, thresh=10):
            # From looking at documentation, values between 5 and 10 are "okay".
            # Above 10 is too high and so should be removed.
            self.thresh = thresh
            self.predictor_cols = [
                "radius_mean",
                "texture_mean",
                "perimeter_mean",
                "area_mean",
                "smoothness_mean",
                "compactness_mean",
                "concavity_mean",
                "concave points_mean",
                "symmetry_mean",
                "fractal_dimension_mean",
                "radius_se",
                "texture_se",
                "perimeter_se",
                "area_se",
                "smoothness_se",
                "compactness_se",
                "concavity_se",
                "concave points_se",
                "symmetry_se",
                "fractal_dimension_se",
                "radius_worst",
                "texture_worst",
                "perimeter_worst",
                "area_worst",
                "smoothness_worst",
                "compactness_worst",
                "concavity_worst",
                "concave points_worst",
                "symmetry_worst",
                "fractal_dimension_worst",
            ]
    
        def reset(self):
    
            self.predictor_cols = [
                "radius_mean",
                "texture_mean",
                "perimeter_mean",
                "area_mean",
                "smoothness_mean",
                "compactness_mean",
                "concavity_mean",
                "concave points_mean",
                "symmetry_mean",
                "fractal_dimension_mean",
                "radius_se",
                "texture_se",
                "perimeter_se",
                "area_se",
                "smoothness_se",
                "compactness_se",
                "concavity_se",
                "concave points_se",
                "symmetry_se",
                "fractal_dimension_se",
                "radius_worst",
                "texture_worst",
                "perimeter_worst",
                "area_worst",
                "smoothness_worst",
                "compactness_worst",
                "concavity_worst",
                "concave points_worst",
                "symmetry_worst",
                "fractal_dimension_worst",
            ]
    
        def fit(self, X, y=None):
            print("ReduceVIF fit")
            tmp, self.predictor_cols = ReduceVIF.calculate_vif(X, self.predictor_cols, self.thresh)
            col_index = [self.predictor_cols.index(col_name) for col_name in self.predictor_cols]
            self.col_index = col_index
            print("tmp", self.col_index)
            self.reset()
            return self
    
        def transform(self, X, y=None):
            print("ReduceVIF transform")
            # columns = X.columns.tolist()
            # print(X.shape)
            return X[:, self.col_index]
    
        @staticmethod
        def calculate_vif(X, columns, thresh=10.0):
            # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
            dropped = True
            count = 0
            while dropped and count <= 15:
                column_index = X.shape[1]
                predictor_cols = np.arange(X.shape[1])
                dropped = False
                print(count)
    
                vif = []
                for var in range(column_index):
                    # print(predictor_cols.shape)
                    vif.append(variance_inflation_factor(X[:, predictor_cols], var))
    
                max_vif = max(vif)
                if max_vif > thresh:
                    maxloc = vif.index(max_vif)
                    print(f"Dropping {maxloc} with vif={max_vif}")
                    # X = X.drop([X.columns.tolist()[maxloc]], axis=1)
                    X = np.delete(X, maxloc, axis=1)
                    columns.pop(maxloc)
                    dropped = True
                    count += 1
            return X, columns
    

    【讨论】:

      猜你喜欢
      • 2016-08-09
      • 2017-03-30
      • 2015-11-19
      • 2015-08-14
      • 2016-10-25
      • 1970-01-01
      • 2017-06-18
      • 2013-08-17
      • 2018-08-02
      相关资源
      最近更新 更多