【发布时间】:2021-11-23 05:09:27
【问题描述】:
我正在尝试在scikit-learn 的pipeline 对象中实现自定义管道。管道是使用 VIF 递归地消除特征。我参考了代码here
class ReduceVIF(base.BaseEstimator, base.TransformerMixin):
def __init__(self, thresh=10.0):
# From looking at documentation, values between 5 and 10 are "okay".
# Above 10 is too high and so should be removed.
self.thresh = thresh
self.scaler = preprocessing.StandardScaler()
def fit(self, X, y=None):
X_copy = X.copy()
print("ReduceVIF fit")
if hasattr(self, 'scaler'):
X = self.scaler.fit(X)
X = ReduceVIF.calculate_vif(X, self.thresh)
print(X)
self.predictors = X.columns
return self
def transform(self, X, y=None):
print("ReduceVIF transform")
columns = self.predictors
if hasattr(self, 'scaler'):
X = pd.DataFrame(self.scaler.transform(X), columns=columns)
return X
# return ReduceVIF.calculate_vif(X, self.thresh)
@staticmethod
def calculate_vif(X, thresh=10.0):
# Taken from https://stats.stackexchange.com/a/253620/53565 and modified
dropped = True
count = 0
while dropped and count <= 15:
print(count)
variables = X.columns
dropped = False
vif = [
variance_inflation_factor(X[variables].values, X.columns.get_loc(var))
for var in X.columns
]
max_vif = max(vif)
if max_vif > thresh:
maxloc = vif.index(max_vif)
print(f"Dropping {X.columns[maxloc]} with vif={max_vif}")
X = X.drop([X.columns.tolist()[maxloc]], axis=1)
dropped = True
count +=1
print(X.shape)
return X
我试图调用/制作这样的管道
# create a feature preparation pipeline for a model
def make_finetuning_pipeline(model):
steps = list()
# standardization
#steps.append(('standardize', preprocessing.StandardScaler()))
steps.append(('remove_multicollinearity', ReduceVIF(thresh=10)))
#steps.append(("feature_selection", feature_selection.RFE(linear_model.LogisticRegression(penalty='l1', solver='liblinear'))))
# the model
steps.append(('model', model))
# create pipeline
_pipeline = pipeline.Pipeline(steps=steps)
return _pipeline
但不知何故它不起作用,错误是每个折叠都有不同的列,或者存在属性错误。有谁知道如何将 VIF 加入到 scikit 学习管道中?
这是我在 github gist 中的 sn-p 代码,用于重现性。 Github Gist
【问题讨论】:
标签: python machine-learning scikit-learn pipeline