【问题标题】:How do I add a custom intermediate preprocessor in a machine learning pipeline that handles n-gram columns in scikit-learn? 如何在机器学习管道中添加一个自定义中间预处理器来处理 scikit-learn 中的 n-gram 列?
【发布时间】:2020-01-13 18:48:11
【问题描述】:

在 ML 预处理步骤中处理 n-gram 变量(例如 SUBSTRING_4L_V3)给我带来了一些问题。

我能够分别转换和标准化数值、分类和 n-gram 变量,如下所示:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer

# Toy dataset of eight records: one numeric column, two categorical columns,
# several encodings of the 4-/5-letter substrings of NAME, and the label.
data = {
    # Numeric feature; np.nan marks a missing value.
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    # Categorical features; np.nan marks a missing value.
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    # 4-letter substrings of NAME as a list of separate strings per row.
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    # Same substrings, but each row is a list holding one comma-joined string.
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    # Same substrings as a plain comma-joined string ('' when there are none).
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    # 5-letter substrings of NAME as a list per row.
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    # Target label passed as y to train_test_split below.
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
    }

df = pd.DataFrame(data)

def transform_numerical():
    """Standardize the AGE column and print the raw and scaled halves.

    Reads the module-level ``df``. The scaler is fitted on the training
    half only and then applied to both the training and the test half.
    Returns nothing; the results are written to stdout.
    """
    features, target = df[['AGE']], df['DISEASE']
    train_raw, test_raw, _, _ = train_test_split(
        features, target, test_size=0.5, random_state=3)

    # Fit on the training rows only so the test rows do not influence
    # the statistics used for scaling.
    age_scaler = preprocessing.StandardScaler()
    age_scaler.fit(train_raw)
    train_scaled = age_scaler.transform(train_raw)
    test_scaled = age_scaler.transform(test_raw)

    print(train_raw, train_scaled, sep='\n')
    print()
    print(test_raw, test_scaled, sep='\n')
    print('/////////////////////////', '\n')

transform_numerical()

def transform_categorical():
    """One-hot encode URBAN and NAME and print the encoded and raw halves.

    Reads the module-level ``df``. Missing values are first replaced by
    the empty string, then both columns are one-hot encoded. The imputer
    and the encoder are fitted on the training half only.
    Returns nothing; the results are written to stdout.
    """
    train_raw, test_raw, _, _ = train_test_split(
        df[['URBAN', 'NAME']], df['DISEASE'], test_size=0.5, random_state=3)

    # Replace missing categories with '' before encoding.
    blank_filler = SimpleImputer(strategy='constant', fill_value='')
    train_filled = blank_filler.fit_transform(train_raw)
    test_filled = blank_filler.transform(test_raw)

    # handle_unknown='ignore' keeps transform from failing on categories
    # that appear only in the test half.
    one_hot = preprocessing.OneHotEncoder(handle_unknown='ignore')
    train_encoded = one_hot.fit_transform(train_filled)
    test_encoded = one_hot.transform(test_filled)

    print(train_encoded.toarray(), train_raw, sep='\n')
    print()
    print(test_encoded.toarray(), test_raw, sep='\n')
    print('/////////////////////////', '\n')

transform_categorical()

def transform_list():
    """Vectorize the n-gram column SUBSTRING_4L_V3 and print the count matrices.

    Reads the module-level ``df``. Missing values are replaced by the empty
    string, the single-column result is flattened to one dimension, and a
    ``CountVectorizer`` turns each string into token counts. The vocabulary
    is learned from the training half only and reused for the test half.
    Returns nothing; the results are written to stdout.
    """
    x_train, x_test, _, _ = train_test_split(
        df[['SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    # The imputer output is 2-D with a single column; CountVectorizer wants
    # a flat sequence of strings, hence the ravel().
    x_trainT = cat_imputer.fit_transform(x_train).ravel()
    x_testT = cat_imputer.transform(x_test).ravel()

    count_vect = CountVectorizer(
        analyzer="word", tokenizer=None, preprocessor=None, stop_words=None,
        max_features=5000)
    x_trainT = count_vect.fit_transform(x_trainT)
    # Apply the vocabulary fitted on the training half to the test half
    # (previously the test half was prepared but never vectorized).
    x_testT = count_vect.transform(x_testT)

    print(x_trainT.toarray())
    print()
    print(x_testT.toarray())
    print('/////////////////////////', '\n')

transform_list()

对于SUBSTRING_4L_V3,我需要在应用CountVectorizer() 之前通过ravel() 将其展平。

但是,我不熟悉如何在下面的 ML 管道中按顺序实现它们

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression

# Version of the transformer as posted in the question. It is kept as-is
# because it is what produces the TypeError quoted further down.
class RavelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    # NOTE(review): this class defines no ravel() and no transform() method;
    # fit() is also expected to return the estimator itself rather than data.
    def fit(self, X, y=None):
        return self.ravel()

# Same toy dataset as in the first snippet, rebuilt so this script runs on its own.
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
    }

df = pd.DataFrame(data)

# Hold out half of the rows; DISEASE is the target, the other four columns are the features.
x_train, x_test, y_train, y_test = train_test_split(
    df[['AGE', 'NAME', 'URBAN', 'SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

# Numeric column: fill missing values with the median, then standardize.
transformer_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: replace missing values with '' and one-hot encode.
transformer_cat = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

# N-gram text column: replace missing values with '', flatten with
# RavelTransformer, then count tokens. The ravel step mirrors the manual
# .ravel() call used before CountVectorizer in transform_list().
transformer_ngram = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('ravel', RavelTransformer()),
    ('countvectorizer', CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, 
        max_features=5000))])

# Route each group of columns to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_num, ['AGE']),
        ('cat', transformer_cat, ['NAME', 'URBAN']),
        ('ngram', transformer_ngram, ['SUBSTRING_4L_V3']),
        ])

# Chain the preprocessing with a logistic regression classifier and train it.
ml_algo = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=4000)
model = make_pipeline(preprocessor, ml_algo)
model.fit(x_train, y_train)
#print('Model score: %.3f' % model.score(x_test, y_test))
#print('Model score: %.3f' % model.score(x_test, y_test))

错误:

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RavelTransformer()' (type <class '__main__.RavelTransformer'>) doesn't

【问题讨论】:

    标签: python-3.x machine-learning scikit-learn countvectorizer


    【解决方案1】:

    错误消息告诉您的是,您的 RavelTransformer 类中没有 transform 函数。

    我的假设是你想做这样的事情:

    class RavelTransformer(BaseEstimator, TransformerMixin):
        """Stateless pipeline step that flattens its input to one dimension.

        Intended to sit between a step that outputs a single-column 2-D
        array and a step that expects a flat sequence, such as
        CountVectorizer.
        """

        def __init__(self):
            pass

        def fit(self, X, y=None):
            """Learn nothing and return ``self`` so the step can be chained."""
            return self

        def transform(self, X, y=None):
            """Return ``X`` flattened to a 1-D array.

            ``np.asarray`` (numpy is imported as ``np`` by the question's
            script) leaves ndarray input unchanged and also accepts input
            without a ``ravel`` method, such as a DataFrame or a nested list.
            """
            return np.asarray(X).ravel()
    

    在这里,您的 RavelTransformer 在 fit 步骤中不执行任何操作,而是在 transform 步骤中按预期通过 ravel() 将数据展平,从而完成对数据的转换。

    【讨论】:

      猜你喜欢
      • 2018-07-07
      • 2018-02-04
      • 2013-05-01
      • 2020-11-30
      • 2013-04-14
      • 2018-08-03
      • 1970-01-01
      • 2018-03-07
      • 2018-08-18
      相关资源
      最近更新 更多