【发布时间】:2020-04-11 14:27:56
【问题描述】:
我已经使用 sklearn 在公司数据集上训练了机器学习模型。该数据集具有以下属性:name, domain, year_founded, industry, size_range, locality, country, linkedin_url, current_employee_estimate, total_employee_estimate。
我想训练一个机器学习模型,尝试使用 name 和 year_founded 属性来预测 size_range 值(根据公司的规模,属于八个类别之一)。我已经使用以下训练代码完成了这项工作:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import logistic
from tools import pickleFile
from tools import unpickleFile
from tools import cleanDataset
from tools import getPrettyTimestamp
import sklearn
import pandas as pd
import numpy as np
import datetime
import sys
def train_model(clf, X_train, y_train, epochs=10):
    """
    Fits a model on the given data and returns the accuracy after each fit.

    Note that every "epoch" calls ``clf.fit`` again on the same data and then
    scores the model on that same data, so the returned values are training
    accuracies, not held-out scores.

    :param clf: sklearn-style model exposing ``fit(X, y)`` and ``score(X, y)``
    :param X_train: encoded training data (attributes)
    :param y_train: training data (attribute to predict)
    :param epochs: number of fit/score iterations (default=10)
    :return: list with one score per iteration (empty if epochs < 1)
    """
    scores = []
    print("Starting training...")
    for i in range(1, epochs + 1):
        print("Epoch:" + str(i) + "/" + str(epochs) + " -- " + str(datetime.datetime.now()))
        clf.fit(X_train, y_train)
        # Score on the very data the model was just fitted on.
        score = clf.score(X_train, y_train)
        scores.append(score)
    print("Done training. The score(s) is/are: " + str(scores))
    return scores
def main():
    """
    Loads the companies dataset, one-hot encodes the chosen features, trains
    (or reloads) a logistic-regression model and compares train/test accuracy.

    The model is only trained when the parsed arguments request it; otherwise
    the model file named in the arguments is unpickled and evaluated.
    """
    # Parse the arguments.
    userRequestedTrain, filename = parseArgs()
    # Some custom Pandas settings - TODO remove this
    pd.set_option('display.max_columns', 30)
    pd.set_option('display.max_rows', 1000)
    dataset = pd.read_csv("companies_sorted.csv", nrows=50000)
    origLen = len(dataset)
    print(origLen)
    dataset = cleanDataset(dataset)
    cleanLen = len(dataset)
    print(cleanLen)
    print("\n======= Some Dataset Info =======\n")
    print("Dataset size (original):\t" + str(origLen))
    print("Dataset size (cleaned):\t" + str(cleanLen))
    print("\nValues of size_range:\n")
    print(dataset['size_range'].value_counts())
    print()
    # size_range is the attribute to be predicted, so we pop it from the dataset
    sizeRange = dataset.pop("size_range").values
    # We split our dataset and attribute-to-be-predicted into training and testing subsets.
    xTrain, xTest, yTrain, yTest = train_test_split(dataset, sizeRange, test_size=0.25, random_state=1)
    print(xTrain.transpose())
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
    # Our feature set, i.e. the inputs to our machine-learning model.
    featureSet = ['name', 'year_founded']
    # Making a copy of test and train sets with only the columns we want.
    xTrain_sf = xTrain[featureSet].copy()
    xTest_sf = xTest[featureSet].copy()
    # Fit the one-hot encoder on the training columns only.
    ohe.fit(xTrain_sf)
    print(xTrain_sf)
    print(xTest_sf)
    # Encoding test and train sets.
    # NOTE: the model is trained on these encoded matrices, so any later
    # prediction on raw [name, year_founded] rows has to go through this same
    # fitted encoder first (ohe.transform); the encoder itself is not pickled here.
    xTrain_sf_encoded = ohe.transform(xTrain_sf)
    xTest_sf_encoded = ohe.transform(xTest_sf)
    # ------ Using Logistic Regression classifier - TRAINING PHASE ------
    if userRequestedTrain:
        # We define the model we're going to use.
        lrModel = LogisticRegression(solver='lbfgs', multi_class="multinomial", max_iter=1000, random_state=1)
        # Now, let's train it.
        lrScores = train_model(lrModel, xTrain_sf_encoded, yTrain, 1)
        # Save the model as a file.
        filename = "models/Model_" + getPrettyTimestamp()
        print("Training done! Pickling model to " + str(filename) + "...")
        pickleFile(lrModel, filename)
    # Reload the model for testing. If we didn't train the model ourselves, then it was specified as an argument.
    lrModel = unpickleFile(filename)
    # Smoke test: the reloaded model must accept the encoded feature matrix.
    lrModel.predict(xTrain_sf_encoded[0:10])
    print("Unpickled successfully from file " + str(filename))
    # ------- TESTING PHASE -------
    # Evaluate only. Calling train_model() here would refit the model on the
    # test set and report its accuracy on data it had just been trained on.
    testScore = lrModel.score(xTest_sf_encoded, yTest)
    if userRequestedTrain:
        trainScore = lrScores[0]
    else:
        trainScore = 0.9201578143173162  # Modal training score - substitute if we didn't train model ourselves
    scores = sorted([(trainScore, 'train'), (testScore, 'test')], key=lambda x: x[0], reverse=True)
    better_score = scores[0]  # largest score
    print(scores)
    # Which score was better?
    print("Better score: {}".format(better_score))
    print("Pickling....")
    # The model is no longer refitted during testing, so this is the evaluated model unchanged.
    pickleFile(lrModel, "models/TESTING_" + getPrettyTimestamp())
此代码运行成功 - 训练和测试阶段完成,测试阶段的准确率约为 60%:
Starting training...
Epoch:1/1 -- 2019-12-18 20:03:13.462479
Done training. The score(s) is/are: [0.8854667949951877]
Training done! Pickling model to models/Model_2019-12-18_2003...
Unpickled successfully from file models/Model_2019-12-18_2003
= = = = = = = = = = = = = = = = = = =
First 10 predictions:
['5001 - 10000' '10001+' '1001 - 5000' '5001 - 10000' '1001 - 5000'
'1001 - 5000' '5001 - 10000' '1001 - 5000' '1001 - 5000' '1001 - 5000']
['5001 - 10000' '10001+' '1001 - 5000' '5001 - 10000' '1001 - 5000'
'1001 - 5000' '5001 - 10000' '1001 - 5000' '1001 - 5000' '1001 - 5000']
= = = = = = = = = = = = =
Starting training...
Epoch:1/1 -- 2019-12-18 20:03:20.775392
Done training. The score(s) is/are: [0.5906466512702079]
[(0.8854667949951877, 'train'), (0.5906466512702079, 'test')]
Better score: (0.8854667949951877, 'train')
Pickling....
Process finished with exit code 0
但是,假设我想使用此模型进行 SINGLE 预测,即通过将公司名称和公司成立年份传递给它。我执行以下操作:
# Load the previously pickled model from `filename`.
lrModel = pickle.load(open(filename, 'rb'))
# NOTE(review): the raw [name, year] row is handed to predict() as-is, whereas main()
# fits the model on one-hot-encoded features; this is the call shown in the traceback below.
predictedSet = lrModel.predict([["SomeRandomCompany", 2019]])
但是当我这样做时,我得到以下 ValueError:
X = check_array(X, accept_sparse='csr')
Traceback (most recent call last):
File "/home/ivor/Documents/companySizeEstimator/companySizeEstimator.py", line 85, in <module>
main()
File "/home/ivor/Documents/companySizeEstimator/companySizeEstimator.py", line 58, in main
predictions(model, reducedSetEncoded, reducedSet)
File "/home/ivor/Documents/companySizeEstimator/companySizeEstimator.py", line 80, in predictions
predictedSet = lrModel.predict([["SomeCompany", 2019]])
File "/home/ivor/Documents/companySizeEstimator/venv/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 293, in predict
scores = self.decision_function(X)
File "/home/ivor/Documents/companySizeEstimator/venv/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 272, in decision_function
raise ValueError("X has %d features per sample; expecting %d"
ValueError: X has 2 features per sample; expecting 54897
它似乎想要一个与用于训练它的数据集形状完全相同的数据集,即具有 11,000 行的数据集。它可以在问题的测试阶段给出很好的预测,因此很明显该模型能够很好地做出预测。如上所示,我怎样才能让它仅基于单个值进行预测?
【问题讨论】:
标签: python pandas machine-learning scikit-learn