线性回归预测——家庭用电功率与时间的关系,以及功率与电流的关系
具体过程如下:
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
% matplotlib inline
%config InlineBackend.figure_format = 'svg'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
# Font configuration: use SimHei so Chinese labels render, and keep the
# minus sign from being drawn as a garbled glyph.
mpl.rcParams.update({
    'font.sans-serif': [u'SimHei'],
    'axes.unicode_minus': False,
})
# Load the semicolon-separated data file.
path = 'datas/household_power_consumption_1000.txt'
datas = pd.read_csv(path, sep=';', low_memory=False)
datas.info()
datas.describe().T
# Filter abnormal data: '?' marks a missing value, so turn it into NaN and
# drop every row that contains at least one NaN.
new_datas = datas.replace(to_replace='?', value=np.nan)
df = new_datas.dropna(axis='index', how='any')
df.head(2)
df.info()
df.describe().T
# Helper that turns a (date, time) pair of strings into numeric components.
def date_format(dt):
    """Parse a date string and a time string into integer time fields.

    Args:
        dt: An iterable of strings that, joined with a single space, form
            a timestamp in ``'%d/%m/%Y %H:%M:%S'`` format, e.g.
            ``['16/12/2006', '17:24:00']``.

    Returns:
        A tuple ``(year, month, day, hour, minute, second)`` of ints.

    Raises:
        ValueError: If the joined string does not match the expected format
            (raised by ``time.strptime``).
    """
    t = time.strptime(' '.join(dt), '%d/%m/%Y %H:%M:%S')
    return (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec)
# Build X and Y. The first two columns of df are passed to date_format, which
# turns each row into six numeric features (year, month, day, hour, minute,
# second).
X = df.iloc[:, 0:2]
X = X.apply(lambda x: pd.Series(date_format(x)), axis=1)
# Cast the target to float: a column that contained '?' placeholders may have
# been read as strings, which would break the arithmetic on Y_test below.
Y = df['Global_active_power'].astype(float)
# Split into training and test sets (80% / 20%, fixed seed for repeatability).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
# Standardise the features: fit the scaler on the training set only and reuse
# the same transformation for the test set.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
pd.DataFrame(X_train).describe().T
# Train the model.
lr = LinearRegression()
lr = lr.fit(X_train, Y_train)
# Validate the model.
y_predict = lr.predict(X_test)
print("训练R2:", lr.score(X_train, Y_train))
print("测试R2:", lr.score(X_test, Y_test))
# MSE is the mean of the squared errors; the errors must be squared *before*
# averaging, otherwise positive and negative errors cancel out.
mse = np.average((y_predict - Y_test) ** 2)
rmse = np.sqrt(mse)
print("rmse:", rmse)
# Persist the scaler and the model. sklearn.externals.joblib was removed from
# scikit-learn, so prefer the standalone joblib package and only fall back to
# the legacy location on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(ss, 'df.ss_model')
joblib.dump(lr, 'df.lr_model')
ss = joblib.load('df.ss_model')
lr = joblib.load('df.lr_model')
# Predict with the reloaded scaler and model; the sample is laid out as
# [year, month, day, hour, minute, second], matching date_format's output.
data1 = [[2006, 12, 17, 12, 25, 0]]
data1 = ss.transform(data1)
lr.predict(data1)
# Plotting
## Compare the predicted values with the actual values on the test set.
t = np.arange(len(X_test))  # x axis: index of each test sample
plt.figure(facecolor='w')
plt.plot(t, Y_test, 'r-', linewidth=2, label='真实值')
plt.plot(t, y_predict, 'g-', linewidth=2, label='预测值')
plt.legend(loc='upper left')
plt.title("线性回归预测时间与功率之间的关系", fontsize=20)
# Pass the flag positionally: the `b` keyword was renamed to `visible` and
# later removed from matplotlib, while the positional form works everywhere.
plt.grid(True)
plt.show()
# Second model: features are the columns at positions 2 and 3 of df, and the
# target is the column at position 5 (presumably the power columns and the
# current intensity, going by the plot title -- confirm against the file).
# Cast to float so that values read as strings are treated as numbers, both
# by the model and by the plot (string values would be plotted as categories).
X2 = df.iloc[:, 2:4].astype(float)
Y2 = df.iloc[:, 5].astype(float)
# Split into training and test sets (80% / 20%, fixed seed).
X2_train, X2_test, Y2_train, Y2_test = train_test_split(
    X2, Y2, train_size=0.8, random_state=0)
# Standardise the features using statistics from the training set only.
ss1 = StandardScaler()
X2_train = ss1.fit_transform(X2_train)
X2_test = ss1.transform(X2_test)
# Train the model and predict on the test set.
lr1 = LinearRegression()
lr1.fit(X2_train, Y2_train)
y2_predict = lr1.predict(X2_test)
# Evaluate the model; LinearRegression.score returns the R^2 coefficient.
print("电流预测准确率:", lr1.score(X2_test, Y2_test))
print("电流参数:", lr1.coef_)
# Plot predicted against actual values on the test set.
t1 = np.arange(len(X2_test))
plt.figure(facecolor='w')
plt.plot(t1, Y2_test, 'r-', linewidth=2, label='真实值')
plt.plot(t1, y2_predict, 'g-', linewidth=2, label='预测值')
plt.legend(loc='lower right')
plt.title("线性回归预测功率与电流之间的关系", fontsize=20)
# Pass the flag positionally: the `b` keyword was renamed to `visible` and
# later removed from matplotlib, while the positional form works everywhere.
plt.grid(True)
plt.show()