# Part 2: Code implementation
# /usr/bin/python
# -*- encoding:utf-8 -*-
# data analysis
import pandas as pd
import numpy as np
import random as rnd
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
# Read the training and test CSV files; `combine` groups both frames so that
# the same cleaning step can be applied to each of them in a loop.
train_df, test_df = map(
    pd.read_csv,
    (
        "E:\\pyworkpace\\CDA\\data\\tele_cust_train.csv",
        "E:\\pyworkpace\\CDA\\data\\tele_cust_test.csv",
    ),
)
combine = [train_df, test_df]
# Check data completeness: info() prints column dtypes and non-null counts.
train_df.info()
test_df.info()

# Distribution of the feature values. The results are printed explicitly
# because a bare expression displays nothing outside a notebook/REPL.
print(train_df.describe())
print(train_df.describe(include=['O']))

# Mean churn per area, highest first.
churn_by_area = (
    train_df[['area', 'churn']]
    .groupby(['area'], as_index=False)
    .mean()
    .sort_values(by='churn', ascending=False)
)
print(churn_by_area)

# Plot of discrete vs. continuous variables: churn against dwelling size,
# coloured by dwelling type, with one facet row per marital status.
# `height` is the current name of the FacetGrid parameter formerly called `size`.
grid = sns.FacetGrid(train_df, row='marital', height=2.2, aspect=1.6)
grid.map(sns.pointplot, 'dwllsize', 'churn', 'dwlltype', palette='deep')
grid.add_legend()
plt.show()
# --- Imputation of discrete variables ---------------------------------------

# eqpdays: negative values are treated as invalid (the original note reports
# 112 such rows). They are blanked out and every gap is then filled with the
# most frequent *valid* value of the training set.
train_eqpdays = train_df['eqpdays']
freq_port = train_eqpdays.where(train_eqpdays >= 0).dropna().mode()[0]
for dataset in combine:
    eqpdays = dataset['eqpdays']
    # where() keeps values that satisfy the condition; NaN fails ">= 0" and
    # therefore stays NaN until fillna() replaces it.
    dataset['eqpdays'] = eqpdays.where(eqpdays >= 0).fillna(freq_port)

# dualband: fill gaps with the most frequent value of the training set.
freq_port = train_df['dualband'].dropna().mode()[0]
for dataset in combine:
    dataset['dualband'] = dataset['dualband'].fillna(freq_port)

# creditcd, truck, ethnic, marital and kid0_2 .. kid16_17: per the original
# note these are missing together, so they are filled with an "unknown"
# category and flagged in a new indicator column `new`
# (1 = value was imputed, 0 = value was present).
KID_COLUMNS = ['kid0_2', 'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17']
for dataset in combine:
    # Boolean mask instead of positional indices fed to .loc, so the flag is
    # correct whatever the frame's index looks like.
    dataset['new'] = dataset['truck'].isna().astype(int)
    dataset['truck'] = dataset['truck'].fillna(3)
    for column in ('ethnic', 'marital', 'creditcd'):
        dataset[column] = dataset[column].fillna('Z')
    for column in KID_COLUMNS:
        # Missing values are filled with 'Y'; then 'U' -> 0 and 'Y' -> 1.
        dataset[column] = (
            dataset[column].fillna('Y').map({'U': 0, 'Y': 1}).astype(int)
        )
    # kids = sum of the five 0/1 kid flags; it replaces the individual
    # columns, which are dropped later in the script.
    dataset['kids'] = dataset[KID_COLUMNS].sum(axis=1)
# --- Imputation of continuous variables -------------------------------------

# Stack the train and test rows into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
new_df = pd.concat([train_df, test_df], sort=True).reset_index(drop=True)

# Pairwise correlation between the numeric columns, drawn as a heat map.
# Non-numeric columns are excluded explicitly because DataFrame.corr() no
# longer drops them silently in pandas >= 2.0.
corrmat = new_df.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=0.8, square=True)
plt.show()  # without this the heat map is never displayed when run as a script

# Impute da_Mean (the other continuous variables are handled the same way).
# Per the original note, the variables most correlated with da_Mean are:
# adjmou, adjrev, avgrev, avgmou, totcalls, avg3mou, avg3rev, totmou,
# peak_vce_Mean, mou_Mean and rev_Mean. Only the first eight are used as
# predictors below.
da_mean_predictors = [
    'adjmou', 'adjrev', 'avgrev', 'avgmou',
    'totcalls', 'avg3mou', 'avg3rev', 'totmou',
]
da_mean_missing = new_df['da_Mean'].isna()
# Explicit copies so that writing the predictions back does not touch a
# view of new_df (avoids SettingWithCopyWarning).
train_df_da_Mean = new_df[~da_mean_missing].copy()
test_df_da_Mean = new_df[da_mean_missing].copy()

# NOTE(review): assumes the predictor columns contain no missing values - confirm.
X_train = train_df_da_Mean[da_mean_predictors]
Y_train = train_df_da_Mean['da_Mean']
X_test = test_df_da_Mean[da_mean_predictors]

# The split criterion is left at its default: the original 'mse' was the
# default and that spelling was removed in scikit-learn 1.2.
model = RandomForestRegressor(n_estimators=100, oob_score=True)
model.fit(X_train, Y_train)
r2 = model.score(X_train, Y_train)  # R^2 on the rows the model was fitted on

# predict() raises on zero rows, so only call it when something is missing.
if len(X_test):
    Y_test = model.predict(X_test)
    test_df_da_Mean['da_Mean'] = Y_test
else:
    Y_test = np.empty(0)

# Both parts kept their original index labels, so sorting by index restores
# the row order and the assignment aligns on the index.
neww = pd.concat([test_df_da_Mean, train_df_da_Mean]).sort_index()
new_df['da_Mean'] = neww['da_Mean']
# --- Encoding of categorical variables --------------------------------------
# Each category letter is mapped to an integer code. A value that is absent
# from its mapping becomes NaN, which makes the following astype(int) raise.
CATEGORY_CODES = {
    'dualband': {'Y': 0, 'N': 1, 'T': 2, 'U': 3},
    'creditcd': {'Y': 0, 'N': 1, 'Z': 3},
    'ethnic': {
        'U': 0, 'N': 1, 'H': 2, 'Z': 3, 'F': 4, 'S': 5, 'R': 6, 'O': 7,
        'G': 8, 'J': 9, 'P': 10, 'I': 11, 'B': 12, 'D': 13, 'X': 14,
        'C': 15, 'M': 16,
    },
    'marital': {'B': 0, 'M': 1, 'U': 2, 'A': 3, 'S': 4, 'Z': 5},
}
for column, codes in CATEGORY_CODES.items():
    new_df[column] = new_df[column].map(codes).astype(int)

# Drop the identifier, HHstatin, area and the per-age kid flags.
new_df = new_df.drop(
    columns=[
        'Customer_ID', 'HHstatin', 'area',
        'kid0_2', 'kid3_5', 'kid6_10', 'kid11_15', 'kid16_17',
    ]
)

# numbcars, income, dwlltype, dwllsize, adult, ownrent and hnd_price have not
# been processed, so they are left out of the modelling frame.
# drop() already returns a new frame; no separate copy is needed.
yy = new_df.drop(
    columns=[
        'numbcars', 'income', 'dwlltype', 'dwllsize',
        'adult', 'ownrent', 'hnd_price',
    ]
)

# Rows with a churn value form the training set, rows without one the test
# set. The mask is taken from the frame that is being filtered.
has_churn = yy['churn'].notna()
train_df = yy[has_churn].copy()
test_df = yy[~has_churn]
X_train = train_df.drop(columns=['churn'])
Y_train = train_df['churn']
X_test = test_df.drop(columns=['churn'])
# Decision tree: fit on X_train / Y_train, then predict for X_test.
decision_tree = DecisionTreeClassifier().fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
# Mean accuracy measured on the training data itself, expressed as a
# percentage rounded to two decimals.
acc_decision_tree = round(100 * decision_tree.score(X_train, Y_train), 2)