内容目录
- 1 衍生特征分析过程
衍生特征分析过程
import pandas as pd
import numpy as np
import datetime
import xgboost as xgb
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook
plt.rcParams['font.sans-serif']=['SimHei'] # show Chinese in chart
pd.set_option('display.max_columns', 40) # 显示隐藏列
数据读取
path2 = r'/home/hadoop/jack_xxx/xianxia_client_2.csv'
dataset2 = pd.read_csv(path2, sep=',', encoding='utf-8', index_col=[0])
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\numpy\lib\arraysetops.py:472: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
mask |= (ar1 == a)
dataset2.drop_duplicates(subset=['intopieces_id'], inplace=True)
dataset2.tail()
定义处理单列的数据
单列数据的观察
家庭结构有关
child_sum
child_sum数据出现的值,
[ 0., 2., 1., 3., 4., nan, 5., 8., 62., 6., 7.,
11., 68., 100., 22., 15., 12., 25., 113., 20., 10., 41.,
33., 14., 9., 21., 24., 23.],需预先对该列处理
dataset2.groupby('child_sum').count()[['intopieces_id']].T
| child_sum | 0.0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 6.0 | 7.0 | 8.0 | 9.0 | 10.0 | 11.0 | 12.0 | 14.0 | 15.0 | 20.0 | 21.0 | 22.0 | 23.0 | 24.0 | 25.0 | 33.0 | 41.0 | 62.0 | 68.0 | 100.0 | 113.0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| intopieces_id | 323802 | 633416 | 304946 | 40737 | 5433 | 789 | 143 | 20 | 13 | 1 | 6 | 41 | 5 | 2 | 1 | 8 | 2 | 11 | 1 | 1 | 2 | 1 | 1 | 10 | 1 | 5 | 1 |
fig1,ax1 = plt.subplots(1,1)
ax1.get_xaxis().set_visible(False) # Hide Ticks
# pd.plotting.table(ax1, dataset2.groupby('child_sum').count()[['intopieces_id']], loc='upper right',colWidths=[0.2,0.2,0.2])
pl11 = dataset2.groupby(['child_sum']).count()['intopieces_id'].plot(kind='bar', title='child_sum',table=True, ax=ax1,figsize=(10,10),fontsize=12)
将child_sum>=5的部分视为异常数据,暂且先保留,在处理当中视为其它。
marriage
进件信息表中该字段的说明,婚姻状态0未婚1已婚2离异3丧偶。
dataset2.groupby(['marriage']).count()['intopieces_id']
marriage
0 65785
1 211796
2 937405
3 143062
4 7971
5 8866
Name: intopieces_id, dtype: int64
fig2,ax2 = plt.subplots(1,1)
ax2.get_xaxis().set_visible(False) # Hide Ticks
dataset2.groupby(['marriage']).count()['intopieces_id'].plot(kind='bar', title='婚姻状态',table=True, ax=ax2,figsize=(10,10),fontsize=12)
<matplotlib.axes._subplots.AxesSubplot at 0xa6b6518>
对该字段,marriage>=4的部分视为异常,暂且保留,归为其它类别。
is_child
进件信息表对该字段说明,有无子女/1.有2.无。
dataset2.groupby(['is_child']).count()['intopieces_id']
is_child
0 1372175
1 2071
2 639
Name: intopieces_id, dtype: int64
# fig1 = plt.figure()
# fig1.add_subplot(111)
fig3,ax3 = plt.subplots(1,1)
ax3.get_xaxis().set_visible(False) # Hide Ticks
pl2 = dataset2.groupby(['is_child']).count()['intopieces_id'].plot(kind='bar', title='子女(有无)',table=True, ax=ax3, figsize=(8,8))
对该字段的观察,字段说明可能有问题,在这里理解应为0-有,1-无,2-其它。故这里的处理,将is_child>1视为其它。
support_pre_count
dataset2.groupby(['support_pre_count']).count()['intopieces_id']
support_pre_count
0 814769
1 398249
2 144316
3 15426
4 1766
5 253
6 43
7 4
9 1
10 3
11 10
12 1
14 2
15 1
16 1
20 3
21 1
22 3
25 1
36 2
41 1
127 29
Name: intopieces_id, dtype: int64
fig4,ax4 = plt.subplots(1,1)
ax4.get_xaxis().set_visible(False) # Hide Ticks
dataset2.groupby(['support_pre_count']).count()['intopieces_id'].plot(kind='bar', title='供养人数',table=True, ax=ax4, figsize=(10,10),fontsize=12)
<matplotlib.axes._subplots.AxesSubplot at 0xa73e2e8>
这里看到供养人数有出现>6的情况。这儿采用的处理方式,对support_pre_count>6的部分视为异常,归为其它类别。
处理方法结论
筛选该条件(support_pre_count <=6) & (is_child<2) & (marriage<4) & (child_sum<5),在此基础上做家庭结构的划分;对于该条件之外的视为其它类别(归因于数据记录不规范、错误所带来的)
社交账号有关
dataset2.columns
Index(['intopieces_id', 'cd', 'weixin', 'weibo', 'qq', 'marriage', 'is_child',
'support_pre_count', 'child_sum', 'alipay_account', 'jd_account',
'id_num', 'house_pay', 'support_pre_pay', 'month_income'],
dtype='object')
weixin
print('空值:',dataset2[dataset2['weixin'].isnull()].count()['intopieces_id'],'\n',
'非空:',dataset2[dataset2['weixin'].notnull()].count()['intopieces_id'],'\n',
'无:',dataset2[(dataset2['weixin']=='无')].count()['intopieces_id'])
# [Nn][Aa][Nn]
空值: 1181935
非空: 192950
无: 2434
dataset2[((dataset2['weixin']=='nan')|(dataset2['weixin']=='NULL')|(dataset2['weixin']=='空')|(dataset2['weixin']=='无'))].count()['intopieces_id']
2434
print('空值:',dataset2[dataset2['weibo'].isnull()].count()['intopieces_id'],'\n',
'非空:',dataset2[dataset2['weibo'].notnull()].count()['intopieces_id'],'\n',
'无:',dataset2[(dataset2['weibo']==u'无')].count()['intopieces_id'])
# [Nn][Aa][Nn]
空值: 1369308
非空: 5577
无: 3908
dataset2[((dataset2['weibo']=='nan')|(dataset2['weibo']=='NULL')|(dataset2['weibo']=='空')|(dataset2['weibo']=='无'))].count()['intopieces_id']
3908
print('空值:',dataset2[dataset2['qq'].isnull()].count()['intopieces_id'],'\n',
'非空:',dataset2[dataset2['qq'].notnull()].count()['intopieces_id'],'\n',
'无:',dataset2[(dataset2['qq']=='无')].count()['intopieces_id'])
# [Nn][Aa][Nn]
空值: 1284821
非空: 90064
无: 2611
dataset2[((dataset2['qq']=='nan')|(dataset2['qq']=='NULL')|(dataset2['qq']=='空')|(dataset2['qq']=='无'))].count()['intopieces_id']
2611
处理方式
考虑到三个字段的缺失值比例较大,有几种处理方式:
1、保存原有的缺失(不推荐)
2、插补:选择多重插补法(最合适)
购物数据(京东账号jd_account、淘宝账号alipay_account)
jd_account
print('空值:',dataset2[dataset2['jd_account'].isnull()].count()['intopieces_id'],'\n',
'非空:',dataset2[dataset2['jd_account'].notnull()].count()['intopieces_id'],'\n',
'无:',dataset2[(dataset2['jd_account']=='无')].count()['intopieces_id'])
空值: 1371477
非空: 3408
无: 1775
dataset2[((dataset2['jd_account']=='nan')|(dataset2['jd_account']=='NULL')|(dataset2['jd_account']=='空')|(dataset2['jd_account']=='无'))].count()['intopieces_id']
1775
alipay_account
dataset2.columns
Index(['intopieces_id', 'cd', 'weixin', 'weibo', 'qq', 'marriage', 'is_child',
'support_pre_count', 'child_sum', 'alipay_account', 'jd_account',
'id_num', 'house_pay', 'support_pre_pay', 'month_income'],
dtype='object')
print('空值:',dataset2[dataset2['alipay_account'].isnull()].count()['intopieces_id'],'\n',
'非空:',dataset2[dataset2['alipay_account'].notnull()].count()['intopieces_id'],'\n',
'无:',dataset2[(dataset2['alipay_account']=='无')].count()['intopieces_id'])
# [Nn][Aa][Nn]
空值: 1363912
非空: 10973
无: 1369
dataset2[((dataset2['alipay_account']=='nan')|(dataset2['alipay_account']=='NULL')|(dataset2['alipay_account']=='空')|(dataset2['alipay_account']=='无'))].count()['intopieces_id']
1369
有关收入(月收入month_income、家庭支出support_pre_pay)
月收入month_income
print('空值:',dataset2[dataset2['month_income'].isnull()].count()['intopieces_id'],'\n',
'非空:',dataset2[dataset2['month_income'].notnull()].count()['intopieces_id'],'\n',
# '无:',dataset2[(dataset2['month_income']=='无')].count()['intopieces_id']
)
# [Nn][Aa][Nn]
空值: 0
非空: 1374885
dataset2[['month_income']].describe()
| month_income | |
|---|---|
| count | 1.374885e+06 |
| mean | 3.247747e+04 |
| std | 1.201198e+06 |
| min | 0.000000e+00 |
| 25% | 4.500000e+03 |
| 50% | 8.000000e+03 |
| 75% | 2.000000e+04 |
| max | 1.111111e+09 |
fig41 = plt.figure()
fig41.add_subplot(111)
dataset2[dataset2['month_income']>= 5000000].groupby('cd').count()['intopieces_id'].plot(kind='bar',title='month_income')
<matplotlib.axes._subplots.AxesSubplot at 0xa80cc50>
fig41 = plt.figure()
fig41.add_subplot(111)
dataset2[dataset2['month_income']<10000000].groupby('cd').count()['intopieces_id'].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0xaef1cc0>
fig41 = plt.figure()
fig41.add_subplot(111)
plot1 = sns.boxplot(y='month_income',x='cd',data=dataset2, )
plot1.set(ylim=(0,100000))
plt.title('家庭月收入分布')
Text(0.5,1,'家庭月收入分布')
家庭支出support_pre_pay
print('空值:',dataset2[dataset2['support_pre_pay'].isnull()].count()['intopieces_id'],'\n',
'非空:',dataset2[dataset2['support_pre_pay'].notnull()].count()['intopieces_id'],'\n',
# '无:',dataset2[(dataset2['month_income']=='无')].count()['intopieces_id']
)
# [Nn][Aa][Nn]
空值: 0
非空: 1374885
dataset2[['support_pre_pay']].describe()
| support_pre_pay | |
|---|---|
| count | 1.374885e+06 |
| mean | 5.408445e+03 |
| std | 3.172173e+06 |
| min | 0.000000e+00 |
| 25% | 0.000000e+00 |
| 50% | 6.000000e+02 |
| 75% | 1.000000e+03 |
| max | 2.147484e+09 |
fig42 = plt.figure()
fig42.add_subplot(111)
plot1 = sns.boxplot(y='support_pre_pay',x='cd',data=dataset2)
plot1.set(ylim=(0,10000))
plt.title('家庭月支出分布')
Text(0.5,1,'家庭月支出分布')
处理的基本函数
dataset2[dataset2['child_sum'].isnull()].head()
def one_feature(x):
    '''
    Label a single account field (social account or shopping account).

    The value is converted to text, then whitespace and the punctuation
    characters . , : " ' are removed before it is classified.

    Args:
        x: raw account value; may be a string, a float NaN or None.
    Returns:
        int, one of:
            0  - the cleaned value is '无' (explicitly recorded as no account)
            10 - the value is missing: None, NaN, or nothing is left after
                 cleaning (blank / punctuation-only input)
            1  - anything else, i.e. an account was recorded
    '''
    if x is None:
        return 10  # treat None exactly like NaN: nothing was recorded

    # Raw-string character class: same character set as the previous
    # alternation, without invalid escape sequences or an empty alternative.
    cleaned = re.sub(r'[\s.,:"\']', '', str(x))

    if cleaned == u'无':
        return 0  # explicitly "no account"
    if cleaned in ('nan', ''):
        return 10  # NaN, or only blanks/punctuation: missing marker
    return 1  # some account identifier is present
def shejiao_func(x, y, z):
    '''
    Combined feature: social-account category from three account fields.

    Each field is labelled with one_feature and the three labels are added;
    the sum is then mapped to a category string.

    Args:
        x: string, WeChat account
        y: string, Weibo account
        z: string, QQ account
    Returns: string, social-account category:
        '0'-'3' when the sum is between 0 and 3 (plain account count),
        '4'-'9' for the sums listed in the table below,
        '10' for any other sum.
    '''
    total = one_feature(x) + one_feature(y) + one_feature(z)

    # Sums 0..3 are used directly as the label.
    if 0 <= total <= 3:
        return str(total)

    # Remaining sums, keyed by (label sum) -> category.
    special_cases = {
        11: '4',  # 1 account counted + 1 field unregistered
        21: '5',  # 1 account counted + 2 fields unregistered
        12: '6',  # 2 accounts counted + 1 field unregistered
        10: '7',  # 0 accounts counted + 1 field unregistered
        20: '8',  # 0 accounts counted + 2 fields unregistered
        30: '9',  # all three fields unregistered
    }
    return special_cases.get(total, '10')  # '10' = other
def gouwu_func(x, y):
    '''
    Combined feature: shopping-account category from two account fields.

    Each field is labelled with one_feature and the two labels are added;
    the sum is then mapped to a category string.

    Args:
        x: string, JD account
        y: string, Taobao account
    Returns: string, shopping-account category:
        '0'-'2' when the sum is between 0 and 2 (plain account count),
        '3'-'5' for the sums listed in the table below,
        '6' for any other sum.
    '''
    total = one_feature(x) + one_feature(y)

    # Sums 0..2 are used directly as the label.
    if 0 <= total <= 2:
        return str(total)

    special_cases = {
        11: '3',  # 1 account counted + 1 field unregistered
        10: '4',  # 0 accounts counted + 1 field unregistered
        20: '5',  # both fields unregistered
    }
    return special_cases.get(total, '6')  # '6' = other
def muti_family_func(marriage, is_child, child_sum, support_pre_count):
    '''
    Combined feature: family-structure category.

    Args:
        marriage: number, marital status code
        is_child: number, has-children flag
        child_sum: number, number of children
        support_pre_count: number, number of dependants supported
    Returns: string, family-structure category:
        '0' single household
        '1' couple only
        '2' nuclear / stem family
        '3' single-parent family
        '4' other (inside the accepted value ranges, no rule matched)
        '5' other / error data (at least one field outside its accepted
            range; a NaN field fails its range check and lands here too)
    '''
    # Evaluate every range check first, exactly one comparison per field.
    range_checks = (
        support_pre_count <= 6,
        is_child < 2,
        marriage < 4,
        child_sum < 5,
    )
    if not all(range_checks):
        return '5'  # other, error_data

    # Single household: marriage code 0, or codes 2/3 with is_child == 1.
    if marriage == 0:
        return '0'
    if is_child == 1 and (marriage == 2 or marriage == 3):
        return '0'

    if marriage == 1:
        if is_child == 1:
            return '1'  # couple only
        if is_child == 0:
            if support_pre_count == 0 or support_pre_count <= child_sum:
                return '1'  # couple only
            if support_pre_count >= child_sum:
                return '2'  # nuclear / stem family
        return '4'  # other

    single_parent = (
        marriage == 2
        and is_child == 0
        and child_sum >= 1
        and support_pre_count >= child_sum
    )
    if single_parent:
        return '3'  # single-parent family
    return '4'  # other
def month_income(pd_series=None,
                 divition=[float('-inf'), 3000, 30000, 80000, 150000, 300000, 1000000, 5000000, float('inf')],
                 labels=['0', '1', '2', '3', '4', '5', '6', '7']):
    '''
    Single column: bin the monthly-income column into income levels.

    The series is multiplied by 12 before binning, so the bin edges in
    `divition` are compared against 12 x the input value. Bins are
    right-closed: (edge[i], edge[i+1]].

    Args:
        pd_series: pd.Series of numbers to bin. When omitted (None) the
            'month_income' column of the global `dataset2` is looked up at
            call time.
        divition: list of numbers, the bin edges passed to pd.cut.
        labels: list of labels, one per bin (len(divition) - 1 entries).
    Returns: pd.Series (categorical) holding the bin label of every element.
        With the default arguments the labels '0'..'7' stand for:
        poverty, poor, low income, basic well-off, upper well-off,
        upper-middle income, high income, and the three richest tiers merged
        into one class.
    '''
    if pd_series is None:
        # Resolve the default lazily: a default written in the signature is
        # evaluated once at definition time, which requires `dataset2` to
        # exist before the def runs and keeps using that stale column.
        pd_series = dataset2['month_income']

    # The list defaults above are only read by pd.cut, never mutated here.
    return pd.cut(pd_series * 12, bins=divition, right=True, precision=1,
                  retbins=False, labels=labels)
def income_cata(outcome, income):
    '''
    Combined helper 1: classify a (household expense, monthly income) pair.

    Flags which of the two values are non-positive, in preparation for the
    expense-ratio computation done by income_ratio.

    Args:
        outcome: number, household expense
        income: number, household income
    Returns: string, helper category:
        '1' expense <= 0 and income <= 0
        '2' expense <= 0 and income > 0
        '3' expense > 0 and income <= 0
        '0' otherwise (both values > 0; a NaN on either side fails every
            comparison above and therefore also yields '0')
    '''
    if outcome <= 0.0:
        if income <= 0:
            return '1'  # expense and income both non-positive
        if income > 0:
            return '2'  # only the expense is non-positive
    elif outcome > 0.0 and income <= 0:
        return '3'  # only the income is non-positive
    return '0'  # expense and income both positive
def income_ratio(outcome, income):
    '''
    Combined ratio: household expense divided by monthly income.

    Args:
        outcome: number, household expense
        income: number, household income
    Returns: number, the expense share:
        outcome / income when income_cata returns '0',
        1   when income_cata returns '1',
        0   when income_cata returns '2',
        100 otherwise (income_cata result '3', i.e. income <= 0 with a
            positive expense; 100 is a fixed placeholder value).
    '''
    category = income_cata(outcome, income)

    if category == '0':
        return float(outcome) / float(income)

    # Fixed placeholder values for the categories where no division is done.
    placeholders = {'1': 1, '2': 0}
    return placeholders.get(category, 100)
def income_ratio_addcata(outcome, income):
    '''
    Combined ratio, derived category 1: bucket the household expense share.

    A categorical companion to the value returned by income_ratio, meant to
    be used together with it.

    Args:
        outcome: number, household expense
        income: number, household income
    Returns: string, expense-share category:
        '0' share == 0
        '1' share in (0, 0.01]
        '2' share in (0.01, 0.1]
        '3' share in (0.1, 1]
        '4' share in (1, 10]
        '5' everything else (share > 10, and any value that matches none of
            the ranges above)
    '''
    share = income_ratio(outcome, income)

    if share == 0:
        return '0'

    if share > 0:
        # Right-closed upper bounds, checked in ascending order.
        upper_bounds = ((.01, '1'), (.1, '2'), (1, '3'), (10, '4'))
        for bound, tag in upper_bounds:
            if share <= bound:
                return tag

    return '5'
def cd_flag(x):
    '''
    Target variable: turn the overdue count into a binary flag.

    Any value greater than 0 counts as overdue.

    Args:
        x: number, raw value of the target variable
    Returns: string, target class: '1' = overdue, '0' = normal
    '''
    is_overdue = x > 0
    return '1' if is_overdue else '0'
目标变量处理(逾期数cd)
dataset2['cd'] = dataset2['cd'].apply(cd_flag)
组合-微信、微博、qq账户数量:社交数量shejiao_num
对缺失的类别保留,并相应处理
# Row-wise social-account category built from the weixin, weibo and qq columns.
dataset2['shejiao_num'] = [
    shejiao_func(wx, wb, qq)
    for wx, wb, qq in zip(dataset2['weixin'], dataset2['weibo'], dataset2['qq'])
]
dataset2.groupby(['cd','shejiao_num']).agg({'intopieces_id':'count'}).unstack()
| intopieces_id | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| shejiao_num | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
| cd | ||||||||||
| 0 | 1126 | 691 | 538 | 1084 | 89 | 115332 | 29575 | 185 | 33 | 784481 |
| 1 | 734 | 268 | 128 | 201 | 37 | 62621 | 18125 | 131 | 11 | 359495 |
# fig1 = plt.figure()
# fig1.add_subplot(111)
dataset2.groupby(['cd','shejiao_num']).agg({'intopieces_id':'count'}).unstack().T.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x37e4c1d0>
dataset2.groupby(['cd','shejiao_num']).agg({'intopieces_id':'count'}).unstack()/dataset2.groupby(['cd','shejiao_num']).agg({'intopieces_id':'count'}).unstack().sum()
| intopieces_id | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| shejiao_num | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
| cd | ||||||||||
| 0 | 0.605376 | 0.720542 | 0.807808 | 0.84358 | 0.706349 | 0.648104 | 0.620021 | 0.585443 | 0.75 | 0.68575 |
| 1 | 0.394624 | 0.279458 | 0.192192 | 0.15642 | 0.293651 | 0.351896 | 0.379979 | 0.414557 | 0.25 | 0.31425 |
shejiao_perc = dataset2.groupby(['cd','shejiao_num']).agg({'intopieces_id':'count'}).unstack()/dataset2.groupby(['cd','shejiao_num']).agg({'intopieces_id':'count'}).unstack().sum()
shejiao_perc.T.plot(kind='bar',stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x26f18dd8>
从上面观察考虑,认为同时拥有3个账号的客户,其所逾期的可能性会更低。同时怀疑,是否存在某一个账号是影响的关键?
组合-京东账号、淘宝账号:购物账号数量gouwu_num
# Row-wise shopping-account category built from the alipay_account and jd_account columns
# (argument order kept as in the original call).
dataset2['gouwu_num'] = [
    gouwu_func(first, second)
    for first, second in zip(dataset2['alipay_account'], dataset2['jd_account'])
]
dataset2.groupby(['cd','gouwu_num']).agg({'intopieces_id':'count'}).unstack().T.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x3bd39c50>
dataset2.groupby(['cd','gouwu_num']).agg({'intopieces_id':'count'}).unstack()
| intopieces_id | ||||||
|---|---|---|---|---|---|---|
| gouwu_num | 0 | 1 | 2 | 3 | 4 | 5 |
| cd | ||||||
| 0 | 1052 | 354 | 1331 | 5774 | 8 | 924615 |
| 1 | 308 | 60 | 181 | 2023 | 4 | 439175 |
dataset2.groupby(['cd','gouwu_num']).agg({'intopieces_id':'count'}).unstack()/dataset2.groupby(['cd','gouwu_num']).agg({'intopieces_id':'count'}).unstack().sum()
| intopieces_id | ||||||
|---|---|---|---|---|---|---|
| gouwu_num | 0 | 1 | 2 | 3 | 4 | 5 |
| cd | ||||||
| 0 | 0.773529 | 0.855072 | 0.880291 | 0.740541 | 0.666667 | 0.677975 |
| 1 | 0.226471 | 0.144928 | 0.119709 | 0.259459 | 0.333333 | 0.322025 |
gouwu_perc = dataset2.groupby(['cd','gouwu_num']).agg({'intopieces_id':'count'}).unstack()/dataset2.groupby(['cd','gouwu_num']).agg({'intopieces_id':'count'}).unstack().sum()
gouwu_perc.T.plot(kind='bar',stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x2ea68320>
组合-marriage、is_child、child_sum、support_pre_count:家庭结构hunyin_cata
dataset2['child_sum'].unique()
array([ 0., 2., 1., 3., 4., nan, 5., 8., 62., 6., 7.,
11., 68., 100., 22., 15., 12., 25., 113., 20., 10., 41.,
33., 14., 9., 21., 24., 23.])
# Row-wise family-structure category.
# Argument order: marriage, is_child, child_sum, support_pre_count
dataset2['hunyin_cata'] = [
    muti_family_func(marriage, is_child, child_sum, support_pre_count)
    for marriage, is_child, child_sum, support_pre_count in zip(
        dataset2['marriage'],
        dataset2['is_child'],
        dataset2['child_sum'],
        dataset2['support_pre_count'],
    )
]
dataset2.groupby(['hunyin_cata']).count()['intopieces_id']
hunyin_cata
0 2452
1 209424
2 1909
3 451980
4 625078
5 84042
Name: intopieces_id, dtype: int64
dataset2.groupby(['cd','hunyin_cata']).agg({'intopieces_id':'count'}).unstack().T.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x43d76b38>
dataset2.groupby(['cd','hunyin_cata']).agg({'intopieces_id':'count'}).unstack()/dataset2.groupby(['cd','hunyin_cata']).agg({'intopieces_id':'count'}).unstack().sum()
| intopieces_id | ||||||
|---|---|---|---|---|---|---|
| hunyin_cata | 0 | 1 | 2 | 3 | 4 | 5 |
| cd | ||||||
| 0 | 0.790783 | 0.67412 | 0.641173 | 0.610206 | 0.744593 | 0.565955 |
| 1 | 0.209217 | 0.32588 | 0.358827 | 0.389794 | 0.255407 | 0.434045 |
hunyin_perc = dataset2.groupby(['cd','hunyin_cata']).agg({'intopieces_id':'count'}).unstack()/dataset2.groupby(['cd','hunyin_cata']).agg({'intopieces_id':'count'}).unstack().sum()
hunyin_perc.T.plot(kind='bar',stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x2ead0eb8>
单列-月收入数据:收入分级monthincome_bins
dataset2['monthincome_bins'] = month_income(dataset2['month_income'])
dataset2.groupby(['monthincome_bins']).count()['intopieces_id']
monthincome_bins
0 93175
1 26904
2 475463
3 291892
4 191145
5 209742
6 74058
7 12506
Name: intopieces_id, dtype: int64
dataset2.groupby(['cd','monthincome_bins']).agg({'intopieces_id':'count'}).unstack()
| intopieces_id | ||||||||
|---|---|---|---|---|---|---|---|---|
| monthincome_bins | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
| cd | ||||||||
| 0 | 50840 | 18555 | 322996 | 204990 | 134515 | 147132 | 46699 | 7407 |
| 1 | 42335 | 8349 | 152467 | 86902 | 56630 | 62610 | 27359 | 5099 |
# fig7 = plt.figure()
# fig7.add_subplot(111)
dataset2.groupby(['cd','monthincome_bins']).agg({'intopieces_id':'count'}).unstack().T.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x2eb27470>
dataset2.groupby(['cd','monthincome_bins']).agg({'intopieces_id':'count'}).unstack()/dataset2.groupby(['cd','monthincome_bins']).agg({'intopieces_id':'count'}).unstack().sum()
| intopieces_id | ||||||||
|---|---|---|---|---|---|---|---|---|
| monthincome_bins | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
| cd | ||||||||
| 0 | 0.54564 | 0.689674 | 0.679329 | 0.70228 | 0.703733 | 0.70149 | 0.630573 | 0.592276 |
| 1 | 0.45436 | 0.310326 | 0.320671 | 0.29772 | 0.296267 | 0.29851 | 0.369427 | 0.407724 |
monthincome_perc = dataset2.groupby(['cd','monthincome_bins']).agg({'intopieces_id':'count'}).unstack()/dataset2.groupby(['cd','monthincome_bins']).agg({'intopieces_id':'count'}).unstack().sum()
monthincome_perc.T.plot(kind='bar',stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x2ecdb748>
组合-月收入、家庭支出:家庭支出比重zhichu_ratio
zhichu_ratio生成和分布情况观察
dataset2[dataset2['support_pre_pay']>dataset2['month_income']].head()
# Row-wise household expense share: support_pre_pay relative to month_income.
dataset2['zhichu_ratio'] = [
    income_ratio(expense, income)
    for expense, income in zip(dataset2['support_pre_pay'], dataset2['month_income'])
]
值分布图
fig81 = plt.figure(figsize=(16,10))
fig81.suptitle('家庭月支出比重(值分布)')
plt.tight_layout(pad=2) #设置默认的间距
plt.subplots_adjust(wspace=0.5, hspace=0.5)
fig81.add_subplot(241)
plot1 = sns.boxplot(y='zhichu_ratio',x='cd',data=dataset2)
plot1.set(ylim=(0,1))
plt.title('ALL')
fig81.add_subplot(242)
plot1 = sns.boxplot(y='zhichu_ratio',x='cd',data=dataset2[(dataset2['zhichu_ratio']== 0)])
plt.title('= 0')
fig81.add_subplot(243)
plot1 = sns.boxplot(y='zhichu_ratio',x='cd',data=dataset2[(dataset2['zhichu_ratio']> 0) & ((dataset2['zhichu_ratio']<=.01))])
plot1.set(ylim=(0,.01))
plt.title('0-0.01')
fig81.add_subplot(244)
plot1 = sns.boxplot(y='zhichu_ratio',x='cd',data=dataset2[(dataset2['zhichu_ratio']>.01) & (dataset2['zhichu_ratio']<=0.1)])
plot1.set(ylim=(0.01,0.1))
plt.title('0.01-0.1')
fig81.add_subplot(245)
plot1 = sns.boxplot(y='zhichu_ratio',x='cd',data=dataset2[(dataset2['zhichu_ratio']>0.1) & (dataset2['zhichu_ratio']<=1)])
plot1.set(ylim=(0,1))
plt.title('0.1-1')
fig81.add_subplot(246)
plot1 = sns.boxplot(y='zhichu_ratio',x='cd',data=dataset2[(dataset2['zhichu_ratio']>1) & (dataset2['zhichu_ratio']<=10)])
plot1.set(ylim=(0,10))
plt.title('1-10')
fig81.add_subplot(247)
plot1 = sns.boxplot(y='zhichu_ratio',x='cd',data=dataset2[(dataset2['zhichu_ratio']>10) & (dataset2['zhichu_ratio']<100)])
plot1.set(ylim=(10,100))
plt.title('10-100')
fig81.add_subplot(248)
plot1 = sns.boxplot(y='zhichu_ratio',x='cd',data=dataset2[(dataset2['zhichu_ratio']>100)])
plot1.set(ylim=(100,5000))
plt.title('>100')
Text(0.5,1,'>100')
频数统计
fig82 = plt.figure(figsize=(16,10))
fig82.suptitle('家庭月支出比重(频数)')
plt.tight_layout(pad=2) #设置默认的间距
plt.subplots_adjust(wspace=0.5, hspace=0.5)
fig82.add_subplot(241)
plot1 = sns.countplot(x='cd',data=dataset2)
plt.title('ALL')
fig82.add_subplot(242)
plot1 = sns.countplot(x='cd',data=dataset2[(dataset2['zhichu_ratio']== 0)])
plt.title('= 0')
fig82.add_subplot(243)
plot1 = sns.countplot(x='cd',data=dataset2[(dataset2['zhichu_ratio']> 0) & ((dataset2['zhichu_ratio']<=.01))])
plt.title('0-0.01')
fig82.add_subplot(244)
plot1 = sns.countplot(x='cd',data=dataset2[(dataset2['zhichu_ratio']>.01) & (dataset2['zhichu_ratio']<=0.1)])
# plot1.set(ylim=(0,1))
plt.title('0.01-0.1')
fig82.add_subplot(245)
plot1 = sns.countplot(x='cd',data=dataset2[(dataset2['zhichu_ratio']>0.1) & (dataset2['zhichu_ratio']<=1)])
# plot1.set(ylim=(0,1))
plt.title('0.1-1')
fig82.add_subplot(246)
plot1 = sns.countplot(x='cd',data=dataset2[(dataset2['zhichu_ratio']>1) & (dataset2['zhichu_ratio']<=10)])
# plot1.set(ylim=(0,1))
plt.title('1-10')
fig82.add_subplot(247)
plot1 = sns.countplot(x='cd',data=dataset2[(dataset2['zhichu_ratio']>10) & (dataset2['zhichu_ratio']<100)])
# plot1.set(ylim=(10,100))
plt.title('10-100')
fig82.add_subplot(248)
plot1 = sns.countplot(x='cd',data=dataset2[(dataset2['zhichu_ratio']>100)])
# plot1.set(ylim=(100,5000))
plt.title('>100')
Text(0.5,1,'>100')
结论与建议
结合原始数据(月收入month_income、家庭支出support_pre_pay)来看,两列中均存在超过10万的数值:一部分可能是线下录入有误,一部分可能是客户造假,还有极个别可能是客户填写的真实数值。基于此,两列相除之后,会相应出现比值大于100以及等于0的情况。从箱线图结果观察到,在比值>100的部分,正常与逾期客户的分布无明显区别。因此,针对家庭支出比重大于100的部分,可以考虑去除。
尝试(log变换)
dataset2['temp_log'] = dataset2['zhichu_ratio'].apply('log')
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\pandas\core\base.py:307: RuntimeWarning: divide by zero encountered in log
return f(self, *args, **kwargs)
值分布
fig84 = plt.figure(figsize=(16,8))
fig84.suptitle('家庭月支出log变换')
plt.tight_layout(pad=2) #设置默认的间距
plt.subplots_adjust(wspace=0.5, hspace=0.5)
fig84.add_subplot(241)
plot1 = sns.boxplot(y='temp_log',x='cd',data=dataset2)
# plot1.set(ylim=(0,1))
plt.title('ALL')
fig84.add_subplot(242)
plot1 = sns.boxplot(y='temp_log',x='cd',data=dataset2[(dataset2['zhichu_ratio']== 0)])
plt.title('= 0')
fig84.add_subplot(243)
plot1 = sns.boxplot(y='temp_log',x='cd',data=dataset2[(dataset2['zhichu_ratio']> 0) & ((dataset2['zhichu_ratio']<=.01))])
# plot1.set(ylim=(0,.01))
plt.title('0-0.01')
fig84.add_subplot(244)
plot1 = sns.boxplot(y='temp_log',x='cd',data=dataset2[(dataset2['zhichu_ratio']>.01) & (dataset2['zhichu_ratio']<=0.1)])
# plot1.set(ylim=(0.01,0.1))
plt.title('0.01-0.1')
fig84.add_subplot(245)
plot1 = sns.boxplot(y='temp_log',x='cd',data=dataset2[(dataset2['zhichu_ratio']>0.1) & (dataset2['zhichu_ratio']<=1)])
# plot1.set(ylim=(0,1))
plt.title('0.1-1')
fig84.add_subplot(246)
plot1 = sns.boxplot(y='temp_log',x='cd',data=dataset2[(dataset2['zhichu_ratio']>1) & (dataset2['zhichu_ratio']<=10)])
# plot1.set(ylim=(0,10))
plt.title('1-10')
fig84.add_subplot(247)
plot1 = sns.boxplot(y='temp_log',x='cd',data=dataset2[(dataset2['zhichu_ratio']>10) & (dataset2['zhichu_ratio']<100)])
# plot1.set(ylim=(10,100))
plt.title('10-100')
fig84.add_subplot(248)
plot1 = sns.boxplot(y='temp_log',x='cd',data=dataset2[(dataset2['zhichu_ratio']>100)])
# plot1.set(ylim=(100,5000))
plt.title('>100')
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\matplotlib\cbook\__init__.py:1847: RuntimeWarning: invalid value encountered in double_scalars
stats['iqr'] = q3 - q1
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\matplotlib\cbook\__init__.py:1872: RuntimeWarning: invalid value encountered in less_equal
wiskhi = np.compress(x <= hival, x)
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\matplotlib\cbook\__init__.py:1879: RuntimeWarning: invalid value encountered in greater_equal
wisklo = np.compress(x >= loval, x)
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\numpy\lib\function_base.py:4406: RuntimeWarning: invalid value encountered in multiply
x2 = take(ap, indices_above, axis=axis) * weights_above
Text(0.5,1,'>100')
结论与建议——(不推荐)
由上面的图表观察,log变换带来数据的区分度并不理想。因此,暂且不考虑log变换。
zhichu_ratio的衍生特征——zhichu_ratio_addcata
# Row-wise category of the household expense share.
dataset2['zhichu_ratio_addcata'] = [
    income_ratio_addcata(expense, income)
    for expense, income in zip(dataset2['support_pre_pay'], dataset2['month_income'])
]
fig85 = plt.figure(figsize=(8,8))
plt.tight_layout(pad=2) #设置默认的间距
plt.subplots_adjust(wspace=0.5, hspace=0.5)
fig85.add_subplot(121)
plot1 = sns.countplot(x='zhichu_ratio_addcata',hue='cd',data=dataset2, dodge=False)
# plot1.set(ylim=(0,1))
plt.title('ALL')
Text(0.5,1,'ALL')
变量重要性判别
从统计相关的角度分析单变量的影响
数据集中的社交数据和购物数据可认为是数值型,即连续-离散的变量分析
社交和购物数据视为连续—离散
# dataset2.info()
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import f_classif # 即假设检验-anova方法
# k_best = SelectKBest(f_classif, k='all')
# Kbest = k_best.fit(dataset2[['gouwu_num','shejiao_num']], dataset2[['cd']])
# Kbest.pvalues_
注:上方的方差分析(f_classif)代码已被注释,其输出未保留在本文中。按原先的运行结果,p<0.05,故认为两个变量对结果均有影响。
社交和购物数据视为离散—离散
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Take an explicit copy: assigning into a plain slice of dataset2 raises
# SettingWithCopyWarning and leaves it unclear which frame is modified.
dataset2_sub = dataset2[['gouwu_num','shejiao_num','cd']].copy()
dataset2_sub['gouwu_num'] = dataset2_sub['gouwu_num'].astype(str)
dataset2_sub['shejiao_num'] = dataset2_sub['shejiao_num'].astype(str)
#选择K个最好的特征,返回选择特征后的数据
model1 = SelectKBest(chi2, k='all')
model1.fit_transform(dataset2_sub[['gouwu_num','shejiao_num']], dataset2_sub['cd'])[1]
# 各特征的得分
model1.scores_, dataset2_sub[['gouwu_num','shejiao_num']].columns
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
import sys
(array([ 5.31019853, 341.66495808]),
Index(['gouwu_num', 'shejiao_num'], dtype='object'))
# 各特征的p值
model1.pvalues_ , dataset2_sub[['gouwu_num','shejiao_num']].columns
(array([2.12009402e-02, 2.76869376e-76]),
Index(['gouwu_num', 'shejiao_num'], dtype='object'))
从上面的结果,p<0.05,故认为两个变量对目标变量均有影响
家庭支出比重(zhichu_ratio)的连续—离散
家庭支出比重(ALL)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif # 即假设检验-anova方法
# One-way ANOVA F-test of each numeric column against the target.
k_best = SelectKBest(f_classif, k='all')
# Pass the target as a 1-D Series; a one-column DataFrame triggers
# sklearn's DataConversionWarning (column-vector y).
Kbest = k_best.fit(dataset2[['month_income','support_pre_pay', 'zhichu_ratio']], dataset2['cd'])
Kbest.pvalues_
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
array([4.99626685e-06, 9.29123318e-01, 2.43230145e-01])
从假设检验的分析结果中观察到,对于原始单列数据month_income,它对目标变量有一定影响;原始单列数据support_pre_pay、组合数据对目标变量无显著影响
家庭支出比重(0<ratio≤10)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif # 即假设检验-anova方法
# One-way ANOVA F-test restricted to rows with 0 < zhichu_ratio <= 10.
k_best = SelectKBest(f_classif, k='all')
# Upper bound is inclusive so the subset matches the (1, 10] bucket used by
# income_ratio_addcata.
dataset2_sub3 = dataset2[(dataset2['zhichu_ratio']>0)&(dataset2['zhichu_ratio']<=10)]
# Pass the target as a 1-D Series; a one-column DataFrame triggers
# sklearn's DataConversionWarning (column-vector y).
Kbest = k_best.fit(dataset2_sub3[['month_income','support_pre_pay', 'zhichu_ratio']], dataset2_sub3['cd'])
Kbest.pvalues_
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
array([4.55373808e-06, 3.58681469e-01, 6.01862819e-03])
家庭支出比重(0≤ratio≤10)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif # 即假设检验-anova方法
# One-way ANOVA F-test restricted to rows with 0 <= zhichu_ratio <= 10.
k_best = SelectKBest(f_classif, k='all')
# Upper bound is inclusive so the subset matches the (1, 10] bucket used by
# income_ratio_addcata.
dataset2_sub3 = dataset2[(dataset2['zhichu_ratio']>=0)&(dataset2['zhichu_ratio']<=10)]
# Pass the target as a 1-D Series; a one-column DataFrame triggers
# sklearn's DataConversionWarning (column-vector y).
Kbest = k_best.fit(dataset2_sub3[['month_income','support_pre_pay', 'zhichu_ratio']], dataset2_sub3['cd'])
Kbest.pvalues_
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
array([2.17489841e-06, 0.00000000e+00, 0.00000000e+00])
结论
分三种情况区别看待家庭支出比重,0,(0,10],(10,inf)。在对整体范围数据进行检验时,zhichu_ratio不显著,即难以判别对目标变量的影响;当在<10的情况下,zhichu_ratio显著,明显对目标变量有影响,尤其在=0的情况当中。
故建议添加一列特征作为辅助区分三种情况。
从多变量结合角度
离散—离散
[‘shejiao_num’,‘gouwu_num’,‘hunyin_cata’,‘monthincome_bins’,‘zhichu_ratio_addcata’]
社交、购物、家庭结构、收入分级、家庭比重(衍生类别)等离散变量判断。
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Take an explicit copy: assigning into a plain slice of dataset2 raises
# SettingWithCopyWarning and leaves it unclear which frame is modified.
dataset2_sub = dataset2[['gouwu_num','shejiao_num','hunyin_cata','monthincome_bins','zhichu_ratio_addcata', 'cd']].copy()
dataset2_sub['gouwu_num'] = dataset2_sub['gouwu_num'].astype(str)
dataset2_sub['shejiao_num'] = dataset2_sub['shejiao_num'].astype(str)
#选择K个最好的特征,返回选择特征后的数据
model1 = SelectKBest(chi2, k='all')
model1.fit_transform(dataset2_sub[['gouwu_num','shejiao_num','hunyin_cata','monthincome_bins','zhichu_ratio_addcata']], dataset2_sub['cd'])[1]
# 各特征的得分
model1.scores_, dataset2_sub[['gouwu_num','shejiao_num','hunyin_cata','monthincome_bins','zhichu_ratio_addcata']].columns
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
import sys
(array([5.31019853e+00, 3.41664958e+02, 5.59435501e+02, 1.38677840e+03,
6.14311476e+04]),
Index(['gouwu_num', 'shejiao_num', 'hunyin_cata', 'monthincome_bins',
'zhichu_ratio_addcata'],
dtype='object'))
# 各特征的p值
model1.pvalues_ , dataset2_sub[['gouwu_num','shejiao_num','hunyin_cata','monthincome_bins','zhichu_ratio_addcata']].columns
(array([2.12009402e-002, 2.76869376e-076, 1.11536417e-123, 1.56862997e-303,
0.00000000e+000]),
Index(['gouwu_num', 'shejiao_num', 'hunyin_cata', 'monthincome_bins',
'zhichu_ratio_addcata'],
dtype='object'))
采用高级统计方法识别特征效用
from sklearn.linear_model import RandomizedLogisticRegression
r_logistic = RandomizedLogisticRegression(C=1,
scaling=0.5,
sample_fraction=0.75,
n_resampling=100,
# selection_thershold=0.1,
fit_intercept=True,
verbose=True,
# normolize=True,
random_state=1234,
n_jobs=1)
C:\Users\M4500\Anaconda3\envs\py35_xgboost\lib\site-packages\sklearn\utils\deprecation.py:58: DeprecationWarning: Class RandomizedLogisticRegression is deprecated; The class RandomizedLogisticRegression is deprecated in 0.19 and will be removed in 0.21.
warnings.warn(msg, category=DeprecationWarning)
r_logistic.fit(dataset2_sub[['gouwu_num','shejiao_num','hunyin_cata','monthincome_bins','zhichu_ratio_addcata']], dataset2_sub['cd'])
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 5.0min finished
RandomizedLogisticRegression(C=1, fit_intercept=True, memory=None, n_jobs=1,
n_resampling=100, normalize=True, pre_dispatch='3*n_jobs',
random_state=1234, sample_fraction=0.75, scaling=0.5,
selection_threshold=0.25, tol=0.001, verbose=True)
r_logistic.get_support()
array([ True, True, True, True, True])
r_logistic.all_scores_
array([[1. ],
[0.9 ],
[0.67],
[1. ],
[1. ]])
从结果观察可知,对于5个离散特征['gouwu_num','shejiao_num','hunyin_cata','monthincome_bins','zhichu_ratio_addcata'],除了'hunyin_cata'的重要性得分较低(0.67)之外,其他变量的重要性得分都很高(0.9~1.0)。对于连续特征'zhichu_ratio',在单因素分析评估中影响显著。
故可考虑将6个衍生特征视为重要特征使用。
结论
综合以上的判别和评估,认为社交数量、购物账号数量、收入分级、家庭支出比重及其类型的重要性最为主要,其次是家庭结构。因此,认为所有的衍生特征均可考虑添加到模型当中进一步测试和评估它们的贡献程度。