医药统计项目联系QQ:231469242
如果样本量太小,数据必须做分段化处理,否则会有很多空缺数据,woe效果不能有效发挥
随机森林结果
iv》0.02的因子在随机森林结果里都属于有效因子,但是随机森林重要性最强的因子没有出现在有效iv参数里,说明这些缺失重要变量没有做分段处理,数据离散造成。
数据文件
脚本备份
step1_customers_split_goodOrBad.py
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
# -*- coding: utf-8 -*-"""Created on Sun Jan 14 21:45:43 2018@author QQ:231469242把数据源分类为两个Excel,好客户Excel数据和坏客户Excel数据"""import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#读取文件readFileName="breast_cancer_总.xlsx"#保存文件saveFileName_good="result_good.xlsx"saveFileName_bad="result_bad.xlsx"#读取exceldf=pd.read_excel(readFileName)#帅选数据df_good=df[df.diagnosis=="B"]df_bad=df[df.diagnosis=="M"]#保存数据df_good.to_excel(saveFileName_good, sheet_name='Sheet1')df_bad.to_excel(saveFileName_bad, sheet_name='Sheet1') |
step2_automate_find_informative_variables.py
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
# -*- coding: utf-8 -*-"""Created on Sun Jan 14 22:13:30 2018@author: QQ:231469242woe负数,好客户<坏客户woe正数,好客户>坏客户"""import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
#创建save文件newFile=os.mkdir("save/") #读取文件FileName_good="result_good.xlsx"FileName_bad="result_bad.xlsx"#保存文件saveFileName="result_woe_iv.xlsx"#读取exceldf_good=pd.read_excel(FileName_good)df_bad=pd.read_excel(FileName_bad)#所有变量列表list_columns=list(df_good.columns[:-1])index=0def Ratio_goodDevideBad(index):
#第一列字段名(好客户属性) columnName=list(df_good.columns)[index] #第一列好客户内容和第二列坏客户内容 column_goodCustomers=df_good[columnName] column_badCustomers=df_bad[columnName] #去掉NAN num_goodCustomers=column_goodCustomers.dropna() #统计数量 num_goodCustomers=num_goodCustomers.size #去掉NAN num_badCustomers=column_badCustomers.dropna() #统计数量 num_badCustomers=num_badCustomers.size #第一列频率分析 frenquency_goodCustomers=column_goodCustomers.value_counts() #第二列频率分析 frenquency_badCustomers=column_badCustomers.value_counts() #各个元素占比 ratio_goodCustomers=frenquency_goodCustomers/num_goodCustomers ratio_badCustomers=frenquency_badCustomers/num_badCustomers #最终好坏比例 ratio_goodDevideBad=ratio_goodCustomers/ratio_badCustomers return (columnName,num_goodCustomers,num_badCustomers,frenquency_goodCustomers,frenquency_badCustomers,ratio_goodCustomers,ratio_badCustomers,ratio_goodDevideBad)
#woe函数,阵列计算def Woe(ratio_goodDevideBad):
woe=np.log(ratio_goodDevideBad) return woe
''' #iv函数,阵列计算def Iv(woe): iv=(ratio_goodCustomers-ratio_badCustomers)*woe return iv '''#iv参数评估,参数iv_sum(变量iv总值)def Iv_estimate(iv_sum):
#如果iv值大于0.02,为有效因子 if iv_sum>0.02:
print("informative") return "A"
#评估能力一般 else: print("not informative") return "B"
'''#详细参数输出def Print(): print ("columnName:",columnName) Iv_estimate(iv_sum) print("iv_sum",iv_sum) #print("",) #print("",) ''' #详细参数保存到excel,save文件里 def Write_singleVariable_to_Excel(index):
#index为变量索引,第一个变量,index=0 ratio=Ratio_goodDevideBad(index) columnName,num_goodCustomers,num_badCustomers,frenquency_goodCustomers,frenquency_badCustomers,ratio_goodCustomers,ratio_badCustomers,ratio_goodDevideBad=ratio[0],ratio[1],ratio[2],ratio[3],ratio[4],ratio[5],ratio[6],ratio[7] woe=Woe(ratio_goodDevideBad) iv=(ratio_goodCustomers-ratio_badCustomers)*woe df_woe_iv=pd.DataFrame({"num_goodCustomers":num_goodCustomers,"num_badCustomers":num_badCustomers,"frenquency_goodCustomers":frenquency_goodCustomers, "frenquency_badCustomers":frenquency_badCustomers,"ratio_goodCustomers":ratio_goodCustomers, "ratio_badCustomers":ratio_badCustomers,"ratio_goodDevideBad":ratio_goodDevideBad, "woe":woe,"iv":iv},columns=["num_goodCustomers","num_badCustomers","frenquency_goodCustomers","frenquency_badCustomers", "ratio_goodCustomers","ratio_badCustomers","ratio_goodDevideBad","woe","iv"]) #sort_values(by=...)用于对指定字段排序 df_sort=df_woe_iv.sort_values(by='iv',ascending=False) #ratio_badDevideGood数据写入到result_compare_badDevideGood.xlsx文件 df_sort.to_excel("save/"+columnName+".xlsx") #计算iv总和,评估整体变量 iv_sum=sum([i for i in iv if np.isnan(i)!=True])
print ("变量:",columnName)
#iv参数评估,参数iv_sum(变量iv总值) iv_estimate=Iv_estimate(iv_sum) print("iv_sum",iv_sum) return iv_estimate,columnName
#y\有价值变量列表存储器list_Informative_variables=[]#写入所有变量参数,保存到excel里,save文件for i in range(len(list_columns)):
status=Write_singleVariable_to_Excel(i)[0] columnName=Write_singleVariable_to_Excel(i)[1] if status=="A":
list_Informative_variables.append(columnName) |
最终得到一部分有效因子,共12个,经过数据分段化处理,会得到更多有效因子。