import os
import pandas as pd

os.chdir('E:\BaiduNetdiskDownload\违约贷款案例')

# ## 1.1导入数据
loanfile = os.listdir()
createVar = locals()
for i in loanfile:
    if i.endswith("csv"):        
        createVar[i.split('.')[0]] = pd.read_csv(i, encoding = 'gbk')
        print(i.split('.')[0])# 将数据表依次导入并写出表名

counts
card
clients
disp
district
loans
order
RFM_TRAD_FLOW
trans

# ## 1.2、生成被解释变量bad_good
bad_good = {'B':1, 'D':1, 'A':0, 'C': 2} #A为正常还款,BD违约,C还在还款
loans['bad_good'] = loans.status.map(bad_good)
loans.head()

loan_id account_id date amount duration payments status bad_good
0 5314 1787 1993-07-05 96396 12 8033 B 1
1 5316 1801 1993-07-11 165960 36 4610 A 0
2 6863 9188 1993-07-28 127080 60 2118 A 0
3 5325 1843 1993-08-03 105804 36 2939 A 0
4 7240 11013 1993-09-06 274740 60 4579 A 0

# ## 1.3、借款人的年龄、性别

data2 = pd.merge(loans, disp, on = 'account_id', how = 'left') #表之间关联起来,用left连接

data2 = pd.merge(data2, clients, on = 'client_id', how = 'left')

data2=data2[data2.type=='所有者']

data2.head()

loan_id account_id date amount duration payments status bad_good disp_id client_id type sex birth_date district_id
0 5314 1787 1993-07-05 96396 12 8033 B 1 2166 2166 所有者 1947-07-22 30
1 5316 1801 1993-07-11 165960 36 4610 A 0 2181 2181 所有者 1968-07-22 46
2 6863 9188 1993-07-28 127080 60 2118 A 0 11006 11314 所有者 1936-06-02 45
3 5325 1843 1993-08-03 105804 36 2939 A 0 2235 2235 所有者 1940-04-20 14
4 7240 11013 1993-09-06 274740 60 4579 A 0 13231 13539 所有者 1978-09-07 63

# ## 1.4、借款人居住地的经济状况
data3 = pd.merge(data2, district, left_on = 'district_id', right_on = 'A1', how = 'left')
data3.head()

loan_id account_id date amount duration payments status bad_good disp_id client_id ... A1 GDP A4 A10 A11 A12 A13 A14 A15 a16
0 5314 1787 1993-07-05 96396 12 8033 B 1 2166 2166 ... 30 16979 94812 81.8 9650 3.38 3.67 100 15.7 14.8
1 5316 1801 1993-07-11 165960 36 4610 A 0 2181 2181 ... 46 14111 112709 73.5 8369 1.79 2.31 117 12.7 11.6
2 6863 9188 1993-07-28 127080 60 2118 A 0 11006 11314 ... 45 12888 77917 53.5 8390 2.28 2.89 132 13.3 13.6
3 5325 1843 1993-08-03 105804 36 2939 A 0 2235 2235 ... 14 31891 177686 74.8 10045 1.42 1.71 135 18.6 17.7
4 7240 11013 1993-09-06 274740 60 4579 A 0 13231 13539 ... 63 11322 86513 50.5 8288 3.79 4.52 110 9.0 8.4

# ## 1.5、贷款前一年内的账户平均余额、余额的标准差、变异系数、平均收入和平均支出的比例

data_4temp1 = pd.merge(loans[['account_id', 'date']],trans[['account_id','type','amount','balance','date']],                       on = 'account_id')
data_4temp1.columns = ['account_id', 'date', 'type', 'amount', 'balance', 't_date']
data_4temp1 = data_4temp1.sort_values(by = ['account_id','t_date'])
data_4temp1['date']=pd.to_datetime(data_4temp1['date'])
data_4temp1['t_date']=pd.to_datetime(data_4temp1['t_date'])
data_4temp1.tail()
account_id date type amount balance t_date
127263 11362 1996-12-27 $56 $51420 1998-12-08
127264 11362 1996-12-27 $4,780 $46640 1998-12-10
127265 11362 1996-12-27 $5,392 $41248 1998-12-12
127266 11362 1996-12-27 $2,880 $38368 1998-12-19
127267 11362 1996-12-27 $163 $38531 1998-12-31

# ## 将对账户余额进行清洗

data_4temp1['balance2'] = data_4temp1['balance'].map(lambda x: int(''.join(x[1:].split(','))))

data_4temp1['amount2'] = data_4temp1['amount'].map(lambda x: int(''.join(x[1:].split(','))))

data_4temp1.tail()

account_id date type amount balance t_date balance2 amount2
127263 11362 1996-12-27 $56 $51420 1998-12-08 51420 56
127264 11362 1996-12-27 $4,780 $46640 1998-12-10 46640 4780
127265 11362 1996-12-27 $5,392 $41248 1998-12-12 41248 5392
127266 11362 1996-12-27 $2,880 $38368 1998-12-19 38368 2880
127267 11362 1996-12-27 $163 $38531 1998-12-31 38531 163

len(data_4temp1)#查看交易数据量

191556

# ## 根据取数窗口提取交易数据,只要贷款前1年时间内的数据
import datetime
data_4temp2 = data_4temp1[data_4temp1.date>data_4temp1.t_date][
    data_4temp1.date<data_4temp1.t_date+datetime.timedelta(days=365)]
data_4temp2.tail()

account_id date type amount balance t_date balance2 amount2
127026 11362 1996-12-27 $129 $39766 1996-12-06 39766 129
127027 11362 1996-12-27 $10400 $29366 1996-12-07 29366 10400
127028 11362 1996-12-27 $330 $29036 1996-12-07 29036 330
127029 11362 1996-12-27 $56 $28980 1996-12-08 28980 56
127030 11362 1996-12-27 $4,780 $24200 1996-12-10 24200 4780

# ### 1.5.1账户平均余额、余额的标准差、变异系数

data_4temp3 = data_4temp2.groupby('account_id')['balance2'].agg([('avg_balance','mean'), ('stdev_balance','std')])

data_4temp3['cv_balance'] = data_4temp3[['avg_balance','stdev_balance']].apply(lambda x: x[1]/x[0],axis = 1)

data_4temp3.head()

avg_balance stdev_balance cv_balance
account_id      
2 32590.759259 12061.802206 0.370099
19 25871.223684 15057.521648 0.582018
25 56916.984496 21058.667949 0.369989
37 36658.981308 20782.996690 0.566928
38 31383.581818 10950.723180 0.348932
# ### 1.5.2 平均支出和平均收入的比例
type_dict = {'借':'out','贷':'income'}
data_4temp2['type1'] = data_4temp2.type.map(type_dict)
data_4temp4 = data_4temp2.groupby(['account_id','type1'])[['amount2']].sum()
data_4temp4.head()
  amount2
account_id type1  
2 income 276514
out 153020
19 income 254255
out 198020
25 income 726479
#转置
data_4temp5 = pd.pivot_table(data_4temp4, values = 'amount2',index = 'account_id', columns = 'type1')
data_4temp5.fillna(0, inplace = True)
data_4temp5['r_out_in'] = data_4temp5[['out','income']].apply(lambda x: x[0]/x[1], axis = 1)data_4temp5.head()
type1 income out r_out_in
account_id      
2 276514.0 153020.0 0.553390
19 254255.0 198020.0 0.778824
25 726479.0 629108.0 0.865969
37 386357.0 328541.0 0.850356
38 154300.0 105091.0 0.681082

#添加到预测变量

data4 = pd.merge(data3, data_4temp3, left_on='account_id', right_index= True, how = 'left')

data4 = pd.merge(data4, data_4temp5, left_on='account_id', right_index= True, how = 'left')

data4.head()

loan_id account_id date amount duration payments status bad_good disp_id client_id ... A13 A14 A15 a16 avg_balance stdev_balance cv_balance income out r_out_in
0 5314 1787 1993-07-05 96396 12 8033 B 1 2166 2166 ... 3.67 100 15.7 14.8 12250.000000 8330.866301 0.680071 20100.0 0.0 0.000000
1 5316 1801 1993-07-11 165960 36 4610 A 0 2181 2181 ... 2.31 117 12.7 11.6 43975.810811 25468.748605 0.579154 243576.0 164004.0 0.673318
2 6863 9188 1993-07-28 127080 60 2118 A 0 11006 11314 ... 2.89 132 13.3 13.6 30061.041667 11520.127013 0.383224 75146.0 54873.0 0.730219
3 5325 1843 1993-08-03 105804 36 2939 A 0 2235 2235 ... 1.71 135 18.6 17.7 41297.640000 14151.357776 0.342667 120310.0 86018.0 0.714970
4 7240 11013 1993-09-06 274740 60 4579 A 0 13231 13539 ... 4.52 110 9.0 8.4 49780.777778 22172.541600 0.445404 276327.0 235214.0 0.851216

# ## 1.6、计算贷存比,贷收比

data4['r_lb'] = data4[['amount','avg_balance']].apply(lambda x: x[0]/x[1],axis = 1)

data4['r_lincome'] = data4[['amount','income']].apply(lambda x: x[0]/x[1],axis = 1)

data4.head()

loan_id account_id date amount duration payments status bad_good disp_id client_id ... A15 a16 avg_balance stdev_balance cv_balance income out r_out_in r_lb r_lincome
0 5314 1787 1993-07-05 96396 12 8033 B 1 2166 2166 ... 15.7 14.8 12250.000000 8330.866301 0.680071 20100.0 0.0 0.000000 7.869061 4.795821
1 5316 1801 1993-07-11 165960 36 4610 A 0 2181 2181 ... 12.7 11.6 43975.810811 25468.748605 0.579154 243576.0 164004.0 0.673318 3.773893 0.681348
2 6863 9188 1993-07-28 127080 60 2118 A 0 11006 11314 ... 13.3 13.6 30061.041667 11520.127013 0.383224 75146.0 54873.0 0.730219 4.227398 1.691108
3 5325 1843 1993-08-03 105804 36 2939 A 0 2235 2235 ... 18.6 17.7 41297.640000 14151.357776 0.342667 120310.0 86018.0 0.714970 2.561987 0.879428
4 7240 11013 1993-09-06 274740 60 4579 A 0 13231 13539 ... 9.0 8.4 49780.777778 22172.541600 0.445404 276327.0 235214.0 0.851216 5.518998 0.994257

# # 2 构建Logistic模型

data4.columns #查看预测变量

Index(['loan_id', 'account_id', 'date', 'amount', 'duration', 'payments',
       'status', 'bad_good', 'disp_id', 'client_id', 'type', 'sex',
       'birth_date', 'district_id', 'A1', 'GDP', 'A4', 'A10', 'A11', 'A12',
       'A13', 'A14', 'A15', 'a16', 'avg_balance', 'stdev_balance',
       'cv_balance', 'income', 'out', 'r_out_in', 'r_lb', 'r_lincome'],
      dtype='object')

# •提取状态为C的用于预测。其它样本随机抽样,建立训练集与测试集
data_model=data4[data4.status!='C']
for_predict=data4[data4.status=='C']

train = data_model.sample(frac=0.7, random_state=1235).copy()# 随机抽样 70%
test = data_model[~ data_model.index.isin(train.index)].copy()
print(' 训练集样本量: %i \n 测试集样本量: %i' %(len(train), len(test)))

 训练集样本量: 195 
 测试集样本量: 84

# 向前法 aic最小为标准,编写函数来遍历变量筛选变量
def forward_select(data, response):
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = float('inf'), float('inf')
    while remaining:
        aic_with_candidates=[]
        for candidate in remaining:
            formula = "{} ~ {}".format(
                response,' + '.join(selected + [candidate]))
            aic = smf.glm(
                formula=formula, data=data, 
                family=sm.families.Binomial(sm.families.links.logit)
            ).fit().aic
            aic_with_candidates.append((aic, candidate))
        aic_with_candidates.sort(reverse=True)
        best_new_score, best_candidate=aic_with_candidates.pop()
        if current_score > best_new_score: 
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print ('aic is {},continuing!'.format(current_score))
        else:        
            print ('forward selection over!')
            break
            
    formula = "{} ~ {} ".format(response,' + '.join(selected))
    print('final formula is {}'.format(formula))
    model = smf.glm(
        formula=formula, data=data, 
        family=sm.families.Binomial(sm.families.links.logit)
    ).fit()
    return(model)

 

candidates = ['bad_good', 'A1', 'GDP', 'A4', 'A10', 'A11', 'A12','amount', 'duration',
       'A13', 'A14', 'A15', 'a16', 'avg_balance', 'stdev_balance',
       'cv_balance', 'income', 'out', 'r_out_in', 'r_lb', 'r_lincome']
data_for_select = train[candidates]

lg_m1 = forward_select(data=data_for_select, response='bad_good')
lg_m1.summary().tables[1]

aic is 167.43311432504638,continuing!
aic is 135.82435856041837,continuing!
forward selection over!
final formula is bad_good ~ r_lb + cv_balance  #结论显示贷存比,存款的变异系数  对预测变量影响最大

  coef std err z P>|z| [0.025 0.975]
Intercept -7.4260 1.125 -6.599 0.000 -9.632 -5.220
r_lb 0.4391 0.094 4.651 0.000 0.254 0.624
cv_balance 10.1355 2.094 4.841 0.000 6.032 14.239

 

import sklearn.metrics as metrics#验证,查看roc曲线
import matplotlib.pyplot as plt
fpr, tpr, th = metrics.roc_curve(test.bad_good, lg_m1.predict(test))
plt.figure(figsize=[6, 6])
plt.plot(fpr, tpr, 'b--')
plt.title('ROC curve')
plt.show()

个人贷款违约预测模型

print('AUC = %.4f' %metrics.auc(fpr, tpr))#信用评级,一般大于0.8就可以了

AUC = 0.8846

for_predict['prob']=lg_m1.predict(for_predict) #使用模型预测用户的还款概率
for_predict[['account_id','prob']].head()

account_id prob
23 1071 0.704914
30 5313 0.852249
38 10079 0.118128
39 5385 0.177591
42 8321 0.024302

 

相关文章: