赛题链接: https://tianchi.aliyun.com/competition/entrance/231593/introduction
天池的这个比赛用于学习数据挖掘是一个很好的例子,该题核心是对特征的处理与提取。
从大神代码里习得了xgboost, 学习了特征处理的精髓后,训练调优了一版GBDT。 效果还不错:
最好成绩101:
数据处理部分:
1 # -*- coding: utf-8 -*- 2 """ 3 Created on Wed Dec 4 14:30:36 2019 4 5 @author: wenzhe.tian 6 7 8 本赛题提供用户在2016年1月1日至2016年6月30日之间真实线上线下消费行为,预测用户在2016年7月领取优惠券后15天以内的使用情况。 9 注意: 为了保护用户和商家的隐私,所有数据均作匿名处理,同时采用了有偏采样和必要过滤。 10 11 用户消费日期特征添加:法定节假日? 12 13 """ 14 15 import pandas as pd 16 import numpy as np 17 from datetime import date 18 19 # 数据导入 20 off_train = pd.read_csv('data/ccf_offline_stage1_train.csv') 21 off_train.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date'] 22 23 off_test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv') 24 off_test.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received'] 25 26 on_train = pd.read_csv('data/ccf_online_stage1_train.csv') 27 on_train.columns = ['user_id','merchant_id','action','coupon_id','discount_rate','date_received','date'] 28 29 # 日期格式处理 30 off_train['date']= off_train['date'].fillna('null') 31 off_train['date_received']= off_train['date_received'].fillna('null') 32 off_train.date= off_train.date.astype('str') 33 off_train.date_received= off_train.date_received.astype('str') 34 35 # 时间差分 3个预测模型 36 dataset3 = off_test 37 feature3 = off_train[((off_train.date>='20160315')&(off_train.date<='20160630'))|((off_train.date=='null')&(off_train.date_received>='20160315')&(off_train.date_received<='20160630'))] 38 dataset2 = off_train[(off_train.date_received>='20160515')&(off_train.date_received<='20160615')] 39 feature2 = off_train[(off_train.date>='20160201')&(off_train.date<='20160514')|((off_train.date=='null')&(off_train.date_received>='20160201')&(off_train.date_received<='20160514'))] 40 dataset1 = off_train[(off_train.date_received>='20160414')&(off_train.date_received<='20160514')] 41 feature1 = off_train[(off_train.date>='20160101')&(off_train.date<='20160413')|((off_train.date=='null')&(off_train.date_received>='20160101')&(off_train.date_received<='20160413'))] 42 43 44 ############# other feature ##################3 45 """ 46 5. other feature: 47 this_month_user_receive_all_coupon_count 48 this_month_user_receive_same_coupon_count 49 this_month_user_receive_same_coupon_lastone 50 this_month_user_receive_same_coupon_firstone 51 this_day_user_receive_all_coupon_count 52 this_day_user_receive_same_coupon_count 53 day_gap_before, day_gap_after (receive the same coupon) 54 """ 55 56 #for dataset3 (标签) 需写出目的 57 # 按照用户id 进行重组统计齐获得优惠券的数目 58 t = dataset3[['user_id']] 59 t['this_month_user_receive_all_coupon_count'] = 1 60 t = t.groupby('user_id').agg('sum').reset_index() 61 62 # 按照用户id和优惠券id(类型) 进行重组统计齐获得相同优惠券的数目 63 t1 = dataset3[['user_id','coupon_id']] 64 t1['this_month_user_receive_same_coupon_count'] = 1 65 t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index() 66 67 #按照两个id 对收到优惠券的时间统计并按照:隔开 68 t2 = dataset3[['user_id','coupon_id','date_received']] 69 t2.date_received = t2.date_received.astype('str') 70 t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index() 71 #统计收到的优惠券数量 72 t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':'))) 73 #只要发了2张及以上的数据 74 t2 = t2[t2.receive_number>1] 75 #添加最早和最晚获取优惠券的时间数据 并只取这两个数据和两个id作为t2 76 t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')])) 77 t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')])) 78 t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']] 79 80 # t2按照两个id 81 t3 = dataset3[['user_id','coupon_id','date_received']] 82 t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left') 83 84 t3.max_date_received=t3.max_date_received.astype('float') 85 t3.date_received=t3.date_received.astype('float') 86 t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received 87 t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received - t3.min_date_received 88 def is_firstlastone(x): 89 if x==0: 90 return 1 91 elif x>0: 92 return 0 93 else: 94 return -1 #those only receive once 95 96 t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone) 97 t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone) 98 t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']] 99 100 t4 = dataset3[['user_id','date_received']] 101 t4['this_day_user_receive_all_coupon_count'] = 1 102 t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index() 103 104 t5 = dataset3[['user_id','coupon_id','date_received']] 105 t5['this_day_user_receive_same_coupon_count'] = 1 106 t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index() 107 108 t6 = dataset3[['user_id','coupon_id','date_received']] 109 t6.date_received = t6.date_received.astype('str') 110 t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index() 111 t6.rename(columns={'date_received':'dates'},inplace=True) 112 113 def get_day_gap_before(s): 114 date_received,dates = s.split('-') 115 dates = dates.split(':') 116 gaps = [] 117 for d in dates: 118 this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days 119 if this_gap>0: 120 gaps.append(this_gap) 121 if len(gaps)==0: 122 return -1 123 else: 124 return min(gaps) 125 126 def get_day_gap_after(s): 127 date_received,dates = s.split('-') 128 dates = dates.split(':') 129 gaps = [] 130 for d in dates: 131 this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days 132 if this_gap>0: 133 gaps.append(this_gap) 134 if len(gaps)==0: 135 return -1 136 else: 137 return min(gaps) 138 139 140 t7 = dataset3[['user_id','coupon_id','date_received']] 141 t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left') 142 t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates 143 t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before) 144 t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after) 145 t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']] 146 147 other_feature3 = pd.merge(t1,t,on='user_id') 148 other_feature3 = pd.merge(other_feature3,t3,on=['user_id','coupon_id']) 149 other_feature3 = pd.merge(other_feature3,t4,on=['user_id','date_received']) 150 other_feature3 = pd.merge(other_feature3,t5,on=['user_id','coupon_id','date_received']) 151 other_feature3 = pd.merge(other_feature3,t7,on=['user_id','coupon_id','date_received']) 152 other_feature3.to_csv('data/other_feature3.csv',index=None) 153 print (other_feature3.shape) 154 155 156 157 #for dataset2 158 t = dataset2[['user_id']] 159 t['this_month_user_receive_all_coupon_count'] = 1 160 t = t.groupby('user_id').agg('sum').reset_index() 161 162 t1 = dataset2[['user_id','coupon_id']] 163 t1['this_month_user_receive_same_coupon_count'] = 1 164 t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index() 165 166 t2 = dataset2[['user_id','coupon_id','date_received']] 167 t2.date_received = t2.date_received.astype('str') 168 t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index() 169 t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':'))) 170 t2 = t2[t2.receive_number>1] 171 t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')])) 172 t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')])) 173 t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']] 174 175 t3 = dataset2[['user_id','coupon_id','date_received']] 176 t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left') 177 t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype('float').astype('int') 178 t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype('float').astype('int') - t3.min_date_received 179 def is_firstlastone(x): 180 if x==0: 181 return 1 182 elif x>0: 183 return 0 184 else: 185 return -1 #those only receive once 186 187 t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone) 188 t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone) 189 t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']] 190 191 t4 = dataset2[['user_id','date_received']] 192 t4['this_day_user_receive_all_coupon_count'] = 1 193 t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index() 194 195 t5 = dataset2[['user_id','coupon_id','date_received']] 196 t5['this_day_user_receive_same_coupon_count'] = 1 197 t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index() 198 199 t6 = dataset2[['user_id','coupon_id','date_received']] 200 t6.date_received = t6.date_received.astype('str') 201 t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index() 202 t6.rename(columns={'date_received':'dates'},inplace=True) 203 204 def get_day_gap_before(s): 205 date_received,dates = s.split('-') 206 dates = dates.split(':') 207 gaps = [] 208 for d in dates: 209 this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days 210 if this_gap>0: 211 gaps.append(this_gap) 212 if len(gaps)==0: 213 return -1 214 else: 215 return min(gaps) 216 217 def get_day_gap_after(s): 218 date_received,dates = s.split('-') 219 dates = dates.split(':') 220 gaps = [] 221 for d in dates: 222 this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days 223 if this_gap>0: 224 gaps.append(this_gap) 225 if len(gaps)==0: 226 return -1 227 else: 228 return min(gaps) 229 230 231 t7 = dataset2[['user_id','coupon_id','date_received']] 232 t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left') 233 t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates 234 t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before) 235 t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after) 236 t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']] 237 238 other_feature2 = pd.merge(t1,t,on='user_id') 239 other_feature2 = pd.merge(other_feature2,t3,on=['user_id','coupon_id']) 240 other_feature2 = pd.merge(other_feature2,t4,on=['user_id','date_received']) 241 other_feature2 = pd.merge(other_feature2,t5,on=['user_id','coupon_id','date_received']) 242 other_feature2 = pd.merge(other_feature2,t7,on=['user_id','coupon_id','date_received']) 243 other_feature2.to_csv('data/other_feature2.csv',index=None) 244 print (other_feature2.shape) 245 246 247 248 #for dataset1 249 t = dataset1[['user_id']] 250 t['this_month_user_receive_all_coupon_count'] = 1 251 t = t.groupby('user_id').agg('sum').reset_index() 252 253 t1 = dataset1[['user_id','coupon_id']] 254 t1['this_month_user_receive_same_coupon_count'] = 1 255 t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index() 256 257 t2 = dataset1[['user_id','coupon_id','date_received']] 258 t2.date_received = t2.date_received.astype('str') 259 t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index() 260 t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':'))) 261 t2 = t2[t2.receive_number>1] 262 t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')])) 263 t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')])) 264 t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']] 265 266 t3 = dataset1[['user_id','coupon_id','date_received']] 267 t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left') 268 t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype('float').astype('int') 269 t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype('float').astype('int') - t3.min_date_received 270 def is_firstlastone(x): 271 if x==0: 272 return 1 273 elif x>0: 274 return 0 275 else: 276 return -1 #those only receive once 277 278 t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone) 279 t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone) 280 t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']] 281 282 t4 = dataset1[['user_id','date_received']] 283 t4['this_day_user_receive_all_coupon_count'] = 1 284 t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index() 285 286 t5 = dataset1[['user_id','coupon_id','date_received']] 287 t5['this_day_user_receive_same_coupon_count'] = 1 288 t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index() 289 290 t6 = dataset1[['user_id','coupon_id','date_received']] 291 t6.date_received = t6.date_received.astype('str') 292 t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index() 293 t6.rename(columns={'date_received':'dates'},inplace=True) 294 295 def get_day_gap_before(s): 296 date_received,dates = s.split('-') 297 dates = dates.split(':') 298 gaps = [] 299 for d in dates: 300 this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days 301 if this_gap>0: 302 gaps.append(this_gap) 303 if len(gaps)==0: 304 return -1 305 else: 306 return min(gaps) 307 308 def get_day_gap_after(s): 309 date_received,dates = s.split('-') 310 dates = dates.split(':') 311 gaps = [] 312 for d in dates: 313 this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days 314 if this_gap>0: 315 gaps.append(this_gap) 316 if len(gaps)==0: 317 return -1 318 else: 319 return min(gaps) 320 321 322 t7 = dataset1[['user_id','coupon_id','date_received']] 323 t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left') 324 t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates 325 t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before) 326 t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after) 327 t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']] 328 329 other_feature1 = pd.merge(t1,t,on='user_id') 330 other_feature1 = pd.merge(other_feature1,t3,on=['user_id','coupon_id']) 331 other_feature1 = pd.merge(other_feature1,t4,on=['user_id','date_received']) 332 other_feature1 = pd.merge(other_feature1,t5,on=['user_id','coupon_id','date_received']) 333 other_feature1 = pd.merge(other_feature1,t7,on=['user_id','coupon_id','date_received']) 334 other_feature1.to_csv('data/other_feature1.csv',index=None) 335 print (other_feature1.shape) 336 337 338 339 340 341 342 ############# coupon related feature ############# 343 """ 344 2.coupon related: 345 discount_rate. discount_man. discount_jian. is_man_jian 346 day_of_week,day_of_month. (date_received) 347 """ 348 def calc_discount_rate(s): 349 s =str(s) 350 s = s.split(':') 351 if len(s)==1: 352 return float(s[0]) 353 else: 354 return 1.0-float(s[1])/float(s[0]) 355 356 def get_discount_man(s): 357 s =str(s) 358 s = s.split(':') 359 if len(s)==1: 360 return 'null' 361 else: 362 return int(s[0]) 363 364 def get_discount_jian(s): 365 s =str(s) 366 s = s.split(':') 367 if len(s)==1: 368 return 'null' 369 else: 370 return int(s[1]) 371 372 def is_man_jian(s): 373 s =str(s) 374 s = s.split(':') 375 if len(s)==1: 376 return 0 377 else: 378 return 1 379 380 #dataset3 381 dataset3['day_of_week'] = dataset3.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1) 382 dataset3['day_of_month'] = dataset3.date_received.astype('str').apply(lambda x:int(x[6:8])) 383 dataset3['days_distance'] = dataset3.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,6,30)).days) 384 dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man) 385 dataset3['discount_jian'] = dataset3.discount_rate.apply(get_discount_jian) 386 dataset3['is_man_jian'] = dataset3.discount_rate.apply(is_man_jian) 387 dataset3['discount_rate'] = dataset3.discount_rate.apply(calc_discount_rate) 388 d = dataset3[['coupon_id']] 389 d['coupon_count'] = 1 390 d = d.groupby('coupon_id').agg('sum').reset_index() 391 dataset3 = pd.merge(dataset3,d,on='coupon_id',how='left') 392 dataset3.to_csv('data/coupon3_feature.csv',index=None) 393 #dataset2 394 dataset2['day_of_week'] = dataset2.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1) 395 dataset2['day_of_month'] = dataset2.date_received.astype('str').apply(lambda x:int(x[6:8])) 396 dataset2['days_distance'] = dataset2.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,5,14)).days) 397 dataset2['discount_man'] = dataset2.discount_rate.apply(get_discount_man) 398 dataset2['discount_jian'] = dataset2.discount_rate.apply(get_discount_jian) 399 dataset2['is_man_jian'] = dataset2.discount_rate.apply(is_man_jian) 400 dataset2['discount_rate'] = dataset2.discount_rate.apply(calc_discount_rate) 401 d = dataset2[['coupon_id']] 402 d['coupon_count'] = 1 403 d = d.groupby('coupon_id').agg('sum').reset_index() 404 dataset2 = pd.merge(dataset2,d,on='coupon_id',how='left') 405 dataset2.to_csv('data/coupon2_feature.csv',index=None) 406 #dataset1 407 dataset1['day_of_week'] = dataset1.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1) 408 dataset1['day_of_month'] = dataset1.date_received.astype('str').apply(lambda x:int(x[6:8])) 409 dataset1['days_distance'] = dataset1.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,4,13)).days) 410 dataset1['discount_man'] = dataset1.discount_rate.apply(get_discount_man) 411 dataset1['discount_jian'] = dataset1.discount_rate.apply(get_discount_jian) 412 dataset1['is_man_jian'] = dataset1.discount_rate.apply(is_man_jian) 413 dataset1['discount_rate'] = dataset1.discount_rate.apply(calc_discount_rate) 414 d = dataset1[['coupon_id']] 415 d['coupon_count'] = 1 416 d = d.groupby('coupon_id').agg('sum').reset_index() 417 dataset1 = pd.merge(dataset1,d,on='coupon_id',how='left') 418 dataset1.to_csv('data/coupon1_feature.csv',index=None) 419 420 421 ############# merchant related feature ############# 422 """ 423 1.merchant related: 424 total_sales. sales_use_coupon. total_coupon 425 coupon_rate = sales_use_coupon/total_sales. 426 transfer_rate = sales_use_coupon/total_coupon. 427 merchant_avg_distance,merchant_min_distance,merchant_max_distance of those use coupon 428 429 """ 430 431 #for dataset3 432 merchant3 = feature3[['merchant_id','coupon_id','distance','date_received','date']] 433 434 t = merchant3[['merchant_id']] 435 t.drop_duplicates(inplace=True) 436 437 t1 = merchant3[merchant3.date!='null'][['merchant_id']] 438 t1['total_sales'] = 1 439 t1 = t1.groupby('merchant_id').agg('sum').reset_index() 440 441 merchant3.coupon_id=merchant3.coupon_id.replace(np.nan,'null') 442 t2 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id']] 443 t2['sales_use_coupon'] = 1 444 t2 = t2.groupby('merchant_id').agg('sum').reset_index() 445 446 t3 = merchant3[merchant3.coupon_id!='null'][['merchant_id']] 447 t3['total_coupon'] = 1 448 t3 = t3.groupby('merchant_id').agg('sum').reset_index() 449 450 t4 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id','distance']] 451 t4.replace('null',-1,inplace=True) 452 #t4.distance = t4.distance.astype('int') 453 t4.replace(-1,np.nan,inplace=True) 454 t5 = t4.groupby('merchant_id').agg('min').reset_index() 455 t5.rename(columns={'distance':'merchant_min_distance'},inplace=True) 456 457 t6 = t4.groupby('merchant_id').agg('max').reset_index() 458 t6.rename(columns={'distance':'merchant_max_distance'},inplace=True) 459 460 t7 = t4.groupby('merchant_id').agg('mean').reset_index() 461 t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True) 462 463 t8 = t4.groupby('merchant_id').agg('median').reset_index() 464 t8.rename(columns={'distance':'merchant_median_distance'},inplace=True) 465 466 merchant3_feature = pd.merge(t,t1,on='merchant_id',how='left') 467 merchant3_feature = pd.merge(merchant3_feature,t2,on='merchant_id',how='left') 468 merchant3_feature = pd.merge(merchant3_feature,t3,on='merchant_id',how='left') 469 merchant3_feature = pd.merge(merchant3_feature,t5,on='merchant_id',how='left') 470 merchant3_feature = pd.merge(merchant3_feature,t6,on='merchant_id',how='left') 471 merchant3_feature = pd.merge(merchant3_feature,t7,on='merchant_id',how='left') 472 merchant3_feature = pd.merge(merchant3_feature,t8,on='merchant_id',how='left') 473 merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0 474 merchant3_feature['merchant_coupon_transfer_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_coupon 475 merchant3_feature['coupon_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_sales 476 merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(np.nan,0) #fillna with 0 477 merchant3_feature.to_csv('data/merchant3_feature.csv',index=None) 478 479 480 #for dataset2 481 merchant2 = feature2[['merchant_id','coupon_id','distance','date_received','date']] 482 483 t = merchant2[['merchant_id']] 484 t.drop_duplicates(inplace=True) 485 486 t1 = merchant2[merchant2.date!='null'][['merchant_id']] 487 t1['total_sales'] = 1 488 t1 = t1.groupby('merchant_id').agg('sum').reset_index() 489 490 merchant2.coupon_id=merchant2.coupon_id.replace(np.nan,'null') 491 t2 = merchant2[(merchant2.date!='null')&(merchant2.coupon_id!='null')][['merchant_id']] 492 t2['sales_use_coupon'] = 1 493 t2 = t2.groupby('merchant_id').agg('sum').reset_index() 494 495 t3 = merchant2[merchant2.coupon_id!='null'][['merchant_id']] 496 t3['total_coupon'] = 1 497 t3 = t3.groupby('merchant_id').agg('sum').reset_index() 498 499 t4 = merchant2[(merchant2.date!='null')&(merchant2.coupon_id!='null')][['merchant_id','distance']] 500 t4.replace('null',-1,inplace=True) 501 #t4.distance = t4.distance.astype('int') 502 t4.replace(-1,np.nan,inplace=True) 503 t5 = t4.groupby('merchant_id').agg('min').reset_index() 504 t5.rename(columns={'distance':'merchant_min_distance'},inplace=True) 505 506 t6 = t4.groupby('merchant_id').agg('max').reset_index() 507 t6.rename(columns={'distance':'merchant_max_distance'},inplace=True) 508 509 t7 = t4.groupby('merchant_id').agg('mean').reset_index() 510 t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True) 511 512 t8 = t4.groupby('merchant_id').agg('median').reset_index() 513 t8.rename(columns={'distance':'merchant_median_distance'},inplace=True) 514 515 merchant2_feature = pd.merge(t,t1,on='merchant_id',how='left') 516 merchant2_feature = pd.merge(merchant2_feature,t2,on='merchant_id',how='left') 517 merchant2_feature = pd.merge(merchant2_feature,t3,on='merchant_id',how='left') 518 merchant2_feature = pd.merge(merchant2_feature,t5,on='merchant_id',how='left') 519 merchant2_feature = pd.merge(merchant2_feature,t6,on='merchant_id',how='left') 520 merchant2_feature = pd.merge(merchant2_feature,t7,on='merchant_id',how='left') 521 merchant2_feature = pd.merge(merchant2_feature,t8,on='merchant_id',how='left') 522 merchant2_feature.sales_use_coupon = merchant2_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0 523 merchant2_feature['merchant_coupon_transfer_rate'] = merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_coupon 524 merchant2_feature['coupon_rate'] = merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_sales 525 merchant2_feature.total_coupon = merchant2_feature.total_coupon.replace(np.nan,0) #fillna with 0 526 merchant2_feature.to_csv('data/merchant2_feature.csv',index=None) 527 528 #for dataset1 529 merchant1 = feature1[['merchant_id','coupon_id','distance','date_received','date']] 530 531 t = merchant1[['merchant_id']] 532 t.drop_duplicates(inplace=True) 533 534 t1 = merchant1[merchant1.date!='null'][['merchant_id']] 535 t1['total_sales'] = 1 536 t1 = t1.groupby('merchant_id').agg('sum').reset_index() 537 538 merchant1.coupon_id=merchant1.coupon_id.replace(np.nan,'null') 539 t2 = merchant1[(merchant1.date!='null')&(merchant1.coupon_id!='null')][['merchant_id']] 540 t2['sales_use_coupon'] = 1 541 t2 = t2.groupby('merchant_id').agg('sum').reset_index() 542 543 t3 = merchant1[merchant1.coupon_id!='null'][['merchant_id']] 544 t3['total_coupon'] = 1 545 t3 = t3.groupby('merchant_id').agg('sum').reset_index() 546 547 t4 = merchant1[(merchant1.date!='null')&(merchant1.coupon_id!='null')][['merchant_id','distance']] 548 t4.replace('null',-1,inplace=True) 549 #t4.distance = t4.distance.astype('int') 550 t4.replace(-1,np.nan,inplace=True) 551 t5 = t4.groupby('merchant_id').agg('min').reset_index() 552 t5.rename(columns={'distance':'merchant_min_distance'},inplace=True) 553 554 t6 = t4.groupby('merchant_id').agg('max').reset_index() 555 t6.rename(columns={'distance':'merchant_max_distance'},inplace=True) 556 557 t7 = t4.groupby('merchant_id').agg('mean').reset_index() 558 t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True) 559 560 t8 = t4.groupby('merchant_id').agg('median').reset_index() 561 t8.rename(columns={'distance':'merchant_median_distance'},inplace=True) 562 563 564 merchant1_feature = pd.merge(t,t1,on='merchant_id',how='left') 565 merchant1_feature = pd.merge(merchant1_feature,t2,on='merchant_id',how='left') 566 merchant1_feature = pd.merge(merchant1_feature,t3,on='merchant_id',how='left') 567 merchant1_feature = pd.merge(merchant1_feature,t5,on='merchant_id',how='left') 568 merchant1_feature = pd.merge(merchant1_feature,t6,on='merchant_id',how='left') 569 merchant1_feature = pd.merge(merchant1_feature,t7,on='merchant_id',how='left') 570 merchant1_feature = pd.merge(merchant1_feature,t8,on='merchant_id',how='left') 571 merchant1_feature.sales_use_coupon = merchant1_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0 572 merchant1_feature['merchant_coupon_transfer_rate'] = merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_coupon 573 merchant1_feature['coupon_rate'] = merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_sales 574 merchant1_feature.total_coupon = merchant1_feature.total_coupon.replace(np.nan,0) #fillna with 0 575 merchant1_feature.to_csv('data/merchant1_feature.csv',index=None) 576 577 578 ############# user related feature ############# 579 """ 580 3.user related: 581 count_merchant. 582 user_avg_distance, user_min_distance,user_max_distance. 583 buy_use_coupon. buy_total. coupon_received. 584 buy_use_coupon/coupon_received. 585 buy_use_coupon/buy_total 586 user_date_datereceived_gap 587 588 589 """ 590 591 def get_user_date_datereceived_gap(s): 592 s = s.split(':') 593 return (date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8])) - date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days 594 595 #for dataset3 596 user3 = feature3[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']] 597 598 t = user3[['user_id']] 599 t.drop_duplicates(inplace=True) 600 601 t1 = user3[user3.date!='null'][['user_id','merchant_id']] 602 t1.drop_duplicates(inplace=True) 603 t1.merchant_id = 1 604 t1 = t1.groupby('user_id').agg('sum').reset_index() 605 t1.rename(columns={'merchant_id':'count_merchant'},inplace=True) 606 user3['coupon_id']=user3['coupon_id'].replace(np.nan,'null') 607 t2 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id','distance']] 608 t2.replace('null',-1,inplace=True) 609 #t2.distance = t2.distance.astype('int') 610 t2.replace(-1,np.nan,inplace=True) 611 t3 = t2.groupby('user_id').agg('min').reset_index() 612 t3.rename(columns={'distance':'user_min_distance'},inplace=True) 613 614 t4 = t2.groupby('user_id').agg('max').reset_index() 615 t4.rename(columns={'distance':'user_max_distance'},inplace=True) 616 617 t5 = t2.groupby('user_id').agg('mean').reset_index() 618 t5.rename(columns={'distance':'user_mean_distance'},inplace=True) 619 620 t6 = t2.groupby('user_id').agg('median').reset_index() 621 t6.rename(columns={'distance':'user_median_distance'},inplace=True) 622 623 t7 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id']] 624 t7['buy_use_coupon'] = 1 625 t7 = t7.groupby('user_id').agg('sum').reset_index() 626 627 t8 = user3[user3.date!='null'][['user_id']] 628 t8['buy_total'] = 1 629 t8 = t8.groupby('user_id').agg('sum').reset_index() 630 631 t9 = user3[user3.coupon_id!='null'][['user_id']] 632 t9['coupon_received'] = 1 633 t9 = t9.groupby('user_id').agg('sum').reset_index() 634 635 t10 = user3[(user3.date_received!='null')&(user3.date!='null')][['user_id','date_received','date']] 636 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received 637 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap) 638 t10 = t10[['user_id','user_date_datereceived_gap']] 639 640 t11 = t10.groupby('user_id').agg('mean').reset_index() 641 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True) 642 t12 = t10.groupby('user_id').agg('min').reset_index() 643 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True) 644 t13 = t10.groupby('user_id').agg('max').reset_index() 645 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True) 646 647 648 user3_feature = pd.merge(t,t1,on='user_id',how='left') 649 user3_feature = pd.merge(user3_feature,t3,on='user_id',how='left') 650 user3_feature = pd.merge(user3_feature,t4,on='user_id',how='left') 651 user3_feature = pd.merge(user3_feature,t5,on='user_id',how='left') 652 user3_feature = pd.merge(user3_feature,t6,on='user_id',how='left') 653 user3_feature = pd.merge(user3_feature,t7,on='user_id',how='left') 654 user3_feature = pd.merge(user3_feature,t8,on='user_id',how='left') 655 user3_feature = pd.merge(user3_feature,t9,on='user_id',how='left') 656 user3_feature = pd.merge(user3_feature,t11,on='user_id',how='left') 657 user3_feature = pd.merge(user3_feature,t12,on='user_id',how='left') 658 user3_feature = pd.merge(user3_feature,t13,on='user_id',how='left') 659 user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan,0) 660 user3_feature.buy_use_coupon = user3_feature.buy_use_coupon.replace(np.nan,0) 661 user3_feature['buy_use_coupon_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.buy_total.astype('float') 662 user3_feature['user_coupon_transfer_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.coupon_received.astype('float') 663 user3_feature.buy_total = user3_feature.buy_total.replace(np.nan,0) 664 user3_feature.coupon_received = user3_feature.coupon_received.replace(np.nan,0) 665 user3_feature.to_csv('data/user3_feature.csv',index=None) 666 667 668 #for dataset2 669 user2 = feature2[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']] 670 671 t = user2[['user_id']] 672 t.drop_duplicates(inplace=True) 673 674 t1 = user2[user2.date!='null'][['user_id','merchant_id']] 675 t1.drop_duplicates(inplace=True) 676 t1.merchant_id = 1 677 t1 = t1.groupby('user_id').agg('sum').reset_index() 678 t1.rename(columns={'merchant_id':'count_merchant'},inplace=True) 679 user2['coupon_id']=user3['coupon_id'].replace(np.nan,'null') 680 t2 = user2[(user2.date!='null')&(user2.coupon_id!='null')][['user_id','distance']] 681 t2.replace('null',-1,inplace=True) 682 #t2.distance = t2.distance.astype('int') 683 t2.replace(-1,np.nan,inplace=True) 684 t3 = t2.groupby('user_id').agg('min').reset_index() 685 t3.rename(columns={'distance':'user_min_distance'},inplace=True) 686 687 t4 = t2.groupby('user_id').agg('max').reset_index() 688 t4.rename(columns={'distance':'user_max_distance'},inplace=True) 689 690 t5 = t2.groupby('user_id').agg('mean').reset_index() 691 t5.rename(columns={'distance':'user_mean_distance'},inplace=True) 692 693 t6 = t2.groupby('user_id').agg('median').reset_index() 694 t6.rename(columns={'distance':'user_median_distance'},inplace=True) 695 696 t7 = user2[(user2.date!='null')&(user2.coupon_id!='null')][['user_id']] 697 t7['buy_use_coupon'] = 1 698 t7 = t7.groupby('user_id').agg('sum').reset_index() 699 700 t8 = user2[user2.date!='null'][['user_id']] 701 t8['buy_total'] = 1 702 t8 = t8.groupby('user_id').agg('sum').reset_index() 703 704 t9 = user2[user2.coupon_id!='null'][['user_id']] 705 t9['coupon_received'] = 1 706 t9 = t9.groupby('user_id').agg('sum').reset_index() 707 708 t10 = user2[(user2.date_received!='null')&(user2.date!='null')][['user_id','date_received','date']] 709 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received 710 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap) 711 t10 = t10[['user_id','user_date_datereceived_gap']] 712 713 t11 = t10.groupby('user_id').agg('mean').reset_index() 714 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True) 715 t12 = t10.groupby('user_id').agg('min').reset_index() 716 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True) 717 t13 = t10.groupby('user_id').agg('max').reset_index() 718 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True) 719 720 user2_feature = pd.merge(t,t1,on='user_id',how='left') 721 user2_feature = pd.merge(user2_feature,t3,on='user_id',how='left') 722 user2_feature = pd.merge(user2_feature,t4,on='user_id',how='left') 723 user2_feature = pd.merge(user2_feature,t5,on='user_id',how='left') 724 user2_feature = pd.merge(user2_feature,t6,on='user_id',how='left') 725 user2_feature = pd.merge(user2_feature,t7,on='user_id',how='left') 726 user2_feature = pd.merge(user2_feature,t8,on='user_id',how='left') 727 user2_feature = pd.merge(user2_feature,t9,on='user_id',how='left') 728 user2_feature = pd.merge(user2_feature,t11,on='user_id',how='left') 729 user2_feature = pd.merge(user2_feature,t12,on='user_id',how='left') 730 user2_feature = pd.merge(user2_feature,t13,on='user_id',how='left') 731 user2_feature.count_merchant = user2_feature.count_merchant.replace(np.nan,0) 732 user2_feature.buy_use_coupon = user2_feature.buy_use_coupon.replace(np.nan,0) 733 user2_feature['buy_use_coupon_rate'] = user2_feature.buy_use_coupon.astype('float') / user2_feature.buy_total.astype('float') 734 user2_feature['user_coupon_transfer_rate'] = user2_feature.buy_use_coupon.astype('float') / user2_feature.coupon_received.astype('float') 735 user2_feature.buy_total = user2_feature.buy_total.replace(np.nan,0) 736 user2_feature.coupon_received = user2_feature.coupon_received.replace(np.nan,0) 737 user2_feature.to_csv('data/user2_feature.csv',index=None) 738 739 740 #for dataset1 741 user1 = feature1[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']] 742 743 t = user1[['user_id']] 744 t.drop_duplicates(inplace=True) 745 746 t1 = user1[user1.date!='null'][['user_id','merchant_id']] 747 t1.drop_duplicates(inplace=True) 748 t1.merchant_id = 1 749 t1 = t1.groupby('user_id').agg('sum').reset_index() 750 t1.rename(columns={'merchant_id':'count_merchant'},inplace=True) 751 user1['coupon_id']=user3['coupon_id'].replace(np.nan,'null') 752 t2 = user1[(user1.date!='null')&(user1.coupon_id!='null')][['user_id','distance']] 753 t2.replace('null',-1,inplace=True) 754 #t2.distance = t2.distance.astype('int') 755 t2.replace(-1,np.nan,inplace=True) 756 t3 = t2.groupby('user_id').agg('min').reset_index() 757 t3.rename(columns={'distance':'user_min_distance'},inplace=True) 758 759 t4 = t2.groupby('user_id').agg('max').reset_index() 760 t4.rename(columns={'distance':'user_max_distance'},inplace=True) 761 762 t5 = t2.groupby('user_id').agg('mean').reset_index() 763 t5.rename(columns={'distance':'user_mean_distance'},inplace=True) 764 765 t6 = t2.groupby('user_id').agg('median').reset_index() 766 t6.rename(columns={'distance':'user_median_distance'},inplace=True) 767 768 t7 = user1[(user1.date!='null')&(user1.coupon_id!='null')][['user_id']] 769 t7['buy_use_coupon'] = 1 770 t7 = t7.groupby('user_id').agg('sum').reset_index() 771 772 t8 = user1[user1.date!='null'][['user_id']] 773 t8['buy_total'] = 1 774 t8 = t8.groupby('user_id').agg('sum').reset_index() 775 776 t9 = user1[user1.coupon_id!='null'][['user_id']] 777 t9['coupon_received'] = 1 778 t9 = t9.groupby('user_id').agg('sum').reset_index() 779 780 t10 = user1[(user1.date_received!='null')&(user1.date!='null')][['user_id','date_received','date']] 781 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received 782 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap) 783 t10 = t10[['user_id','user_date_datereceived_gap']] 784 785 t11 = t10.groupby('user_id').agg('mean').reset_index() 786 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True) 787 t12 = t10.groupby('user_id').agg('min').reset_index() 788 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True) 789 t13 = t10.groupby('user_id').agg('max').reset_index() 790 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True) 791 792 user1_feature = pd.merge(t,t1,on='user_id',how='left') 793 user1_feature = pd.merge(user1_feature,t3,on='user_id',how='left') 794 user1_feature = pd.merge(user1_feature,t4,on='user_id',how='left') 795 user1_feature = pd.merge(user1_feature,t5,on='user_id',how='left') 796 user1_feature = pd.merge(user1_feature,t6,on='user_id',how='left') 797 user1_feature = pd.merge(user1_feature,t7,on='user_id',how='left') 798 user1_feature = pd.merge(user1_feature,t8,on='user_id',how='left') 799 user1_feature = pd.merge(user1_feature,t9,on='user_id',how='left') 800 user1_feature = pd.merge(user1_feature,t11,on='user_id',how='left') 801 user1_feature = pd.merge(user1_feature,t12,on='user_id',how='left') 802 user1_feature = pd.merge(user1_feature,t13,on='user_id',how='left') 803 user1_feature.count_merchant = user1_feature.count_merchant.replace(np.nan,0) 804 user1_feature.buy_use_coupon = user1_feature.buy_use_coupon.replace(np.nan,0) 805 user1_feature['buy_use_coupon_rate'] = user1_feature.buy_use_coupon.astype('float') / user1_feature.buy_total.astype('float') 806 user1_feature['user_coupon_transfer_rate'] = user1_feature.buy_use_coupon.astype('float') / user1_feature.coupon_received.astype('float') 807 user1_feature.buy_total = user1_feature.buy_total.replace(np.nan,0) 808 user1_feature.coupon_received = user1_feature.coupon_received.replace(np.nan,0) 809 user1_feature.to_csv('data/user1_feature.csv',index=None) 810 811 812 813 ################## user_merchant related feature ######################### 814 815 """ 816 4.user_merchant: 817 times_user_buy_merchant_before. 818 """ 819 #for dataset3 820 all_user_merchant = feature3[['user_id','merchant_id']] 821 all_user_merchant.drop_duplicates(inplace=True) 822 823 t = feature3[['user_id','merchant_id','date']] 824 t = t[t.date!='null'][['user_id','merchant_id']] 825 t['user_merchant_buy_total'] = 1 826 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index() 827 t.drop_duplicates(inplace=True) 828 829 t1 = feature3[['user_id','merchant_id','coupon_id']] 830 t1.coupon_id=t1.coupon_id.replace(np.nan,'null') 831 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']] 832 t1['user_merchant_received'] = 1 833 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index() 834 t1.drop_duplicates(inplace=True) 835 836 t2 = feature3[['user_id','merchant_id','date','date_received']] 837 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']] 838 t2['user_merchant_buy_use_coupon'] = 1 839 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index() 840 t2.drop_duplicates(inplace=True) 841 842 t3 = feature3[['user_id','merchant_id']] 843 t3['user_merchant_any'] = 1 844 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index() 845 t3.drop_duplicates(inplace=True) 846 847 t4 = feature3[['user_id','merchant_id','date','coupon_id']] 848 t4.coupon_id=t4.coupon_id.replace(np.nan,'null') 849 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']] 850 t4['user_merchant_buy_common'] = 1 851 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index() 852 t4.drop_duplicates(inplace=True) 853 854 user_merchant3 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left') 855 user_merchant3 = pd.merge(user_merchant3,t1,on=['user_id','merchant_id'],how='left') 856 user_merchant3 = pd.merge(user_merchant3,t2,on=['user_id','merchant_id'],how='left') 857 user_merchant3 = pd.merge(user_merchant3,t3,on=['user_id','merchant_id'],how='left') 858 user_merchant3 = pd.merge(user_merchant3,t4,on=['user_id','merchant_id'],how='left') 859 user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.replace(np.nan,0) 860 user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.replace(np.nan,0) 861 user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_received.astype('float') 862 user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_buy_total.astype('float') 863 user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total.astype('float') / user_merchant3.user_merchant_any.astype('float') 864 user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common.astype('float') / user_merchant3.user_merchant_buy_total.astype('float') 865 user_merchant3.to_csv('data/user_merchant3.csv',index=None) 866 867 #for dataset2 868 all_user_merchant = feature2[['user_id','merchant_id']] 869 all_user_merchant.drop_duplicates(inplace=True) 870 871 t = feature2[['user_id','merchant_id','date']] 872 t = t[t.date!='null'][['user_id','merchant_id']] 873 t['user_merchant_buy_total'] = 1 874 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index() 875 t.drop_duplicates(inplace=True) 876 877 t1 = feature2[['user_id','merchant_id','coupon_id']] 878 t1.coupon_id=t1.coupon_id.replace(np.nan,'null') 879 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']] 880 t1['user_merchant_received'] = 1 881 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index() 882 t1.drop_duplicates(inplace=True) 883 884 t2 = feature2[['user_id','merchant_id','date','date_received']] 885 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']] 886 t2['user_merchant_buy_use_coupon'] = 1 887 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index() 888 t2.drop_duplicates(inplace=True) 889 890 t3 = feature2[['user_id','merchant_id']] 891 t3['user_merchant_any'] = 1 892 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index() 893 t3.drop_duplicates(inplace=True) 894 895 t4 = feature2[['user_id','merchant_id','date','coupon_id']] 896 t4.coupon_id=t4.coupon_id.replace(np.nan,'null') 897 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']] 898 t4['user_merchant_buy_common'] = 1 899 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index() 900 t4.drop_duplicates(inplace=True) 901 902 user_merchant2 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left') 903 user_merchant2 = pd.merge(user_merchant2,t1,on=['user_id','merchant_id'],how='left') 904 user_merchant2 = pd.merge(user_merchant2,t2,on=['user_id','merchant_id'],how='left') 905 user_merchant2 = pd.merge(user_merchant2,t3,on=['user_id','merchant_id'],how='left') 906 user_merchant2 = pd.merge(user_merchant2,t4,on=['user_id','merchant_id'],how='left') 907 user_merchant2.user_merchant_buy_use_coupon = user_merchant2.user_merchant_buy_use_coupon.replace(np.nan,0) 908 user_merchant2.user_merchant_buy_common = user_merchant2.user_merchant_buy_common.replace(np.nan,0) 909 user_merchant2['user_merchant_coupon_transfer_rate'] = user_merchant2.user_merchant_buy_use_coupon.astype('float') / user_merchant2.user_merchant_received.astype('float') 910 user_merchant2['user_merchant_coupon_buy_rate'] = user_merchant2.user_merchant_buy_use_coupon.astype('float') / user_merchant2.user_merchant_buy_total.astype('float') 911 user_merchant2['user_merchant_rate'] = user_merchant2.user_merchant_buy_total.astype('float') / user_merchant2.user_merchant_any.astype('float') 912 user_merchant2['user_merchant_common_buy_rate'] = user_merchant2.user_merchant_buy_common.astype('float') / user_merchant2.user_merchant_buy_total.astype('float') 913 user_merchant2.to_csv('data/user_merchant2.csv',index=None) 914 915 #for dataset2 916 all_user_merchant = feature1[['user_id','merchant_id']] 917 all_user_merchant.drop_duplicates(inplace=True) 918 919 t = feature1[['user_id','merchant_id','date']] 920 t = t[t.date!='null'][['user_id','merchant_id']] 921 t['user_merchant_buy_total'] = 1 922 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index() 923 t.drop_duplicates(inplace=True) 924 925 t1 = feature1[['user_id','merchant_id','coupon_id']] 926 t1.coupon_id=t1.coupon_id.replace(np.nan,'null') 927 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']] 928 t1['user_merchant_received'] = 1 929 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index() 930 t1.drop_duplicates(inplace=True) 931 932 t2 = feature1[['user_id','merchant_id','date','date_received']] 933 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']] 934 t2['user_merchant_buy_use_coupon'] = 1 935 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index() 936 t2.drop_duplicates(inplace=True) 937 938 t3 = feature1[['user_id','merchant_id']] 939 t3['user_merchant_any'] = 1 940 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index() 941 t3.drop_duplicates(inplace=True) 942 943 t4 = feature1[['user_id','merchant_id','date','coupon_id']] 944 t4.coupon_id=t4.coupon_id.replace(np.nan,'null') 945 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']] 946 t4['user_merchant_buy_common'] = 1 947 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index() 948 t4.drop_duplicates(inplace=True) 949 950 user_merchant1 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left') 951 user_merchant1 = pd.merge(user_merchant1,t1,on=['user_id','merchant_id'],how='left') 952 user_merchant1 = pd.merge(user_merchant1,t2,on=['user_id','merchant_id'],how='left') 953 user_merchant1 = pd.merge(user_merchant1,t3,on=['user_id','merchant_id'],how='left') 954 user_merchant1 = pd.merge(user_merchant1,t4,on=['user_id','merchant_id'],how='left') 955 user_merchant1.user_merchant_buy_use_coupon = user_merchant1.user_merchant_buy_use_coupon.replace(np.nan,0) 956 user_merchant1.user_merchant_buy_common = user_merchant1.user_merchant_buy_common.replace(np.nan,0) 957 user_merchant1['user_merchant_coupon_transfer_rate'] = user_merchant1.user_merchant_buy_use_coupon.astype('float') / user_merchant1.user_merchant_received.astype('float') 958 user_merchant1['user_merchant_coupon_buy_rate'] = user_merchant1.user_merchant_buy_use_coupon.astype('float') / user_merchant1.user_merchant_buy_total.astype('float') 959 user_merchant1['user_merchant_rate'] = user_merchant1.user_merchant_buy_total.astype('float') / user_merchant1.user_merchant_any.astype('float') 960 user_merchant1['user_merchant_common_buy_rate'] = user_merchant1.user_merchant_buy_common.astype('float') / user_merchant1.user_merchant_buy_total.astype('float') 961 user_merchant1.to_csv('data/user_merchant1.csv',index=None) 962 963 ################## generate training and testing set ################ 964 def get_label(s): 965 s = s.split(':') 966 if s[0]=='nan': 967 return 0 968 elif (date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8]))-date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days<=15: 969 return 1 970 else: 971 return -1 972 973 974 coupon3 = pd.read_csv('data/coupon3_feature.csv') 975 merchant3 = pd.read_csv('data/merchant3_feature.csv') 976 user3 = pd.read_csv('data/user3_feature.csv') 977 user_merchant3 = pd.read_csv('data/user_merchant3.csv') 978 other_feature3 = pd.read_csv('data/other_feature3.csv') 979 dataset3 = pd.merge(coupon3,merchant3,on='merchant_id',how='left') 980 dataset3 = pd.merge(dataset3,user3,on='user_id',how='left') 981 dataset3 = pd.merge(dataset3,user_merchant3,on=['user_id','merchant_id'],how='left') 982 dataset3 = pd.merge(dataset3,other_feature3,on=['user_id','coupon_id','date_received'],how='left') 983 dataset3.drop_duplicates(inplace=True) 984 print (dataset3.shape) 985 986 dataset3.user_merchant_buy_total = dataset3.user_merchant_buy_total.replace(np.nan,0) 987 dataset3.user_merchant_any = dataset3.user_merchant_any.replace(np.nan,0) 988 dataset3.user_merchant_received = dataset3.user_merchant_received.replace(np.nan,0) 989 dataset3['is_weekend'] = dataset3.day_of_week.apply(lambda x:1 if x in (6,7) else 0) 990 weekday_dummies = pd.get_dummies(dataset3.day_of_week) 991 weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])] 992 dataset3 = pd.concat([dataset3,weekday_dummies],axis=1) 993 dataset3.drop(['merchant_id','day_of_week','coupon_count'],axis=1,inplace=True) 994 dataset3 = dataset3.replace('null',np.nan) 995 dataset3.to_csv('data/dataset3.csv',index=None) 996 997 998 coupon2 = pd.read_csv('data/coupon2_feature.csv') 999 merchant2 = pd.read_csv('data/merchant2_feature.csv') 1000 user2 = pd.read_csv('data/user2_feature.csv') 1001 user_merchant2 = pd.read_csv('data/user_merchant2.csv') 1002 other_feature2 = pd.read_csv('data/other_feature2.csv') 1003 dataset2 = pd.merge(coupon2,merchant2,on='merchant_id',how='left') 1004 dataset2 = pd.merge(dataset2,user2,on='user_id',how='left') 1005 dataset2 = pd.merge(dataset2,user_merchant2,on=['user_id','merchant_id'],how='left') 1006 dataset2 = pd.merge(dataset2,other_feature2,on=['user_id','coupon_id','date_received'],how='left') 1007 dataset2.drop_duplicates(inplace=True) 1008 print( dataset2.shape) 1009 1010 dataset2.user_merchant_buy_total = dataset2.user_merchant_buy_total.replace(np.nan,0) 1011 dataset2.user_merchant_any = dataset2.user_merchant_any.replace(np.nan,0) 1012 dataset2.user_merchant_received = dataset2.user_merchant_received.replace(np.nan,0) 1013 dataset2['is_weekend'] = dataset2.day_of_week.apply(lambda x:1 if x in (6,7) else 0) 1014 weekday_dummies = pd.get_dummies(dataset2.day_of_week) 1015 weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])] 1016 dataset2 = pd.concat([dataset2,weekday_dummies],axis=1) 1017 dataset2['label'] = dataset2.date.astype('str') + ':' + dataset2.date_received.astype('str') 1018 dataset2.label = dataset2.label.apply(get_label) 1019 dataset2.drop(['merchant_id','day_of_week','date','date_received','coupon_id','coupon_count'],axis=1,inplace=True) 1020 dataset2 = dataset2.replace('null',np.nan) 1021 dataset2.to_csv('data/dataset2.csv',index=None) 1022 1023 1024 coupon1 = pd.read_csv('data/coupon1_feature.csv') 1025 merchant1 = pd.read_csv('data/merchant1_feature.csv') 1026 user1 = pd.read_csv('data/user1_feature.csv') 1027 user_merchant1 = pd.read_csv('data/user_merchant1.csv') 1028 other_feature1 = pd.read_csv('data/other_feature1.csv') 1029 dataset1 = pd.merge(coupon1,merchant1,on='merchant_id',how='left') 1030 dataset1 = pd.merge(dataset1,user1,on='user_id',how='left') 1031 dataset1 = pd.merge(dataset1,user_merchant1,on=['user_id','merchant_id'],how='left') 1032 dataset1 = pd.merge(dataset1,other_feature1,on=['user_id','coupon_id','date_received'],how='left') 1033 dataset1.drop_duplicates(inplace=True) 1034 print (dataset1.shape) 1035 1036 dataset1.user_merchant_buy_total = dataset1.user_merchant_buy_total.replace(np.nan,0) 1037 dataset1.user_merchant_any = dataset1.user_merchant_any.replace(np.nan,0) 1038 dataset1.user_merchant_received = dataset1.user_merchant_received.replace(np.nan,0) 1039 dataset1['is_weekend'] = dataset1.day_of_week.apply(lambda x:1 if x in (6,7) else 0) 1040 weekday_dummies = pd.get_dummies(dataset1.day_of_week) 1041 weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])] 1042 dataset1 = pd.concat([dataset1,weekday_dummies],axis=1) 1043 dataset1['label'] = dataset1.date.astype('str') + ':' + dataset1.date_received.astype('str') 1044 dataset1.label = dataset1.label.apply(get_label) 1045 dataset1.drop(['merchant_id','day_of_week','date','date_received','coupon_id','coupon_count'],axis=1,inplace=True) 1046 dataset1 = dataset1.replace('null',np.nan) 1047 dataset1.to_csv('data/dataset1.csv',index=None)