赛题链接: https://tianchi.aliyun.com/competition/entrance/231593/introduction

天池的这个比赛用于学习数据挖掘是一个很好的例子,该题核心是对特征的处理与提取。

从大神的代码里学习了 xgboost 的用法,掌握了特征处理的精髓之后,自己训练并调优了一版 GBDT,效果还不错:

最好成绩101:

阿里云天池新人赛o2o优惠券使用预测-------进阶心得

 

 

数据处理部分:

   1 # -*- coding: utf-8 -*-
   2 """
   3 Created on Wed Dec  4 14:30:36 2019
   4 
   5 @author: wenzhe.tian
   6 
   7 
   8     本赛题提供用户在2016年1月1日至2016年6月30日之间真实线上线下消费行为,预测用户在2016年7月领取优惠券后15天以内的使用情况。
   9 注意: 为了保护用户和商家的隐私,所有数据均作匿名处理,同时采用了有偏采样和必要过滤。
  10 
  11 用户消费日期特征添加:法定节假日?
  12 
  13 """
  14 
  15 import pandas as pd
  16 import numpy as np
  17 from datetime import date
  18 
  19 # 数据导入
  20 off_train = pd.read_csv('data/ccf_offline_stage1_train.csv')
  21 off_train.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']
  22 
  23 off_test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv')
  24 off_test.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received']
  25 
  26 on_train = pd.read_csv('data/ccf_online_stage1_train.csv')
  27 on_train.columns = ['user_id','merchant_id','action','coupon_id','discount_rate','date_received','date']
  28 
  29 # 日期格式处理
  30 off_train['date']= off_train['date'].fillna('null')
  31 off_train['date_received']= off_train['date_received'].fillna('null')
  32 off_train.date= off_train.date.astype('str')
  33 off_train.date_received= off_train.date_received.astype('str')
  34 
  35 # 时间差分 3个预测模型
  36 dataset3 = off_test
  37 feature3 = off_train[((off_train.date>='20160315')&(off_train.date<='20160630'))|((off_train.date=='null')&(off_train.date_received>='20160315')&(off_train.date_received<='20160630'))]
  38 dataset2 = off_train[(off_train.date_received>='20160515')&(off_train.date_received<='20160615')]
  39 feature2 = off_train[(off_train.date>='20160201')&(off_train.date<='20160514')|((off_train.date=='null')&(off_train.date_received>='20160201')&(off_train.date_received<='20160514'))]
  40 dataset1 = off_train[(off_train.date_received>='20160414')&(off_train.date_received<='20160514')]
  41 feature1 = off_train[(off_train.date>='20160101')&(off_train.date<='20160413')|((off_train.date=='null')&(off_train.date_received>='20160101')&(off_train.date_received<='20160413'))]
  42 
  43 
  44 ############# other feature ##################3
  45 """
  46 5. other feature:
  47       this_month_user_receive_all_coupon_count
  48       this_month_user_receive_same_coupon_count
  49       this_month_user_receive_same_coupon_lastone
  50       this_month_user_receive_same_coupon_firstone
  51       this_day_user_receive_all_coupon_count
  52       this_day_user_receive_same_coupon_count
  53       day_gap_before, day_gap_after  (receive the same coupon)
  54 """
  55 
  56 #for dataset3 (标签) 需写出目的
  57 # 按照用户id 进行重组统计齐获得优惠券的数目
  58 t = dataset3[['user_id']]
  59 t['this_month_user_receive_all_coupon_count'] = 1
  60 t = t.groupby('user_id').agg('sum').reset_index()
  61 
  62 # 按照用户id和优惠券id(类型) 进行重组统计齐获得相同优惠券的数目
  63 t1 = dataset3[['user_id','coupon_id']]
  64 t1['this_month_user_receive_same_coupon_count'] = 1
  65 t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()
  66 
  67 #按照两个id 对收到优惠券的时间统计并按照:隔开
  68 t2 = dataset3[['user_id','coupon_id','date_received']]
  69 t2.date_received = t2.date_received.astype('str')
  70 t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
  71 #统计收到的优惠券数量
  72 t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
  73 #只要发了2张及以上的数据
  74 t2 = t2[t2.receive_number>1]
  75 #添加最早和最晚获取优惠券的时间数据  并只取这两个数据和两个id作为t2
  76 t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')]))
  77 t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')]))
  78 t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]
  79 
  80 # t2按照两个id
  81 t3 = dataset3[['user_id','coupon_id','date_received']]
  82 t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
  83 
  84 t3.max_date_received=t3.max_date_received.astype('float')
  85 t3.date_received=t3.date_received.astype('float')
  86 t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received
  87 t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received - t3.min_date_received
  88 def is_firstlastone(x):
  89     if x==0:
  90         return 1
  91     elif x>0:
  92         return 0
  93     else:
  94         return -1 #those only receive once
  95 
  96 t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
  97 t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
  98 t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]
  99 
 100 t4 = dataset3[['user_id','date_received']]
 101 t4['this_day_user_receive_all_coupon_count'] = 1
 102 t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()
 103 
 104 t5 = dataset3[['user_id','coupon_id','date_received']]
 105 t5['this_day_user_receive_same_coupon_count'] = 1
 106 t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()
 107 
 108 t6 = dataset3[['user_id','coupon_id','date_received']]
 109 t6.date_received = t6.date_received.astype('str')
 110 t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
 111 t6.rename(columns={'date_received':'dates'},inplace=True)
 112 
 113 def get_day_gap_before(s):
 114     date_received,dates = s.split('-')
 115     dates = dates.split(':')
 116     gaps = []
 117     for d in dates:
 118         this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days
 119         if this_gap>0:
 120             gaps.append(this_gap)
 121     if len(gaps)==0:
 122         return -1
 123     else:
 124         return min(gaps)
 125 
 126 def get_day_gap_after(s):
 127     date_received,dates = s.split('-')
 128     dates = dates.split(':')
 129     gaps = []
 130     for d in dates:
 131         this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
 132         if this_gap>0:
 133             gaps.append(this_gap)
 134     if len(gaps)==0:
 135         return -1
 136     else:
 137         return min(gaps)
 138 
 139 
 140 t7 = dataset3[['user_id','coupon_id','date_received']]
 141 t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
 142 t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
 143 t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
 144 t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
 145 t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]
 146 
 147 other_feature3 = pd.merge(t1,t,on='user_id')
 148 other_feature3 = pd.merge(other_feature3,t3,on=['user_id','coupon_id'])
 149 other_feature3 = pd.merge(other_feature3,t4,on=['user_id','date_received'])
 150 other_feature3 = pd.merge(other_feature3,t5,on=['user_id','coupon_id','date_received'])
 151 other_feature3 = pd.merge(other_feature3,t7,on=['user_id','coupon_id','date_received'])
 152 other_feature3.to_csv('data/other_feature3.csv',index=None)
 153 print (other_feature3.shape)
 154 
 155 
 156 
 157 #for dataset2
 158 t = dataset2[['user_id']]
 159 t['this_month_user_receive_all_coupon_count'] = 1
 160 t = t.groupby('user_id').agg('sum').reset_index()
 161 
 162 t1 = dataset2[['user_id','coupon_id']]
 163 t1['this_month_user_receive_same_coupon_count'] = 1
 164 t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()
 165 
 166 t2 = dataset2[['user_id','coupon_id','date_received']]
 167 t2.date_received = t2.date_received.astype('str')
 168 t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
 169 t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
 170 t2 = t2[t2.receive_number>1]
 171 t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')]))
 172 t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')]))
 173 t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]
 174 
 175 t3 = dataset2[['user_id','coupon_id','date_received']]
 176 t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
 177 t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype('float').astype('int')
 178 t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype('float').astype('int') - t3.min_date_received
 179 def is_firstlastone(x):
 180     if x==0:
 181         return 1
 182     elif x>0:
 183         return 0
 184     else:
 185         return -1 #those only receive once
 186 
 187 t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
 188 t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
 189 t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]
 190 
 191 t4 = dataset2[['user_id','date_received']]
 192 t4['this_day_user_receive_all_coupon_count'] = 1
 193 t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()
 194 
 195 t5 = dataset2[['user_id','coupon_id','date_received']]
 196 t5['this_day_user_receive_same_coupon_count'] = 1
 197 t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()
 198 
 199 t6 = dataset2[['user_id','coupon_id','date_received']]
 200 t6.date_received = t6.date_received.astype('str')
 201 t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
 202 t6.rename(columns={'date_received':'dates'},inplace=True)
 203 
 204 def get_day_gap_before(s):
 205     date_received,dates = s.split('-')
 206     dates = dates.split(':')
 207     gaps = []
 208     for d in dates:
 209         this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days
 210         if this_gap>0:
 211             gaps.append(this_gap)
 212     if len(gaps)==0:
 213         return -1
 214     else:
 215         return min(gaps)
 216 
 217 def get_day_gap_after(s):
 218     date_received,dates = s.split('-')
 219     dates = dates.split(':')
 220     gaps = []
 221     for d in dates:
 222         this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
 223         if this_gap>0:
 224             gaps.append(this_gap)
 225     if len(gaps)==0:
 226         return -1
 227     else:
 228         return min(gaps)
 229 
 230 
 231 t7 = dataset2[['user_id','coupon_id','date_received']]
 232 t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
 233 t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
 234 t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
 235 t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
 236 t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]
 237 
 238 other_feature2 = pd.merge(t1,t,on='user_id')
 239 other_feature2 = pd.merge(other_feature2,t3,on=['user_id','coupon_id'])
 240 other_feature2 = pd.merge(other_feature2,t4,on=['user_id','date_received'])
 241 other_feature2 = pd.merge(other_feature2,t5,on=['user_id','coupon_id','date_received'])
 242 other_feature2 = pd.merge(other_feature2,t7,on=['user_id','coupon_id','date_received'])
 243 other_feature2.to_csv('data/other_feature2.csv',index=None)
 244 print (other_feature2.shape)
 245 
 246 
 247 
 248 #for dataset1
 249 t = dataset1[['user_id']]
 250 t['this_month_user_receive_all_coupon_count'] = 1
 251 t = t.groupby('user_id').agg('sum').reset_index()
 252 
 253 t1 = dataset1[['user_id','coupon_id']]
 254 t1['this_month_user_receive_same_coupon_count'] = 1
 255 t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()
 256 
 257 t2 = dataset1[['user_id','coupon_id','date_received']]
 258 t2.date_received = t2.date_received.astype('str')
 259 t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
 260 t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
 261 t2 = t2[t2.receive_number>1]
 262 t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')]))
 263 t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')]))
 264 t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]
 265 
 266 t3 = dataset1[['user_id','coupon_id','date_received']]
 267 t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
 268 t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype('float').astype('int')
 269 t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype('float').astype('int') - t3.min_date_received
 270 def is_firstlastone(x):
 271     if x==0:
 272         return 1
 273     elif x>0:
 274         return 0
 275     else:
 276         return -1 #those only receive once
 277 
 278 t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
 279 t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
 280 t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]
 281 
 282 t4 = dataset1[['user_id','date_received']]
 283 t4['this_day_user_receive_all_coupon_count'] = 1
 284 t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()
 285 
 286 t5 = dataset1[['user_id','coupon_id','date_received']]
 287 t5['this_day_user_receive_same_coupon_count'] = 1
 288 t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()
 289 
 290 t6 = dataset1[['user_id','coupon_id','date_received']]
 291 t6.date_received = t6.date_received.astype('str')
 292 t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
 293 t6.rename(columns={'date_received':'dates'},inplace=True)
 294 
 295 def get_day_gap_before(s):
 296     date_received,dates = s.split('-')
 297     dates = dates.split(':')
 298     gaps = []
 299     for d in dates:
 300         this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days
 301         if this_gap>0:
 302             gaps.append(this_gap)
 303     if len(gaps)==0:
 304         return -1
 305     else:
 306         return min(gaps)
 307 
 308 def get_day_gap_after(s):
 309     date_received,dates = s.split('-')
 310     dates = dates.split(':')
 311     gaps = []
 312     for d in dates:
 313         this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
 314         if this_gap>0:
 315             gaps.append(this_gap)
 316     if len(gaps)==0:
 317         return -1
 318     else:
 319         return min(gaps)
 320 
 321 
 322 t7 = dataset1[['user_id','coupon_id','date_received']]
 323 t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
 324 t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
 325 t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
 326 t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
 327 t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]
 328 
 329 other_feature1 = pd.merge(t1,t,on='user_id')
 330 other_feature1 = pd.merge(other_feature1,t3,on=['user_id','coupon_id'])
 331 other_feature1 = pd.merge(other_feature1,t4,on=['user_id','date_received'])
 332 other_feature1 = pd.merge(other_feature1,t5,on=['user_id','coupon_id','date_received'])
 333 other_feature1 = pd.merge(other_feature1,t7,on=['user_id','coupon_id','date_received'])
 334 other_feature1.to_csv('data/other_feature1.csv',index=None)
 335 print (other_feature1.shape)
 336 
 337 
 338 
 339 
 340 
 341 
############# coupon related feature   #############
"""
2.coupon related:
      discount_rate. discount_man. discount_jian. is_man_jian
      day_of_week,day_of_month. (date_received)
"""
 348 def calc_discount_rate(s):
 349     s =str(s)
 350     s = s.split(':')
 351     if len(s)==1:
 352         return float(s[0])
 353     else:
 354         return 1.0-float(s[1])/float(s[0])
 355 
 356 def get_discount_man(s):
 357     s =str(s)
 358     s = s.split(':')
 359     if len(s)==1:
 360         return 'null'
 361     else:
 362         return int(s[0])
 363 
 364 def get_discount_jian(s):
 365     s =str(s)
 366     s = s.split(':')
 367     if len(s)==1:
 368         return 'null'
 369     else:
 370         return int(s[1])
 371 
 372 def is_man_jian(s):
 373     s =str(s)
 374     s = s.split(':')
 375     if len(s)==1:
 376         return 0
 377     else:
 378         return 1
 379 
 380 #dataset3
 381 dataset3['day_of_week'] = dataset3.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
 382 dataset3['day_of_month'] = dataset3.date_received.astype('str').apply(lambda x:int(x[6:8]))
 383 dataset3['days_distance'] = dataset3.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,6,30)).days)
 384 dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man)
 385 dataset3['discount_jian'] = dataset3.discount_rate.apply(get_discount_jian)
 386 dataset3['is_man_jian'] = dataset3.discount_rate.apply(is_man_jian)
 387 dataset3['discount_rate'] = dataset3.discount_rate.apply(calc_discount_rate)
 388 d = dataset3[['coupon_id']]
 389 d['coupon_count'] = 1
 390 d = d.groupby('coupon_id').agg('sum').reset_index()
 391 dataset3 = pd.merge(dataset3,d,on='coupon_id',how='left')
 392 dataset3.to_csv('data/coupon3_feature.csv',index=None)
 393 #dataset2
 394 dataset2['day_of_week'] = dataset2.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
 395 dataset2['day_of_month'] = dataset2.date_received.astype('str').apply(lambda x:int(x[6:8]))
 396 dataset2['days_distance'] = dataset2.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,5,14)).days)
 397 dataset2['discount_man'] = dataset2.discount_rate.apply(get_discount_man)
 398 dataset2['discount_jian'] = dataset2.discount_rate.apply(get_discount_jian)
 399 dataset2['is_man_jian'] = dataset2.discount_rate.apply(is_man_jian)
 400 dataset2['discount_rate'] = dataset2.discount_rate.apply(calc_discount_rate)
 401 d = dataset2[['coupon_id']]
 402 d['coupon_count'] = 1
 403 d = d.groupby('coupon_id').agg('sum').reset_index()
 404 dataset2 = pd.merge(dataset2,d,on='coupon_id',how='left')
 405 dataset2.to_csv('data/coupon2_feature.csv',index=None)
 406 #dataset1
 407 dataset1['day_of_week'] = dataset1.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
 408 dataset1['day_of_month'] = dataset1.date_received.astype('str').apply(lambda x:int(x[6:8]))
 409 dataset1['days_distance'] = dataset1.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,4,13)).days)
 410 dataset1['discount_man'] = dataset1.discount_rate.apply(get_discount_man)
 411 dataset1['discount_jian'] = dataset1.discount_rate.apply(get_discount_jian)
 412 dataset1['is_man_jian'] = dataset1.discount_rate.apply(is_man_jian)
 413 dataset1['discount_rate'] = dataset1.discount_rate.apply(calc_discount_rate)
 414 d = dataset1[['coupon_id']]
 415 d['coupon_count'] = 1
 416 d = d.groupby('coupon_id').agg('sum').reset_index()
 417 dataset1 = pd.merge(dataset1,d,on='coupon_id',how='left')
 418 dataset1.to_csv('data/coupon1_feature.csv',index=None)
 419 
 420 
 421 ############# merchant related feature   #############
 422 """
 423 1.merchant related:
 424       total_sales. sales_use_coupon.  total_coupon
 425       coupon_rate = sales_use_coupon/total_sales.
 426       transfer_rate = sales_use_coupon/total_coupon.
 427       merchant_avg_distance,merchant_min_distance,merchant_max_distance of those use coupon
 428 
 429 """
 430 
 431 #for dataset3
 432 merchant3 = feature3[['merchant_id','coupon_id','distance','date_received','date']]
 433 
 434 t = merchant3[['merchant_id']]
 435 t.drop_duplicates(inplace=True)
 436 
 437 t1 = merchant3[merchant3.date!='null'][['merchant_id']]
 438 t1['total_sales'] = 1
 439 t1 = t1.groupby('merchant_id').agg('sum').reset_index()
 440 
 441 merchant3.coupon_id=merchant3.coupon_id.replace(np.nan,'null')
 442 t2 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id']]
 443 t2['sales_use_coupon'] = 1
 444 t2 = t2.groupby('merchant_id').agg('sum').reset_index()
 445 
 446 t3 = merchant3[merchant3.coupon_id!='null'][['merchant_id']]
 447 t3['total_coupon'] = 1
 448 t3 = t3.groupby('merchant_id').agg('sum').reset_index()
 449 
 450 t4 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id','distance']]
 451 t4.replace('null',-1,inplace=True)
 452 #t4.distance = t4.distance.astype('int')
 453 t4.replace(-1,np.nan,inplace=True)
 454 t5 = t4.groupby('merchant_id').agg('min').reset_index()
 455 t5.rename(columns={'distance':'merchant_min_distance'},inplace=True)
 456 
 457 t6 = t4.groupby('merchant_id').agg('max').reset_index()
 458 t6.rename(columns={'distance':'merchant_max_distance'},inplace=True)
 459 
 460 t7 = t4.groupby('merchant_id').agg('mean').reset_index()
 461 t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True)
 462 
 463 t8 = t4.groupby('merchant_id').agg('median').reset_index()
 464 t8.rename(columns={'distance':'merchant_median_distance'},inplace=True)
 465 
 466 merchant3_feature = pd.merge(t,t1,on='merchant_id',how='left')
 467 merchant3_feature = pd.merge(merchant3_feature,t2,on='merchant_id',how='left')
 468 merchant3_feature = pd.merge(merchant3_feature,t3,on='merchant_id',how='left')
 469 merchant3_feature = pd.merge(merchant3_feature,t5,on='merchant_id',how='left')
 470 merchant3_feature = pd.merge(merchant3_feature,t6,on='merchant_id',how='left')
 471 merchant3_feature = pd.merge(merchant3_feature,t7,on='merchant_id',how='left')
 472 merchant3_feature = pd.merge(merchant3_feature,t8,on='merchant_id',how='left')
 473 merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0
 474 merchant3_feature['merchant_coupon_transfer_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_coupon
 475 merchant3_feature['coupon_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_sales
 476 merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(np.nan,0) #fillna with 0
 477 merchant3_feature.to_csv('data/merchant3_feature.csv',index=None)
 478 
 479 
 480 #for dataset2
 481 merchant2 = feature2[['merchant_id','coupon_id','distance','date_received','date']]
 482 
 483 t = merchant2[['merchant_id']]
 484 t.drop_duplicates(inplace=True)
 485 
 486 t1 = merchant2[merchant2.date!='null'][['merchant_id']]
 487 t1['total_sales'] = 1
 488 t1 = t1.groupby('merchant_id').agg('sum').reset_index()
 489 
 490 merchant2.coupon_id=merchant2.coupon_id.replace(np.nan,'null')
 491 t2 = merchant2[(merchant2.date!='null')&(merchant2.coupon_id!='null')][['merchant_id']]
 492 t2['sales_use_coupon'] = 1
 493 t2 = t2.groupby('merchant_id').agg('sum').reset_index()
 494 
 495 t3 = merchant2[merchant2.coupon_id!='null'][['merchant_id']]
 496 t3['total_coupon'] = 1
 497 t3 = t3.groupby('merchant_id').agg('sum').reset_index()
 498 
 499 t4 = merchant2[(merchant2.date!='null')&(merchant2.coupon_id!='null')][['merchant_id','distance']]
 500 t4.replace('null',-1,inplace=True)
 501 #t4.distance = t4.distance.astype('int')
 502 t4.replace(-1,np.nan,inplace=True)
 503 t5 = t4.groupby('merchant_id').agg('min').reset_index()
 504 t5.rename(columns={'distance':'merchant_min_distance'},inplace=True)
 505 
 506 t6 = t4.groupby('merchant_id').agg('max').reset_index()
 507 t6.rename(columns={'distance':'merchant_max_distance'},inplace=True)
 508 
 509 t7 = t4.groupby('merchant_id').agg('mean').reset_index()
 510 t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True)
 511 
 512 t8 = t4.groupby('merchant_id').agg('median').reset_index()
 513 t8.rename(columns={'distance':'merchant_median_distance'},inplace=True)
 514 
 515 merchant2_feature = pd.merge(t,t1,on='merchant_id',how='left')
 516 merchant2_feature = pd.merge(merchant2_feature,t2,on='merchant_id',how='left')
 517 merchant2_feature = pd.merge(merchant2_feature,t3,on='merchant_id',how='left')
 518 merchant2_feature = pd.merge(merchant2_feature,t5,on='merchant_id',how='left')
 519 merchant2_feature = pd.merge(merchant2_feature,t6,on='merchant_id',how='left')
 520 merchant2_feature = pd.merge(merchant2_feature,t7,on='merchant_id',how='left')
 521 merchant2_feature = pd.merge(merchant2_feature,t8,on='merchant_id',how='left')
 522 merchant2_feature.sales_use_coupon = merchant2_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0
 523 merchant2_feature['merchant_coupon_transfer_rate'] = merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_coupon
 524 merchant2_feature['coupon_rate'] = merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_sales
 525 merchant2_feature.total_coupon = merchant2_feature.total_coupon.replace(np.nan,0) #fillna with 0
 526 merchant2_feature.to_csv('data/merchant2_feature.csv',index=None)
 527 
 528 #for dataset1
 529 merchant1 = feature1[['merchant_id','coupon_id','distance','date_received','date']]
 530 
 531 t = merchant1[['merchant_id']]
 532 t.drop_duplicates(inplace=True)
 533 
 534 t1 = merchant1[merchant1.date!='null'][['merchant_id']]
 535 t1['total_sales'] = 1
 536 t1 = t1.groupby('merchant_id').agg('sum').reset_index()
 537 
 538 merchant1.coupon_id=merchant1.coupon_id.replace(np.nan,'null')
 539 t2 = merchant1[(merchant1.date!='null')&(merchant1.coupon_id!='null')][['merchant_id']]
 540 t2['sales_use_coupon'] = 1
 541 t2 = t2.groupby('merchant_id').agg('sum').reset_index()
 542 
 543 t3 = merchant1[merchant1.coupon_id!='null'][['merchant_id']]
 544 t3['total_coupon'] = 1
 545 t3 = t3.groupby('merchant_id').agg('sum').reset_index()
 546 
 547 t4 = merchant1[(merchant1.date!='null')&(merchant1.coupon_id!='null')][['merchant_id','distance']]
 548 t4.replace('null',-1,inplace=True)
 549 #t4.distance = t4.distance.astype('int')
 550 t4.replace(-1,np.nan,inplace=True)
 551 t5 = t4.groupby('merchant_id').agg('min').reset_index()
 552 t5.rename(columns={'distance':'merchant_min_distance'},inplace=True)
 553 
 554 t6 = t4.groupby('merchant_id').agg('max').reset_index()
 555 t6.rename(columns={'distance':'merchant_max_distance'},inplace=True)
 556 
 557 t7 = t4.groupby('merchant_id').agg('mean').reset_index()
 558 t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True)
 559 
 560 t8 = t4.groupby('merchant_id').agg('median').reset_index()
 561 t8.rename(columns={'distance':'merchant_median_distance'},inplace=True)
 562 
 563 
 564 merchant1_feature = pd.merge(t,t1,on='merchant_id',how='left')
 565 merchant1_feature = pd.merge(merchant1_feature,t2,on='merchant_id',how='left')
 566 merchant1_feature = pd.merge(merchant1_feature,t3,on='merchant_id',how='left')
 567 merchant1_feature = pd.merge(merchant1_feature,t5,on='merchant_id',how='left')
 568 merchant1_feature = pd.merge(merchant1_feature,t6,on='merchant_id',how='left')
 569 merchant1_feature = pd.merge(merchant1_feature,t7,on='merchant_id',how='left')
 570 merchant1_feature = pd.merge(merchant1_feature,t8,on='merchant_id',how='left')
 571 merchant1_feature.sales_use_coupon = merchant1_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0
 572 merchant1_feature['merchant_coupon_transfer_rate'] = merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_coupon
 573 merchant1_feature['coupon_rate'] = merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_sales
 574 merchant1_feature.total_coupon = merchant1_feature.total_coupon.replace(np.nan,0) #fillna with 0
 575 merchant1_feature.to_csv('data/merchant1_feature.csv',index=None)
 576 
 577 
############# user related feature   #############
"""
3.user related:
      count_merchant.
      user_avg_distance, user_min_distance,user_max_distance.
      buy_use_coupon. buy_total. coupon_received.
      buy_use_coupon/coupon_received.
      buy_use_coupon/buy_total
      user_date_datereceived_gap
"""

 591 def get_user_date_datereceived_gap(s):
 592     s = s.split(':')
 593     return (date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8])) - date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days
 594 
 595 #for dataset3
 596 user3 = feature3[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
 597 
 598 t = user3[['user_id']]
 599 t.drop_duplicates(inplace=True)
 600 
 601 t1 = user3[user3.date!='null'][['user_id','merchant_id']]
 602 t1.drop_duplicates(inplace=True)
 603 t1.merchant_id = 1
 604 t1 = t1.groupby('user_id').agg('sum').reset_index()
 605 t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
 606 user3['coupon_id']=user3['coupon_id'].replace(np.nan,'null')
 607 t2 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id','distance']]
 608 t2.replace('null',-1,inplace=True)
 609 #t2.distance = t2.distance.astype('int')
 610 t2.replace(-1,np.nan,inplace=True)
 611 t3 = t2.groupby('user_id').agg('min').reset_index()
 612 t3.rename(columns={'distance':'user_min_distance'},inplace=True)
 613 
 614 t4 = t2.groupby('user_id').agg('max').reset_index()
 615 t4.rename(columns={'distance':'user_max_distance'},inplace=True)
 616 
 617 t5 = t2.groupby('user_id').agg('mean').reset_index()
 618 t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
 619 
 620 t6 = t2.groupby('user_id').agg('median').reset_index()
 621 t6.rename(columns={'distance':'user_median_distance'},inplace=True)
 622 
 623 t7 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id']]
 624 t7['buy_use_coupon'] = 1
 625 t7 = t7.groupby('user_id').agg('sum').reset_index()
 626 
 627 t8 = user3[user3.date!='null'][['user_id']]
 628 t8['buy_total'] = 1
 629 t8 = t8.groupby('user_id').agg('sum').reset_index()
 630 
 631 t9 = user3[user3.coupon_id!='null'][['user_id']]
 632 t9['coupon_received'] = 1
 633 t9 = t9.groupby('user_id').agg('sum').reset_index()
 634 
 635 t10 = user3[(user3.date_received!='null')&(user3.date!='null')][['user_id','date_received','date']]
 636 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
 637 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
 638 t10 = t10[['user_id','user_date_datereceived_gap']]
 639 
 640 t11 = t10.groupby('user_id').agg('mean').reset_index()
 641 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
 642 t12 = t10.groupby('user_id').agg('min').reset_index()
 643 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
 644 t13 = t10.groupby('user_id').agg('max').reset_index()
 645 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
 646 
 647 
 648 user3_feature = pd.merge(t,t1,on='user_id',how='left')
 649 user3_feature = pd.merge(user3_feature,t3,on='user_id',how='left')
 650 user3_feature = pd.merge(user3_feature,t4,on='user_id',how='left')
 651 user3_feature = pd.merge(user3_feature,t5,on='user_id',how='left')
 652 user3_feature = pd.merge(user3_feature,t6,on='user_id',how='left')
 653 user3_feature = pd.merge(user3_feature,t7,on='user_id',how='left')
 654 user3_feature = pd.merge(user3_feature,t8,on='user_id',how='left')
 655 user3_feature = pd.merge(user3_feature,t9,on='user_id',how='left')
 656 user3_feature = pd.merge(user3_feature,t11,on='user_id',how='left')
 657 user3_feature = pd.merge(user3_feature,t12,on='user_id',how='left')
 658 user3_feature = pd.merge(user3_feature,t13,on='user_id',how='left')
 659 user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan,0)
 660 user3_feature.buy_use_coupon = user3_feature.buy_use_coupon.replace(np.nan,0)
 661 user3_feature['buy_use_coupon_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.buy_total.astype('float')
 662 user3_feature['user_coupon_transfer_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.coupon_received.astype('float')
 663 user3_feature.buy_total = user3_feature.buy_total.replace(np.nan,0)
 664 user3_feature.coupon_received = user3_feature.coupon_received.replace(np.nan,0)
 665 user3_feature.to_csv('data/user3_feature.csv',index=None)
 666 
 667 
 668 #for dataset2
 669 user2 = feature2[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
 670 
 671 t = user2[['user_id']]
 672 t.drop_duplicates(inplace=True)
 673 
 674 t1 = user2[user2.date!='null'][['user_id','merchant_id']]
 675 t1.drop_duplicates(inplace=True)
 676 t1.merchant_id = 1
 677 t1 = t1.groupby('user_id').agg('sum').reset_index()
 678 t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
 679 user2['coupon_id']=user3['coupon_id'].replace(np.nan,'null')
 680 t2 = user2[(user2.date!='null')&(user2.coupon_id!='null')][['user_id','distance']]
 681 t2.replace('null',-1,inplace=True)
 682 #t2.distance = t2.distance.astype('int')
 683 t2.replace(-1,np.nan,inplace=True)
 684 t3 = t2.groupby('user_id').agg('min').reset_index()
 685 t3.rename(columns={'distance':'user_min_distance'},inplace=True)
 686 
 687 t4 = t2.groupby('user_id').agg('max').reset_index()
 688 t4.rename(columns={'distance':'user_max_distance'},inplace=True)
 689 
 690 t5 = t2.groupby('user_id').agg('mean').reset_index()
 691 t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
 692 
 693 t6 = t2.groupby('user_id').agg('median').reset_index()
 694 t6.rename(columns={'distance':'user_median_distance'},inplace=True)
 695 
 696 t7 = user2[(user2.date!='null')&(user2.coupon_id!='null')][['user_id']]
 697 t7['buy_use_coupon'] = 1
 698 t7 = t7.groupby('user_id').agg('sum').reset_index()
 699 
 700 t8 = user2[user2.date!='null'][['user_id']]
 701 t8['buy_total'] = 1
 702 t8 = t8.groupby('user_id').agg('sum').reset_index()
 703 
 704 t9 = user2[user2.coupon_id!='null'][['user_id']]
 705 t9['coupon_received'] = 1
 706 t9 = t9.groupby('user_id').agg('sum').reset_index()
 707 
 708 t10 = user2[(user2.date_received!='null')&(user2.date!='null')][['user_id','date_received','date']]
 709 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
 710 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
 711 t10 = t10[['user_id','user_date_datereceived_gap']]
 712 
 713 t11 = t10.groupby('user_id').agg('mean').reset_index()
 714 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
 715 t12 = t10.groupby('user_id').agg('min').reset_index()
 716 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
 717 t13 = t10.groupby('user_id').agg('max').reset_index()
 718 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
 719 
 720 user2_feature = pd.merge(t,t1,on='user_id',how='left')
 721 user2_feature = pd.merge(user2_feature,t3,on='user_id',how='left')
 722 user2_feature = pd.merge(user2_feature,t4,on='user_id',how='left')
 723 user2_feature = pd.merge(user2_feature,t5,on='user_id',how='left')
 724 user2_feature = pd.merge(user2_feature,t6,on='user_id',how='left')
 725 user2_feature = pd.merge(user2_feature,t7,on='user_id',how='left')
 726 user2_feature = pd.merge(user2_feature,t8,on='user_id',how='left')
 727 user2_feature = pd.merge(user2_feature,t9,on='user_id',how='left')
 728 user2_feature = pd.merge(user2_feature,t11,on='user_id',how='left')
 729 user2_feature = pd.merge(user2_feature,t12,on='user_id',how='left')
 730 user2_feature = pd.merge(user2_feature,t13,on='user_id',how='left')
 731 user2_feature.count_merchant = user2_feature.count_merchant.replace(np.nan,0)
 732 user2_feature.buy_use_coupon = user2_feature.buy_use_coupon.replace(np.nan,0)
 733 user2_feature['buy_use_coupon_rate'] = user2_feature.buy_use_coupon.astype('float') / user2_feature.buy_total.astype('float')
 734 user2_feature['user_coupon_transfer_rate'] = user2_feature.buy_use_coupon.astype('float') / user2_feature.coupon_received.astype('float')
 735 user2_feature.buy_total = user2_feature.buy_total.replace(np.nan,0)
 736 user2_feature.coupon_received = user2_feature.coupon_received.replace(np.nan,0)
 737 user2_feature.to_csv('data/user2_feature.csv',index=None)
 738 
 739 
 740 #for dataset1
 741 user1 = feature1[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
 742 
 743 t = user1[['user_id']]
 744 t.drop_duplicates(inplace=True)
 745 
 746 t1 = user1[user1.date!='null'][['user_id','merchant_id']]
 747 t1.drop_duplicates(inplace=True)
 748 t1.merchant_id = 1
 749 t1 = t1.groupby('user_id').agg('sum').reset_index()
 750 t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
 751 user1['coupon_id']=user3['coupon_id'].replace(np.nan,'null')
 752 t2 = user1[(user1.date!='null')&(user1.coupon_id!='null')][['user_id','distance']]
 753 t2.replace('null',-1,inplace=True)
 754 #t2.distance = t2.distance.astype('int')
 755 t2.replace(-1,np.nan,inplace=True)
 756 t3 = t2.groupby('user_id').agg('min').reset_index()
 757 t3.rename(columns={'distance':'user_min_distance'},inplace=True)
 758 
 759 t4 = t2.groupby('user_id').agg('max').reset_index()
 760 t4.rename(columns={'distance':'user_max_distance'},inplace=True)
 761 
 762 t5 = t2.groupby('user_id').agg('mean').reset_index()
 763 t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
 764 
 765 t6 = t2.groupby('user_id').agg('median').reset_index()
 766 t6.rename(columns={'distance':'user_median_distance'},inplace=True)
 767 
 768 t7 = user1[(user1.date!='null')&(user1.coupon_id!='null')][['user_id']]
 769 t7['buy_use_coupon'] = 1
 770 t7 = t7.groupby('user_id').agg('sum').reset_index()
 771 
 772 t8 = user1[user1.date!='null'][['user_id']]
 773 t8['buy_total'] = 1
 774 t8 = t8.groupby('user_id').agg('sum').reset_index()
 775 
 776 t9 = user1[user1.coupon_id!='null'][['user_id']]
 777 t9['coupon_received'] = 1
 778 t9 = t9.groupby('user_id').agg('sum').reset_index()
 779 
 780 t10 = user1[(user1.date_received!='null')&(user1.date!='null')][['user_id','date_received','date']]
 781 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
 782 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
 783 t10 = t10[['user_id','user_date_datereceived_gap']]
 784 
 785 t11 = t10.groupby('user_id').agg('mean').reset_index()
 786 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
 787 t12 = t10.groupby('user_id').agg('min').reset_index()
 788 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
 789 t13 = t10.groupby('user_id').agg('max').reset_index()
 790 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
 791 
 792 user1_feature = pd.merge(t,t1,on='user_id',how='left')
 793 user1_feature = pd.merge(user1_feature,t3,on='user_id',how='left')
 794 user1_feature = pd.merge(user1_feature,t4,on='user_id',how='left')
 795 user1_feature = pd.merge(user1_feature,t5,on='user_id',how='left')
 796 user1_feature = pd.merge(user1_feature,t6,on='user_id',how='left')
 797 user1_feature = pd.merge(user1_feature,t7,on='user_id',how='left')
 798 user1_feature = pd.merge(user1_feature,t8,on='user_id',how='left')
 799 user1_feature = pd.merge(user1_feature,t9,on='user_id',how='left')
 800 user1_feature = pd.merge(user1_feature,t11,on='user_id',how='left')
 801 user1_feature = pd.merge(user1_feature,t12,on='user_id',how='left')
 802 user1_feature = pd.merge(user1_feature,t13,on='user_id',how='left')
 803 user1_feature.count_merchant = user1_feature.count_merchant.replace(np.nan,0)
 804 user1_feature.buy_use_coupon = user1_feature.buy_use_coupon.replace(np.nan,0)
 805 user1_feature['buy_use_coupon_rate'] = user1_feature.buy_use_coupon.astype('float') / user1_feature.buy_total.astype('float')
 806 user1_feature['user_coupon_transfer_rate'] = user1_feature.buy_use_coupon.astype('float') / user1_feature.coupon_received.astype('float')
 807 user1_feature.buy_total = user1_feature.buy_total.replace(np.nan,0)
 808 user1_feature.coupon_received = user1_feature.coupon_received.replace(np.nan,0)
 809 user1_feature.to_csv('data/user1_feature.csv',index=None)
 810 
 811 
 812 
 813 ##################  user_merchant related feature #########################
 814 
 815 """
 816 4.user_merchant:
 817       times_user_buy_merchant_before.
 818 """
 819 #for dataset3
 820 all_user_merchant = feature3[['user_id','merchant_id']]
 821 all_user_merchant.drop_duplicates(inplace=True)
 822 
 823 t = feature3[['user_id','merchant_id','date']]
 824 t = t[t.date!='null'][['user_id','merchant_id']]
 825 t['user_merchant_buy_total'] = 1
 826 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 827 t.drop_duplicates(inplace=True)
 828 
 829 t1 = feature3[['user_id','merchant_id','coupon_id']]
 830 t1.coupon_id=t1.coupon_id.replace(np.nan,'null')
 831 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
 832 t1['user_merchant_received'] = 1
 833 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 834 t1.drop_duplicates(inplace=True)
 835 
 836 t2 = feature3[['user_id','merchant_id','date','date_received']]
 837 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
 838 t2['user_merchant_buy_use_coupon'] = 1
 839 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 840 t2.drop_duplicates(inplace=True)
 841 
 842 t3 = feature3[['user_id','merchant_id']]
 843 t3['user_merchant_any'] = 1
 844 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 845 t3.drop_duplicates(inplace=True)
 846 
 847 t4 = feature3[['user_id','merchant_id','date','coupon_id']]
 848 t4.coupon_id=t4.coupon_id.replace(np.nan,'null')
 849 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
 850 t4['user_merchant_buy_common'] = 1
 851 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 852 t4.drop_duplicates(inplace=True)
 853 
 854 user_merchant3 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
 855 user_merchant3 = pd.merge(user_merchant3,t1,on=['user_id','merchant_id'],how='left')
 856 user_merchant3 = pd.merge(user_merchant3,t2,on=['user_id','merchant_id'],how='left')
 857 user_merchant3 = pd.merge(user_merchant3,t3,on=['user_id','merchant_id'],how='left')
 858 user_merchant3 = pd.merge(user_merchant3,t4,on=['user_id','merchant_id'],how='left')
 859 user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.replace(np.nan,0)
 860 user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.replace(np.nan,0)
 861 user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_received.astype('float')
 862 user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_buy_total.astype('float')
 863 user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total.astype('float') / user_merchant3.user_merchant_any.astype('float')
 864 user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common.astype('float') / user_merchant3.user_merchant_buy_total.astype('float')
 865 user_merchant3.to_csv('data/user_merchant3.csv',index=None)
 866 
 867 #for dataset2
 868 all_user_merchant = feature2[['user_id','merchant_id']]
 869 all_user_merchant.drop_duplicates(inplace=True)
 870 
 871 t = feature2[['user_id','merchant_id','date']]
 872 t = t[t.date!='null'][['user_id','merchant_id']]
 873 t['user_merchant_buy_total'] = 1
 874 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 875 t.drop_duplicates(inplace=True)
 876 
 877 t1 = feature2[['user_id','merchant_id','coupon_id']]
 878 t1.coupon_id=t1.coupon_id.replace(np.nan,'null')
 879 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
 880 t1['user_merchant_received'] = 1
 881 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 882 t1.drop_duplicates(inplace=True)
 883 
 884 t2 = feature2[['user_id','merchant_id','date','date_received']]
 885 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
 886 t2['user_merchant_buy_use_coupon'] = 1
 887 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 888 t2.drop_duplicates(inplace=True)
 889 
 890 t3 = feature2[['user_id','merchant_id']]
 891 t3['user_merchant_any'] = 1
 892 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 893 t3.drop_duplicates(inplace=True)
 894 
 895 t4 = feature2[['user_id','merchant_id','date','coupon_id']]
 896 t4.coupon_id=t4.coupon_id.replace(np.nan,'null')
 897 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
 898 t4['user_merchant_buy_common'] = 1
 899 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 900 t4.drop_duplicates(inplace=True)
 901 
 902 user_merchant2 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
 903 user_merchant2 = pd.merge(user_merchant2,t1,on=['user_id','merchant_id'],how='left')
 904 user_merchant2 = pd.merge(user_merchant2,t2,on=['user_id','merchant_id'],how='left')
 905 user_merchant2 = pd.merge(user_merchant2,t3,on=['user_id','merchant_id'],how='left')
 906 user_merchant2 = pd.merge(user_merchant2,t4,on=['user_id','merchant_id'],how='left')
 907 user_merchant2.user_merchant_buy_use_coupon = user_merchant2.user_merchant_buy_use_coupon.replace(np.nan,0)
 908 user_merchant2.user_merchant_buy_common = user_merchant2.user_merchant_buy_common.replace(np.nan,0)
 909 user_merchant2['user_merchant_coupon_transfer_rate'] = user_merchant2.user_merchant_buy_use_coupon.astype('float') / user_merchant2.user_merchant_received.astype('float')
 910 user_merchant2['user_merchant_coupon_buy_rate'] = user_merchant2.user_merchant_buy_use_coupon.astype('float') / user_merchant2.user_merchant_buy_total.astype('float')
 911 user_merchant2['user_merchant_rate'] = user_merchant2.user_merchant_buy_total.astype('float') / user_merchant2.user_merchant_any.astype('float')
 912 user_merchant2['user_merchant_common_buy_rate'] = user_merchant2.user_merchant_buy_common.astype('float') / user_merchant2.user_merchant_buy_total.astype('float')
 913 user_merchant2.to_csv('data/user_merchant2.csv',index=None)
 914 
 915 #for dataset2
 916 all_user_merchant = feature1[['user_id','merchant_id']]
 917 all_user_merchant.drop_duplicates(inplace=True)
 918 
 919 t = feature1[['user_id','merchant_id','date']]
 920 t = t[t.date!='null'][['user_id','merchant_id']]
 921 t['user_merchant_buy_total'] = 1
 922 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 923 t.drop_duplicates(inplace=True)
 924 
 925 t1 = feature1[['user_id','merchant_id','coupon_id']]
 926 t1.coupon_id=t1.coupon_id.replace(np.nan,'null')
 927 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
 928 t1['user_merchant_received'] = 1
 929 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 930 t1.drop_duplicates(inplace=True)
 931 
 932 t2 = feature1[['user_id','merchant_id','date','date_received']]
 933 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
 934 t2['user_merchant_buy_use_coupon'] = 1
 935 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 936 t2.drop_duplicates(inplace=True)
 937 
 938 t3 = feature1[['user_id','merchant_id']]
 939 t3['user_merchant_any'] = 1
 940 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 941 t3.drop_duplicates(inplace=True)
 942 
 943 t4 = feature1[['user_id','merchant_id','date','coupon_id']]
 944 t4.coupon_id=t4.coupon_id.replace(np.nan,'null')
 945 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
 946 t4['user_merchant_buy_common'] = 1
 947 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 948 t4.drop_duplicates(inplace=True)
 949 
 950 user_merchant1 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
 951 user_merchant1 = pd.merge(user_merchant1,t1,on=['user_id','merchant_id'],how='left')
 952 user_merchant1 = pd.merge(user_merchant1,t2,on=['user_id','merchant_id'],how='left')
 953 user_merchant1 = pd.merge(user_merchant1,t3,on=['user_id','merchant_id'],how='left')
 954 user_merchant1 = pd.merge(user_merchant1,t4,on=['user_id','merchant_id'],how='left')
 955 user_merchant1.user_merchant_buy_use_coupon = user_merchant1.user_merchant_buy_use_coupon.replace(np.nan,0)
 956 user_merchant1.user_merchant_buy_common = user_merchant1.user_merchant_buy_common.replace(np.nan,0)
 957 user_merchant1['user_merchant_coupon_transfer_rate'] = user_merchant1.user_merchant_buy_use_coupon.astype('float') / user_merchant1.user_merchant_received.astype('float')
 958 user_merchant1['user_merchant_coupon_buy_rate'] = user_merchant1.user_merchant_buy_use_coupon.astype('float') / user_merchant1.user_merchant_buy_total.astype('float')
 959 user_merchant1['user_merchant_rate'] = user_merchant1.user_merchant_buy_total.astype('float') / user_merchant1.user_merchant_any.astype('float')
 960 user_merchant1['user_merchant_common_buy_rate'] = user_merchant1.user_merchant_buy_common.astype('float') / user_merchant1.user_merchant_buy_total.astype('float')
 961 user_merchant1.to_csv('data/user_merchant1.csv',index=None)
 962 
 963 ##################  generate training and testing set ################
 964 def get_label(s):
 965     s = s.split(':')
 966     if s[0]=='nan':
 967         return 0
 968     elif (date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8]))-date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days<=15:
 969         return 1
 970     else:
 971         return -1
 972 
 973 
 974 coupon3 = pd.read_csv('data/coupon3_feature.csv')
 975 merchant3 = pd.read_csv('data/merchant3_feature.csv')
 976 user3 = pd.read_csv('data/user3_feature.csv')
 977 user_merchant3 = pd.read_csv('data/user_merchant3.csv')
 978 other_feature3 = pd.read_csv('data/other_feature3.csv')
 979 dataset3 = pd.merge(coupon3,merchant3,on='merchant_id',how='left')
 980 dataset3 = pd.merge(dataset3,user3,on='user_id',how='left')
 981 dataset3 = pd.merge(dataset3,user_merchant3,on=['user_id','merchant_id'],how='left')
 982 dataset3 = pd.merge(dataset3,other_feature3,on=['user_id','coupon_id','date_received'],how='left')
 983 dataset3.drop_duplicates(inplace=True)
 984 print (dataset3.shape)
 985 
 986 dataset3.user_merchant_buy_total = dataset3.user_merchant_buy_total.replace(np.nan,0)
 987 dataset3.user_merchant_any = dataset3.user_merchant_any.replace(np.nan,0)
 988 dataset3.user_merchant_received = dataset3.user_merchant_received.replace(np.nan,0)
 989 dataset3['is_weekend'] = dataset3.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
 990 weekday_dummies = pd.get_dummies(dataset3.day_of_week)
 991 weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
 992 dataset3 = pd.concat([dataset3,weekday_dummies],axis=1)
 993 dataset3.drop(['merchant_id','day_of_week','coupon_count'],axis=1,inplace=True)
 994 dataset3 = dataset3.replace('null',np.nan)
 995 dataset3.to_csv('data/dataset3.csv',index=None)
 996 
 997 
 998 coupon2 = pd.read_csv('data/coupon2_feature.csv')
 999 merchant2 = pd.read_csv('data/merchant2_feature.csv')
1000 user2 = pd.read_csv('data/user2_feature.csv')
1001 user_merchant2 = pd.read_csv('data/user_merchant2.csv')
1002 other_feature2 = pd.read_csv('data/other_feature2.csv')
1003 dataset2 = pd.merge(coupon2,merchant2,on='merchant_id',how='left')
1004 dataset2 = pd.merge(dataset2,user2,on='user_id',how='left')
1005 dataset2 = pd.merge(dataset2,user_merchant2,on=['user_id','merchant_id'],how='left')
1006 dataset2 = pd.merge(dataset2,other_feature2,on=['user_id','coupon_id','date_received'],how='left')
1007 dataset2.drop_duplicates(inplace=True)
1008 print( dataset2.shape)
1009 
1010 dataset2.user_merchant_buy_total = dataset2.user_merchant_buy_total.replace(np.nan,0)
1011 dataset2.user_merchant_any = dataset2.user_merchant_any.replace(np.nan,0)
1012 dataset2.user_merchant_received = dataset2.user_merchant_received.replace(np.nan,0)
1013 dataset2['is_weekend'] = dataset2.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
1014 weekday_dummies = pd.get_dummies(dataset2.day_of_week)
1015 weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
1016 dataset2 = pd.concat([dataset2,weekday_dummies],axis=1)
1017 dataset2['label'] = dataset2.date.astype('str') + ':' +  dataset2.date_received.astype('str')
1018 dataset2.label = dataset2.label.apply(get_label)
1019 dataset2.drop(['merchant_id','day_of_week','date','date_received','coupon_id','coupon_count'],axis=1,inplace=True)
1020 dataset2 = dataset2.replace('null',np.nan)
1021 dataset2.to_csv('data/dataset2.csv',index=None)
1022 
1023 
# Assemble dataset1 from the saved feature files, derive its label and write
# data/dataset1.csv.
coupon1 = pd.read_csv('data/coupon1_feature.csv')
merchant1 = pd.read_csv('data/merchant1_feature.csv')
user1 = pd.read_csv('data/user1_feature.csv')
user_merchant1 = pd.read_csv('data/user_merchant1.csv')
other_feature1 = pd.read_csv('data/other_feature1.csv')

dataset1 = pd.merge(coupon1, merchant1, on='merchant_id', how='left')
dataset1 = pd.merge(dataset1, user1, on='user_id', how='left')
dataset1 = pd.merge(dataset1, user_merchant1, on=['user_id', 'merchant_id'], how='left')
dataset1 = pd.merge(dataset1, other_feature1, on=['user_id', 'coupon_id', 'date_received'], how='left')
dataset1.drop_duplicates(inplace=True)
print(dataset1.shape)

# Pairs with no match in user_merchant1 get counts of 0 rather than NaN.
for count_column in ('user_merchant_buy_total', 'user_merchant_any', 'user_merchant_received'):
    dataset1[count_column] = dataset1[count_column].replace(np.nan, 0)

dataset1['is_weekend'] = dataset1.day_of_week.apply(lambda x: 1 if x in (6, 7) else 0)

# One-hot encode the weekday against a fixed 1..7 category list. Naming the
# dummy columns by position (as before) mislabels them whenever a weekday is
# absent from the data and changes the column count between datasets.
# NOTE(review): assumes day_of_week is coded 1..7, consistent with the (6, 7)
# weekend test above - values outside that range get no dummy column.
weekday_dtype = pd.CategoricalDtype(categories=list(range(1, 8)))
weekday_dummies = pd.get_dummies(dataset1.day_of_week.astype(weekday_dtype))
weekday_dummies.columns = ['weekday' + str(day) for day in weekday_dummies.columns]
dataset1 = pd.concat([dataset1, weekday_dummies], axis=1)

# Label from the '<date>:<date_received>' text; see get_label for the coding.
dataset1['label'] = dataset1.date.astype('str') + ':' + dataset1.date_received.astype('str')
dataset1['label'] = dataset1['label'].apply(get_label)

dataset1.drop(['merchant_id', 'day_of_week', 'date', 'date_received', 'coupon_id', 'coupon_count'], axis=1, inplace=True)
dataset1 = dataset1.replace('null', np.nan)
dataset1.to_csv('data/dataset1.csv',index=None)
View Code

相关文章: