1. 通过行为习惯对移动用户人口属性(年龄+性别)进行预测。
2. 数据集包含约20万用户的数据,按人口属性(年龄+性别)分成12组,同时提供了用户行为属性,如:手机品牌、型号、APP的类型等。
3. 通过logloss评价
main.py
# -*- coding: utf-8 -*-


import pandas as pd
import os
from pd_tools import split_train_test, get_part_data
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.decomposition import PCA
from ml_tools import get_best_model
from sklearn.metrics import log_loss
from sklearn.feature_selection import VarianceThreshold

# Dataset directory and the raw CSV files inside it
dataset_path = './dataset'
gender_age_filename = 'gender_age.csv'
phone_brand_device_model_filename = 'phone_brand_device_model.csv'
events_filename = 'events.csv'
app_events_filename = 'app_events.csv'
app_labels_filename = 'app_labels.csv'
label_categories_filename = 'label_categories.csv'

# Files produced by the train/test split (written when is_first_run is True)
train_gender_age_filename = 'gender_age_train.csv'
test_gender_age_filename = 'gender_age_test.csv'

# Set to True on the first run to create the train/test split files
is_first_run = False


def _one_hot_by_device(phone_info, source_col, code_col, df_train, df_test):
    """
    Label-encode ``phone_info[source_col]`` and one-hot encode it for train/test.

    The integer codes are stored in ``phone_info[code_col]`` and copied into
    ``df_train[code_col]`` / ``df_test[code_col]`` (all three frames are
    modified in place). Returns the tuple ``(train_matrix, test_matrix)`` as
    produced by ``OneHotEncoder.transform``.
    """
    label_encoder = LabelEncoder()
    phone_info[code_col] = label_encoder.fit_transform(phone_info[source_col].values)

    # Column assignment aligns on the index, which is device_id for the
    # three frames passed in by run_main.
    df_train[code_col] = phone_info[code_col]
    df_test[code_col] = phone_info[code_col]

    # Fit on every known device so train and test share the same columns.
    onehot_encoder = OneHotEncoder()
    onehot_encoder.fit(phone_info[code_col].values.reshape(-1, 1))
    train_matrix = onehot_encoder.transform(df_train[code_col].values.reshape(-1, 1))
    test_matrix = onehot_encoder.transform(df_test[code_col].values.reshape(-1, 1))
    return train_matrix, test_matrix


def _add_usage_counts(df, n_run_s, n_app_s):
    """
    Attach the per-device ``n_run`` / ``n_app`` counts to ``df`` (in place).

    Devices missing from the count series get 0. Returns the two columns as
    ``(n, 1)`` arrays: ``(run_feature, app_feature)``.
    """
    for col, counts in (('n_run', n_run_s), ('n_app', n_app_s)):
        df[col] = counts
        # Reassign instead of ``df[col].fillna(0, inplace=True)``: the chained
        # in-place form operates on a temporary under pandas copy-on-write
        # and would leave the NaNs in ``df``.
        df[col] = df[col].fillna(0)
    return df['n_run'].values.reshape(-1, 1), df['n_app'].values.reshape(-1, 1)


def run_main():
    """
    Run the whole experiment: load data, build features, train and evaluate.

    Steps: (1) optionally split gender_age.csv into train/test files,
    (2) load a subset of the data, (3) build phone-brand, phone-model and
    app-usage features, scale / select / reduce them, (4) encode the group
    labels, (5) grid-search a logistic regression and an RBF SVM, and
    (6) print the log loss of both models on the test subset.
    """
    if is_first_run:
        # 1. Split the labelled data into train and test files
        print('分割数据集')
        all_gender_age = pd.read_csv(os.path.join(dataset_path, gender_age_filename))
        df_train, df_test = split_train_test(all_gender_age)
        # Show the per-group sizes of both splits
        print('训练集中各类的数据个数:', df_train.groupby('group').size())
        print('测试集中各类的数据个数:', df_test.groupby('group').size())

        # Persist the split so later runs can skip this step
        df_train.to_csv(os.path.join(dataset_path, train_gender_age_filename),
                        index=False)
        df_test.to_csv(os.path.join(dataset_path, test_gender_age_filename),
                       index=False)

    # 2. Load data
    print('加载数据')
    gender_age_train = pd.read_csv(os.path.join(dataset_path, train_gender_age_filename),
                                   index_col='device_id')
    gender_age_test = pd.read_csv(os.path.join(dataset_path, test_gender_age_filename),
                                  index_col='device_id')

    # Use only a fraction of each group for the experiment
    percent = 0.1
    gender_age_train = get_part_data(gender_age_train, percent=percent)
    gender_age_test = get_part_data(gender_age_test, percent=percent)

    phone_brand_device_model = pd.read_csv(os.path.join(dataset_path, phone_brand_device_model_filename))
    # Keep one row per device, indexed by device_id
    phone_brand_device_model = phone_brand_device_model.drop_duplicates('device_id').set_index('device_id')

    events = pd.read_csv(os.path.join(dataset_path, events_filename),
                         usecols=['device_id', 'event_id'], index_col='event_id')
    app_events = pd.read_csv(os.path.join(dataset_path, app_events_filename),
                             usecols=['event_id', 'app_id'])
    # app_labels = pd.read_csv(os.path.join(dataset_path, app_labels_filename))

    # 3. Feature engineering
    # 3.1 Phone brand feature (label codes -> one-hot)
    tr_brand_feat, te_brand_feat = _one_hot_by_device(
        phone_brand_device_model, 'phone_brand', 'brand_label_code',
        gender_age_train, gender_age_test)
    print('[手机品牌]特征维度:', tr_brand_feat.shape[1])

    # 3.2 Phone model feature
    # Concatenate the brand and model strings before encoding
    phone_brand_device_model['brand_model'] = \
        phone_brand_device_model['phone_brand'].str.cat(phone_brand_device_model['device_model'])
    tr_model_feat, te_model_feat = _one_hot_by_device(
        phone_brand_device_model, 'brand_model', 'brand_model_label_code',
        gender_age_train, gender_age_test)
    print('[手机型号]特征维度:', tr_model_feat.shape[1])

    # 3.3 App usage features
    device_app = app_events.merge(events, how='left', left_on='event_id', right_index=True)
    apps_by_device = device_app['app_id'].groupby(device_app['device_id'])
    # Number of app_events rows per device
    n_run_s = apps_by_device.size()
    # Number of distinct apps per device
    n_app_s = apps_by_device.nunique()

    tr_run_feat, tr_app_feat = _add_usage_counts(gender_age_train, n_run_s, n_app_s)
    te_run_feat, te_app_feat = _add_usage_counts(gender_age_test, n_run_s, n_app_s)

    # 3.4 Stack all features into one dense matrix per split
    tr_feat = np.hstack((tr_brand_feat.toarray(), tr_model_feat.toarray(), tr_run_feat, tr_app_feat))
    te_feat = np.hstack((te_brand_feat.toarray(), te_model_feat.toarray(), te_run_feat, te_app_feat))
    print('特征提取结束')
    print('每个样本特征维度:', tr_feat.shape[1])

    # 3.5 Standardize features (statistics come from the training split only)
    scaler = StandardScaler()
    tr_feat_scaled = scaler.fit_transform(tr_feat)
    te_feat_scaled = scaler.transform(te_feat)

    # 3.6 Feature selection
    # NOTE(review): the input is already standardized, so every non-constant
    # column presumably has variance ~1 and this 0.16 threshold only drops
    # constant columns - confirm whether selection was meant to run before
    # scaling.
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    tr_feat_scaled_sel = sel.fit_transform(tr_feat_scaled)
    te_feat_scaled_sel = sel.transform(te_feat_scaled)

    # 3.7 PCA dimensionality reduction
    pca = PCA(n_components=0.95)  # keep components explaining 95% of the variance
    tr_feat_scaled_sel_pca = pca.fit_transform(tr_feat_scaled_sel)
    te_feat_scaled_sel_pca = pca.transform(te_feat_scaled_sel)
    print('特征处理结束')
    print('处理后每个样本特征维度:', tr_feat_scaled_sel_pca.shape[1])

    # 4. Encode the group labels as integers
    group_label_encoder = LabelEncoder()
    group_label_encoder.fit(gender_age_train['group'].values)
    y_train = group_label_encoder.transform(gender_age_train['group'].values)
    y_test = group_label_encoder.transform(gender_age_test['group'].values)

    # 5. Train models
    # 5.1 Logistic regression
    print('训练逻辑回归模型...')
    lr_param_grid = [
        {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}
    ]
    lr_model = LogisticRegression()
    best_lr_model = get_best_model(lr_model,
                                   tr_feat_scaled_sel_pca, y_train,
                                   lr_param_grid, cv=3)
    y_pred_lr = best_lr_model.predict_proba(te_feat_scaled_sel_pca)

    # 5.2 SVM
    print('训练SVM模型...')
    svm_param_grid = [
        {'C': [1e-2, 1e-1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]

    # probability=True is required for predict_proba
    svm_model = svm.SVC(probability=True)
    best_svm_model = get_best_model(svm_model,
                                    tr_feat_scaled_sel_pca, y_train,
                                    svm_param_grid, cv=3)
    y_pred_svm = best_svm_model.predict_proba(te_feat_scaled_sel_pca)

    # 6. Report results
    # Pass the models' class lists explicitly: the sampled test split may not
    # contain every group, and log_loss cannot match the probability columns
    # to classes from y_test alone in that case.
    print('逻辑回归模型 logloss:',
          log_loss(y_test, y_pred_lr, labels=best_lr_model.classes_))
    print('SVM logloss:',
          log_loss(y_test, y_pred_svm, labels=best_svm_model.classes_))


if __name__ == '__main__':
    run_main()
ml_tools.py
# -*- coding: utf-8 -*-

from sklearn.model_selection import GridSearchCV


def get_best_model(model, X_train, y_train, params, cv=5, scoring=None):
    """
    Pick the best hyper-parameters by cross-validated grid search.

    Parameters:
        model: the estimator to tune.
        X_train, y_train: training features and labels passed to ``fit``.
        params: parameter grid handed to ``GridSearchCV``.
        cv: cross-validation setting passed to ``GridSearchCV`` (default 5).
        scoring: scoring setting passed to ``GridSearchCV``. The default
            ``None`` keeps the previous behavior (the estimator's own
            ``score``); pass e.g. ``'neg_log_loss'`` to select models with
            the same metric that is used for the final evaluation.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    clf = GridSearchCV(model, params, cv=cv, scoring=scoring)
    clf.fit(X_train, y_train)
    return clf.best_estimator_
pd_tools.py
# -*- coding: utf-8 -*-

import pandas as pd
import math


def _concat_parts(parts, template):
    """Concatenate the non-empty frames in ``parts``; if there are none,
    return a zero-row frame with the columns of ``template``."""
    non_empty = [part for part in parts if len(part) > 0]
    if not non_empty:
        # pd.concat raises on an empty list, so handle this case explicitly.
        return template.iloc[0:0]
    return pd.concat(non_empty)


def split_train_test(df_data, size=0.8):
    """
    Split ``df_data`` into a training and a test set, group by group.

    For every distinct value of the ``group`` column, the first
    ``floor(n_rows * size)`` rows (in their current order) go to the training
    set and the remaining rows go to the test set, so both sets keep roughly
    the same group proportions.

    Parameters:
        df_data: DataFrame with a ``group`` column.
        size: fraction of each group placed in the training set (default 0.8).

    Returns:
        ``(df_train, df_test)``. Both have the same columns as ``df_data``
        and a fresh 0..n-1 index.
    """
    train_parts = []
    test_parts = []

    for label in df_data['group'].unique().tolist():
        # Rows that belong to the current group
        df_w_label = df_data[df_data['group'] == label]

        # iloc is positional, so the head/tail split does not depend on the
        # index values of the group.
        split_line_no = math.floor(df_w_label.shape[0] * size)
        train_parts.append(df_w_label.iloc[:split_line_no, :])
        test_parts.append(df_w_label.iloc[split_line_no:, :])

    # DataFrame.append was removed in pandas 2.0; collect the parts and
    # concatenate once. drop=True keeps the old index out of the columns
    # (the previous reset_index() calls leaked 'index' / 'level_0' columns
    # into the result).
    df_train = _concat_parts(train_parts, df_data).reset_index(drop=True)
    df_test = _concat_parts(test_parts, df_data).reset_index(drop=True)
    return df_train, df_test


def get_part_data(df_data, percent=1):
    """
    Take the leading ``percent`` fraction of every group in ``df_data``.

    Parameters:
        df_data: DataFrame with a ``group`` column.
        percent: fraction of each group to keep (default 1, i.e. everything).

    Returns:
        A DataFrame with the first ``floor(group_size * percent)`` rows of
        each group, groups in the order produced by ``groupby('group')``.
        The original index labels are kept. If nothing is selected, a
        zero-row frame with the columns of ``df_data`` is returned.
    """
    parts = []
    for _, group in df_data.groupby('group'):
        n_part_size = math.floor(group.shape[0] * percent)
        parts.append(group.iloc[:n_part_size, :])

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return _concat_parts(parts, df_data)
dataset下载地址
链接:http://pan.baidu.com/s/1dE7D0bf
密码:yapd