Stacked Regressions: Top 4% on Leaderboard
1. subprocess的check_output模块,用来得到命令行的输出结果
# Kaggle snippet: list the files under the input directory.
# NOTE: check_output returns *bytes* in Python 3 -- decode before printing.
print(check_output(["ls", "../input"]).decode("utf8"))
# Example: capture another command's output and inspect it.
# Passing the command as a list with shell=False avoids shell injection.
output = subprocess.check_output(["python3", "xx.py"], shell = False)
# BUG FIX: output is bytes, so searching it for the str "yes" would raise
# TypeError -- decode to str first before calling .find().
if (output.decode("utf8").find("yes") >= 0): print("yes")
else: print("no")
2. csv操作
# Load the training set (fixed: the original had every quote garbled as \').
train = pd.read_csv('../input/train.csv')
# Show the first five rows of the CSV.
train.head(5)
# Drop the Id column in place -- it carries no predictive signal.
train.drop("Id", axis = 1, inplace = True)
# Remove outliers: very large living area sold at an implausibly low price.
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)
# Concatenate train and test vertically and renumber rows from 0.
all_data = pd.concat((train, test)).reset_index(drop=True)
3. 可视化
# Scatter plot of living area vs. sale price (used to spot the outliers
# removed above). Fixed: quotes were garbled as \' in the original.
fig, ax = plt.subplots()
ax.scatter(x = train['GrLivArea'], y = train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()
# Seaborn histogram of SalePrice with a fitted normal density overlaid.
sns.distplot(train['SalePrice'] , fit=norm);
# Get the fitted parameters (mean and std dev) used by the fit above.
(mu, sigma) = norm.fit(train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Now plot the distribution. Raw string so \mu / \sigma reach matplotlib's
# mathtext renderer untouched (avoids an invalid-escape warning).
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Also draw the QQ-plot against a normal distribution to check for skew.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
4. NULL值检查、处理
# Percentage of missing values per column, highest first, top 30 kept.
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
# Fixed: quotes were garbled as \' in the original.
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)
# For garage features a missing value means "no garage" -> fill the
# literal string 'None' so it becomes its own category.
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')
# Impute LotFrontage with the median of houses in the same neighborhood
# (transform keeps the result aligned with the original index).
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
5. 数据关联性检查
# Pairwise feature correlations on the training set, shown as a heatmap.
corrmat = train.corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True, ax=ax)
6. Label Encoding
from sklearn.preprocessing import LabelEncoder
# Ordinal/categorical columns to integer-encode.
# Fixed: quotes were garbled as \' in the original.
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
'YrSold', 'MoSold')
# Apply a fresh LabelEncoder per column; fit_transform collapses the
# original fit + transform pair into one idiomatic call.
for c in cols:
    lbl = LabelEncoder()
    all_data[c] = lbl.fit_transform(list(all_data[c].values))
# Sanity check: row/column counts unchanged, only dtypes changed.
print('Shape all_data: {}'.format(all_data.shape))
7. OneHot Encoding
# One-hot encode every remaining categorical column into 0/1 indicators.
all_data = pd.get_dummies(all_data)
# Report the widened (rows, columns) shape.
print(all_data.shape)