Stacked Regressions: Top 4% on Leaderboard
1. subprocess的check_output模块,用来得到命令行的输出结果
# Kaggle snippet: list the files under the input directory.
# NOTE: check_output returns *bytes* in Python 3 -- decode before printing.
print(check_output(["ls", "../input"]).decode("utf8"))
# Example: capture another command's output and inspect it.
# Passing the command as a list with shell=False avoids shell injection.
output = subprocess.check_output(["python3", "xx.py"], shell = False)
# BUG FIX: output is bytes, so searching it for the str "yes" would raise
# TypeError -- decode to str first before calling .find().
if (output.decode("utf8").find("yes") >= 0): print("yes")
else: print("no")
2. csv操作
# Load the training set (fixed: the original had every quote garbled as \').
train = pd.read_csv('../input/train.csv')
# Show the first five rows of the CSV.
train.head(5)
# Drop the Id column in place -- it carries no predictive signal.
train.drop("Id", axis = 1, inplace = True)
# Remove outliers: very large living area sold at an implausibly low price.
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)
# Concatenate train and test vertically and renumber rows from 0.
all_data = pd.concat((train, test)).reset_index(drop=True)
3. 可视化
# Scatter plot of living area vs. sale price (used to spot the outliers
# removed above). Fixed: quotes were garbled as \' in the original.
fig, ax = plt.subplots()
ax.scatter(x = train['GrLivArea'], y = train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()
# Seaborn histogram of SalePrice with a fitted normal density overlaid.
sns.distplot(train['SalePrice'] , fit=norm);
# Get the fitted parameters (mean and std dev) used by the fit above.
(mu, sigma) = norm.fit(train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Now plot the distribution. Raw string so \mu / \sigma reach matplotlib's
# mathtext renderer untouched (avoids an invalid-escape warning).
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Also draw the QQ-plot against a normal distribution to check for skew.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
4. NULL值检查、处理
# Percentage of missing values per column, highest first, top 30 kept.
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
# Fixed: quotes were garbled as \' in the original.
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)
# For garage features a missing value means "no garage" -> fill the
# literal string 'None' so it becomes its own category.
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')
# Impute LotFrontage with the median of houses in the same neighborhood
# (transform keeps the result aligned with the original index).
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
5. 数据关联性检查
# Pairwise feature correlations on the training set, shown as a heatmap.
corrmat = train.corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True, ax=ax)
6. Label Encoding
from sklearn.preprocessing import LabelEncoder
# Ordinal/categorical columns to integer-encode.
# Fixed: quotes were garbled as \' in the original.
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
'YrSold', 'MoSold')
# Apply a fresh LabelEncoder per column; fit_transform collapses the
# original fit + transform pair into one idiomatic call.
for c in cols:
    lbl = LabelEncoder()
    all_data[c] = lbl.fit_transform(list(all_data[c].values))
# Sanity check: row/column counts unchanged, only dtypes changed.
print('Shape all_data: {}'.format(all_data.shape))
7. OneHot Encoding
# One-hot encode every remaining categorical column into 0/1 indicators.
all_data = pd.get_dummies(all_data)
# Report the widened (rows, columns) shape.
print(all_data.shape)