import pandas as pd
# Load the housing dataset from the current working directory.
# NOTE(review): the recorded output below lists an "Unnamed: 0" column, which
# suggests the CSV was written with its index; reading with index_col=0 would
# keep that artefact out of the correlations -- confirm before changing.
housing = pd.read_csv("housing.csv")

# Pairwise correlations between the numeric columns only. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# text columns, so the frame is restricted to numeric dtypes explicitly.
corr_matrix = housing.select_dtypes(include="number").corr()

# Rank every attribute by its correlation with the target, strongest first.
corr_matrix["median_house_value"].sort_values(ascending=False)
# Output:
# median_house_value    1.000000
# median_income         0.685211
# total_rooms           0.138753
# housing_median_age    0.116584
# Unnamed: 0            0.070593
# households            0.067639
# total_bedrooms        0.051664
# population           -0.025932
# longitude            -0.051261
# latitude             -0.140618
# Name: median_house_value, dtype: float64
from pandas.plotting import scatter_matrix
# Columns to compare pairwise in the scatter-plot grid: the target plus
# three of the attributes from the correlation ranking.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]

# Grid of pairwise scatter plots for the selected columns.
scatter_matrix(
    housing[attributes],
    figsize=(12, 8),
)

# Single scatter plot of income against house value; the low alpha makes
# dense regions of overlapping points stand out.
housing.plot(
    kind="scatter",
    x="median_income",
    y="median_house_value",
    alpha=0.1,
)
# Derive ratio features from the raw per-district totals.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

# Recompute the correlations so the new ratio columns are included. The frame
# is restricted to numeric dtypes because, since pandas 2.0, DataFrame.corr()
# raises on text columns instead of dropping them silently.
corr_matrix = housing.select_dtypes(include="number").corr()

# Rank every attribute (including the new ratios) against the target.
corr_matrix["median_house_value"].sort_values(ascending=False)
# Output:
# median_house_value          1.000000
# median_income               0.685211
# rooms_per_household         0.146777
# total_rooms                 0.138753
# housing_median_age          0.116584
# Unnamed: 0                  0.070593
# households                  0.067639
# total_bedrooms              0.051664
# population                 -0.025932
# population_per_household   -0.029628
# longitude                  -0.051261
# latitude                   -0.140618
# bedrooms_per_room          -0.255092
# Name: median_house_value, dtype: float64