import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
# Load the iris dataset: 150 samples x 4 features, 3 target classes.
iris = load_iris()
X = iris.data    # feature matrix
y = iris.target  # class labels (0, 1, 2)
X.shape  # notebook-style inspection of the feature-matrix shape
import pandas as pd
pd.DataFrame(X)  # notebook-style tabular display of the raw features
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 |
| 5 | 5.4 | 3.9 | 1.7 | 0.4 |
| 6 | 4.6 | 3.4 | 1.4 | 0.3 |
| 7 | 5.0 | 3.4 | 1.5 | 0.2 |
| 8 | 4.4 | 2.9 | 1.4 | 0.2 |
| 9 | 4.9 | 3.1 | 1.5 | 0.1 |
| 10 | 5.4 | 3.7 | 1.5 | 0.2 |
pd.DataFrame(y)  # notebook-style display of the labels
# Reduce the 4-D feature space down to its top 2 principal components.
pca = PCA(n_components=2).fit(X)
X_dr = pca.transform(X)
# Variance carried by each retained component (absolute information content).
pca.explained_variance_
# Fraction of the original total variance each retained component explains.
pca.explained_variance_ratio_
X_dr  # the transformed (150, 2) data
X_dr[y == 1, 0]  # first-component values for class-1 samples only
colors = ['red', 'black', 'orange']
iris.target_names  # notebook-style inspection of the class names
plt.figure()
# One scatter per class, colored and labeled by species.
# BUG FIX: the scatter call must be indented inside the loop; the original
# line was at column 0, which is an IndentationError when run as a script.
for i in [0, 1, 2]:
    plt.scatter(X_dr[y == i, 0], X_dr[y == i, 1],
                alpha=.7, c=colors[i], label=iris.target_names[i])
plt.legend()
plt.title('PCA of IRIS dataset')
plt.show()
import numpy as np
# Fit PCA keeping ALL components so we can inspect how explained variance
# accumulates as the number of retained components grows.
pca_line = PCA().fit(X)
# Derive the component axis from the fitted model instead of hard-coding
# [1, 2, 3, 4], so the plot stays correct for any number of input features.
components = np.arange(1, len(pca_line.explained_variance_ratio_) + 1)
plt.plot(components, np.cumsum(pca_line.explained_variance_ratio_))
plt.xticks(components)  # integer ticks: one per component count
plt.xlabel("number of components after dimension reduction")
plt.ylabel("cumulative explained variance")
plt.show()
# For PCA dimensionality reduction, the information loss is assessed by
# comparing the variance captured before and after the reduction.